from sklearn.base import BaseEstimator, RegressorMixin, clone
from sklearn.cluster import KMeans
import numpy as np


class NanGroupedModel(BaseEstimator, RegressorMixin):
    def __init__(self, estimator):
        self.estimator = estimator
        self.cluster = KMeans(n_clusters=2)
        self.models = {}

    def fit(self, X, y, **kwargs):
        nans = X.isnull()
        clusters = self.cluster.fit_transform(nans).argmin(axis=1)
        for name in np.unique(clusters):
            filt = clusters == name
            self.models[name] = clone(self.estimator).fit(
                X[filt], y[filt], **kwargs)
        return self

    def predict(self, X, **kwargs):
        preds = np.zeros(X.shape[0])
        nans = X.isnull()
        # argmin, not argmax: rows belong to the *nearest* cluster center,
        # matching the assignment rule used in fit()
        clusters = self.cluster.transform(nans).argmin(axis=1)
        for name in np.unique(clusters):
            filt = clusters == name
            preds[filt] = self.models[name].predict(X[filt], **kwargs)
        return preds

    def predict_proba(self, X, **kwargs):
        nans = X.isnull()
        clusters = self.cluster.transform(nans).argmin(axis=1)
        preds = None
        for name in np.unique(clusters):
            filt = clusters == name
            # call predict_proba on the sub-models (the original called
            # predict here, which returned labels instead of probabilities)
            proba = self.models[name].predict_proba(X[filt], **kwargs)
            if preds is None:  # allocate once the number of classes is known
                preds = np.zeros((X.shape[0], proba.shape[1]))
            preds[filt] = proba
        return preds
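A minimal usage sketch for the class above (not from the original source): the data, column names, and the choice of HistGradientBoostingRegressor — picked only because it accepts NaN inputs natively — are all illustrative assumptions.

import numpy as np
import pandas as pd
from sklearn.ensemble import HistGradientBoostingRegressor

# Hypothetical data: feature "b" is missing for every other row, giving the
# internal KMeans two distinct NaN patterns to cluster on.
rng = np.random.default_rng(0)
df = pd.DataFrame({"a": rng.normal(size=100), "b": rng.normal(size=100)})
df.loc[df.index % 2 == 0, "b"] = np.nan
y = df["a"] * 2.0 + df["b"].fillna(0.0)

model = NanGroupedModel(HistGradientBoostingRegressor())
model.fit(df, y)              # one sub-model per missingness cluster
print(model.predict(df)[:5])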
def train_and_test(self, training_data, test_data, break_point):
    if not self.number_of_clusters:
        self.number_of_clusters = K_Means.find_optimal_K(
            training_data, min_number_of_clusters=2,
            max_number_of_clusters=11)
    # Run KMeans on the training data
    kmeans = KMeans(n_clusters=self.number_of_clusters,
                    random_state=0).fit(training_data)
    # self.plot_clusters(kmeans, training_data)

    # Find the smallest center distance for each training sample
    train_dist = kmeans.transform(training_data)
    train_min_dist = train_dist.min(axis=1)
    train_min_dist.sort()

    # Find the smallest center distance for each test sample
    test_dist = kmeans.transform(test_data)
    test_min_dist = test_dist.min(axis=1)

    self.DIs = test_min_dist
    self._train_DIs = train_min_dist
    self._data_break_point = break_point
    self._set_resulting_parameters()
def fit_prompt_type_model(model, n_types, random_state=None, max_dist=0.9,
                          verbosity=0):
    """
    Standalone function that fits a prompt type model given paired prompt and
    response inputs. See docstring of the `PromptTypes` class for details.

    :param model: prompt embedding model (from `fit_prompt_embedding_model()`)
    :param n_types: number of prompt types to infer
    :return: prompt type model
    """
    if verbosity > 0:
        print('fitting %d prompt types' % n_types)
    km = KMeans(n_clusters=n_types, random_state=random_state)
    km.fit(model['U_prompt'])
    prompt_dists = km.transform(model['U_prompt'])
    prompt_clusters = km.predict(model['U_prompt'])
    prompt_clusters[prompt_dists.min(axis=1) >= max_dist] = -1
    reference_dists = km.transform(model['U_reference'])
    reference_clusters = km.predict(model['U_reference'])
    reference_clusters[reference_dists.min(axis=1) >= max_dist] = -1

    prompt_df = pd.DataFrame(
        index=model['prompt_tfidf_model'].get_feature_names(),
        data=np.hstack([prompt_dists, prompt_clusters[:, np.newaxis]]),
        columns=list(range(n_types)) + ['type_id'])
    reference_df = pd.DataFrame(
        index=model['reference_tfidf_model'].get_feature_names(),
        data=np.hstack([reference_dists, reference_clusters[:, np.newaxis]]),
        columns=list(range(n_types)) + ['type_id'])
    return {'km_model': km, 'prompt_df': prompt_df,
            'reference_df': reference_df}
def gapstat(self, ref_size=10, max_iter=300, n_init=3):
    Wkestrand = np.zeros(len(self.range))
    Wk = np.zeros(len(self.range))
    sk = np.zeros(len(self.range))
    sample = self.randomData(ref_size)
    for indk, k in enumerate(self.range):
        km = KMeans(n_clusters=k, init='k-means++', max_iter=max_iter,
                    n_init=n_init)
        Wkrand = []
        for i in range(ref_size):
            km.fit(sample[i])
            SS = km.transform(sample[i])
            Wkrand.append(self.intraDist(km.labels_.tolist(), k,
                                         km.cluster_centers_))
        Wkestrand[indk] = (1 / ref_size) * sum(Wkrand)
        km.fit(self.X)
        XX = km.transform(self.X)
        clusters = km.labels_.tolist()
        Wk[indk] = self.intraDist(clusters, k, km.cluster_centers_)
        sk[indk] = np.sqrt((1 / ref_size) * sum(
            [(Wkrand[i] - Wkestrand[indk]) ** 2 for i in range(ref_size)]))
    sk *= np.sqrt(1 + 1 / ref_size)
    # Wkestrand is already the reference-set average, so it must not be
    # divided by ref_size a second time as in the original expression.
    Gapk = [Wkestrand[i] - Wk[i] for i in range(len(self.range))]
    # return min([k for k, j in enumerate(
    #     [Gapk[g] - Gapk[g + 1] + sk[g + 1] for g in self.range[:-1]]) if j > 0])
    return [(k, Gapk[j], Gapk[j] - Gapk[j + 1] + sk[j + 1])
            for j, k in enumerate(self.range[:-1])]
def transform(self, X):
    """
    Computes the predictions.

    @param      X   features.
    @return         prediction
    """
    if self.weights_ is None:
        if self.balanced_predictions:
            labels, distances, __ = constraint_predictions(
                X, self.cluster_centers_, strategy=self.strategy)
            # Because of the balancing constraint, a row may be assigned to
            # a cluster that is not its nearest one. Distances smaller than
            # the distance to the chosen cluster are overwritten with
            # max * 2 (numpy.nan would be better).
            mx = distances.max() * 2
            for i, l in enumerate(labels):
                mi = distances[i, l]
                mmi = distances[i, :].min()
                if mi > mmi:
                    distances[i, distances[i, :] < mi] = mx
            return distances
        return KMeans.transform(self, X)
    else:
        if self.balanced_predictions:
            raise RuntimeError(  # pragma: no cover
                "balanced_predictions and weights_ cannot be used together.")
        res = KMeans.transform(self, X)
        res *= self.weights_.reshape((1, -1))
        return res
def __cluster_items(item_embed):
    kmeans = KMeans(n_clusters=2, random_state=2020).fit(item_embed)
    labels = kmeans.labels_
    left_index = np.where(labels == 0)[0]
    right_index = np.where(labels == 1)[0]
    if len(right_index) - len(left_index) > 1:
        # move the right items farthest from their own center into the left half
        distances = kmeans.transform(item_embed[right_index])[:, 1]
        rank = np.argsort(distances)[::-1]
        idx = np.concatenate((left_index, right_index[rank]))
        mid = len(idx) // 2
        left_index = idx[:mid]
        right_index = idx[mid:]
    elif len(left_index) - len(right_index) > 1:
        # symmetric case: the left items farthest from center 0 go right
        distances = kmeans.transform(item_embed[left_index])[:, 0]
        rank = np.argsort(distances)
        idx = np.concatenate((left_index[rank], right_index))
        mid = len(idx) // 2
        left_index = idx[:mid]
        right_index = idx[mid:]
    return (left_index, right_index,
            kmeans.cluster_centers_[0], kmeans.cluster_centers_[1])
def get_mmd(perc_value, y_scores, train_1, test_1, y_true, train_ind,
            test_path, bandwidth):
    abn_idx = np.where(y_scores < np.percentile(y_scores, perc_value))
    abn_tst_latent = test_1[abn_idx]
    kmeans = KMeans(n_clusters=1, random_state=0).fit(abn_tst_latent)
    # append each sample's distance to the abnormal-cluster center as a feature
    train_1_prime = np.concatenate((train_1, kmeans.transform(train_1)), axis=1)
    test_1_prime = np.concatenate((test_1, kmeans.transform(test_1)), axis=1)

    cf = svm.OneClassSVM(gamma='scale', nu=0.1)
    cf.fit(train_1_prime[train_ind, :])
    y_scores_tmp_grid = cf.score_samples(test_1_prime)
    y_scores_tmp_grid = (y_scores_tmp_grid - min(y_scores_tmp_grid)) / (
        max(y_scores_tmp_grid) - min(y_scores_tmp_grid))

    auroc = metrics.roc_auc_score(y_true, y_scores_tmp_grid)
    auprc = metrics.average_precision_score(y_true, y_scores_tmp_grid)
    np.save(test_path + '/svm_aucroc1_grid_' + str(perc_value) + '.npy', auroc)
    np.save(test_path + '/svm_aucprc1_grid_' + str(perc_value) + '.npy', auprc)

    abn_idx_left = np.where(y_scores < np.percentile(y_scores, 5))
    abn_idx_right = np.where(y_scores >= np.percentile(y_scores, 80))
    abn_idx_current = np.where(
        (y_scores >= np.percentile(y_scores, perc_value))
        & (y_scores < np.percentile(y_scores, perc_value + 5)))

    mmd_ind_left = np.random.choice(np.squeeze(np.array(abn_idx_left)), 500,
                                    replace=False)
    X_mmd_left = test_1[mmd_ind_left]
    np.save(test_path + '/y_true_x_mmd_left_ind_grid_' + str(perc_value) +
            '.npy', y_true[mmd_ind_left])

    mmd_ind_right = np.random.choice(np.squeeze(np.array(abn_idx_right)), 500,
                                     replace=False)
    X_mmd_right = test_1[mmd_ind_right]
    np.save(test_path + '/y_true_x_mmd_right_ind_grid_' + str(perc_value) +
            '.npy', y_true[mmd_ind_right])

    mmd_ind_current = np.random.choice(np.squeeze(np.array(abn_idx_current)),
                                       500, replace=False)
    X_mmd_current = test_1[mmd_ind_current]
    np.save(test_path + '/y_true_x_mmd_current_ind_grid_' + str(perc_value) +
            '.npy', y_true[mmd_ind_current])

    mmd_output_left = rbf_mmd2(X_mmd_left, X_mmd_current, sigma=bandwidth,
                               biased=True)
    mmd_output_right = rbf_mmd2(X_mmd_right, X_mmd_current, sigma=bandwidth,
                                biased=True)
    np.save(test_path + '/mmd_grid_left_' + str(perc_value) + '.npy',
            mmd_output_left)
    np.save(test_path + '/mmd_grid_right_' + str(perc_value) + '.npy',
            mmd_output_right)
    return (mmd_output_left, mmd_output_right)
def getCluster(train_df, test_df, k):
    train_test = pd.concat([train_df.drop('interest_level', axis=1), test_df])
    # processMap(train_test)
    cluster = KMeans(k, random_state=2333)
    cluster.fit(train_test[['latitude', 'longitude']].dropna())
    train_df['cluster_id_' + str(k)] = cluster.predict(
        train_df[['latitude', 'longitude']].fillna(-1))
    test_df['cluster_id_' + str(k)] = cluster.predict(
        test_df[['latitude', 'longitude']].fillna(-1))
    # fillna(-1) here as well; the original passed raw coordinates to
    # transform(), which fails on missing values
    train_df['cluster_id_' + str(k) + '_d'] = np.amin(
        cluster.transform(train_df[['latitude', 'longitude']].fillna(-1)),
        axis=1)
    test_df['cluster_id_' + str(k) + '_d'] = np.amin(
        cluster.transform(test_df[['latitude', 'longitude']].fillna(-1)),
        axis=1)
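A self-contained illustration of the pattern above (toy coordinates, invented values): the row-wise minimum of KMeans.transform is the distance to the nearest center, which getCluster stores as the `_d` feature.

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans

# Toy coordinates around two cities; values are illustrative only.
rng = np.random.default_rng(42)
coords = pd.DataFrame({
    "latitude": np.concatenate([rng.normal(40.7, 0.01, 50),
                                rng.normal(34.0, 0.01, 50)]),
    "longitude": np.concatenate([rng.normal(-74.0, 0.01, 50),
                                 rng.normal(-118.2, 0.01, 50)]),
})
km = KMeans(n_clusters=2, random_state=0, n_init=10).fit(coords)
dists = km.transform(coords)             # (n_samples, 2) center distances
coords["cluster_id"] = dists.argmin(axis=1)
coords["dist_to_center"] = dists.min(axis=1)
print(coords.head())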
def best_lda_cluster_spam(self):
    dh = data_helper()
    X_train, X_test, y_train, y_test = dh.get_spam_data_lda_best()

    scl = RobustScaler()
    X_train_scl = scl.fit_transform(X_train)
    X_test_scl = scl.transform(X_test)

    ##
    ## K-Means
    ##
    km = KMeans(n_clusters=4, algorithm='full')
    X_train_transformed = km.fit_transform(X_train_scl)
    X_test_transformed = km.transform(X_test_scl)

    # save
    filename = './' + self.save_dir + '/spam_kmeans_lda_x_train.txt'
    pd.DataFrame(X_train_transformed).to_csv(filename, header=False, index=False)
    filename = './' + self.save_dir + '/spam_kmeans_lda_x_test.txt'
    pd.DataFrame(X_test_transformed).to_csv(filename, header=False, index=False)
    filename = './' + self.save_dir + '/spam_kmeans_lda_y_train.txt'
    pd.DataFrame(y_train).to_csv(filename, header=False, index=False)
    filename = './' + self.save_dir + '/spam_kmeans_lda_y_test.txt'
    pd.DataFrame(y_test).to_csv(filename, header=False, index=False)

    ##
    ## GMM
    ##
    # The original block refit the K-Means model here by mistake.
    # GaussianMixture has no transform(), so soft cluster memberships
    # stand in as the transformed features.
    gmm = GaussianMixture(n_components=4, covariance_type='full')
    gmm.fit(X_train_scl)
    X_train_transformed = gmm.predict_proba(X_train_scl)
    X_test_transformed = gmm.predict_proba(X_test_scl)

    # save
    filename = './' + self.save_dir + '/spam_gmm_lda_x_train.txt'
    pd.DataFrame(X_train_transformed).to_csv(filename, header=False, index=False)
    filename = './' + self.save_dir + '/spam_gmm_lda_x_test.txt'
    pd.DataFrame(X_test_transformed).to_csv(filename, header=False, index=False)
    filename = './' + self.save_dir + '/spam_gmm_lda_y_train.txt'
    pd.DataFrame(y_train).to_csv(filename, header=False, index=False)
    filename = './' + self.save_dir + '/spam_gmm_lda_y_test.txt'
    pd.DataFrame(y_test).to_csv(filename, header=False, index=False)
def cluster_driver(a_driver):
    X = StandardScaler().fit_transform(a_driver['DStats'])
    # columns: ['AvgDistDel', 'AvgACosDel', 'SDevDistDel', 'SDevACosDel',
    #           'TotalTime', 'SkewDistDel', 'SkewACosDel']
    pca = PCA(n_components=5)
    Xpca = pca.fit(X).transform(X)
    if plotflag:
        fig = scatterplot_matrix(np.transpose(Xpca),
                                 ['PC1', 'PC2', 'PC3', 'PC4'
                                  # , 'PC5'
                                  ],
                                 linestyle='none', marker='o', color='black',
                                 mfc='none')
        fig.suptitle('Simple Scatterplot Matrix')
        plt.show()
    db = KMeans(n_clusters=1, n_jobs=-1).fit(Xpca)
    # db = DBSCAN(eps=0.5).fit(Xpca)
    labels = db.labels_
    print("#" * 79)
    # print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(Xpca, labels))
    print("%% Variance Explained: %0.3f" % sum(pca.explained_variance_ratio_))
    # scale the single-cluster distances into [0, 1] and invert them
    dist = db.transform(Xpca)
    return 1 - dist / dist.max()
def AnalyzeGraphs(filename):
    """
    Try to generate a graph adjacency matrix using the edges in each path.
    Then use the matrix to obtain its eigenvectors and calculate their sum
    vector as a codifier for the graph. Fill an array with the resulting
    vectors.
    """
    pd_database = pd.read_csv(filename)
    nodes_list = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10']
    eigenvectors = []
    for path in pd_database['Steps']:
        path = ast.literal_eval(path)
        eig_vec = GetVectorID(path, nodes_list)
        eig_vec_flat = np.array(eig_vec).flatten()
        eigenvectors.append(eig_vec_flat)
    print('*** Clustering %d graphs by K-Means ***' % len(eigenvectors))
    np_eigenvectors = np.array(eigenvectors)

    kmeans_2 = KMeans(n_clusters=2, random_state=0).fit(np_eigenvectors)
    kmeans_3 = KMeans(n_clusters=3, random_state=0).fit(np_eigenvectors)
    kmeans_4 = KMeans(n_clusters=4, random_state=0).fit(np_eigenvectors)
    kmeans_5 = KMeans(n_clusters=5, random_state=0).fit(np_eigenvectors)
    data_new_2 = kmeans_2.transform(np_eigenvectors)
    data_new_3 = kmeans_3.transform(np_eigenvectors)
    data_new_4 = kmeans_4.transform(np_eigenvectors)
    data_new_5 = kmeans_5.transform(np_eigenvectors)

    # "with" closes each file automatically; no explicit close() is needed
    with open('k-means2_out.txt', 'w') as outfile2:
        np.savetxt(outfile2, data_new_2, fmt='%4.1f')
    with open('k-means3_out.txt', 'w') as outfile3:
        np.savetxt(outfile3, data_new_3, fmt='%4.1f')
    with open('k-means4_out.txt', 'w') as outfile4:
        np.savetxt(outfile4, data_new_4, fmt='%4.1f')
    with open('k-means5_out.txt', 'w') as outfile5:
        np.savetxt(outfile5, data_new_5, fmt='%4.1f')

    PlotDistanceAngle(np_eigenvectors, kmeans_2.cluster_centers_, data_new_2)
def preprocess():
    train = pd.read_csv(BASE_PATH + "train.csv").drop(['id'], axis=1)
    test = pd.read_csv(BASE_PATH + "test.csv").drop(["id"], axis=1)
    sentences = pd.concat([train["description"], test["description"]])

    # keep only the {num_words} most frequent tokens
    tokenizer = Tokenizer(num_words=2000, lower=True)
    tokenizer.fit_on_texts(sentences)
    train_X, test_X = np.split(
        tokenizer.texts_to_matrix(sentences, mode='binary'),
        [len(train)], axis=0)

    word_vectorizer = TfidfVectorizer(
        sublinear_tf=True,
        analyzer="char",
        stop_words="english",
        ngram_range=(2, 6),
        max_features=1000,
    )
    word_vectorizer.fit(sentences)
    train_X = np.concatenate(
        [train_X, word_vectorizer.transform(train["description"]).toarray()], 1)
    test_X = np.concatenate(
        [test_X, word_vectorizer.transform(test["description"]).toarray()], 1)

    text_svd = TruncatedSVD(n_components=100, algorithm="arpack",
                            random_state=1234)
    text_svd.fit(train_X)
    train_X = text_svd.transform(train_X)
    test_X = text_svd.transform(test_X)

    # append cluster-distance features from KMeans fitted on train + test
    kmeans = KMeans(n_clusters=100,
                    random_state=10).fit(np.concatenate([train_X, test_X]))
    train_X = np.concatenate([train_X, kmeans.transform(train_X)], 1)
    test_X = np.concatenate([test_X, kmeans.transform(test_X)], 1)

    train_y = train['jobflag'].values - 1  # maps {1, 2, 3, 4} -> {0, 1, 2, 3}
    return train_X, train_y, test_X
def cluster_feature_nn(df, X, y, clusters=2):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                        shuffle=True)
    kmeans_model = KMeans(n_clusters=clusters, random_state=100).fit(X_train)
    # use the cluster-distance space as the feature representation
    features_train = kmeans_model.transform(X_train)
    features_test = kmeans_model.transform(X_test)
    nn_model = MLPClassifier(hidden_layer_sizes=(20, 20), activation='relu',
                             max_iter=700)
    nn_model.fit(features_train, y_train)
    # the original returned these two swapped (train scored on test data)
    train_score = nn_model.score(features_train, y_train)
    test_score = nn_model.score(features_test, y_test)
    return train_score, test_score
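A quick usage sketch under assumed inputs: the df argument is unused by the function body, so any placeholder works, and iris is just a stand-in dataset (the function's own dependencies — train_test_split, KMeans, MLPClassifier — are assumed to be imported in its module).

from sklearn.datasets import load_iris

X, y = load_iris(return_X_y=True)
train_score, test_score = cluster_feature_nn(None, X, y, clusters=3)
print(train_score, test_score)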
def best_lda_cluster_wine(self):
    dh = data_helper()
    X_train, X_test, y_train, y_test = dh.get_wine_data_lda_best()

    scl = RobustScaler()
    X_train_scl = scl.fit_transform(X_train)
    X_test_scl = scl.transform(X_test)

    ##
    ## K-Means
    ##
    km = KMeans(n_clusters=4, algorithm='full')
    X_train_transformed = km.fit_transform(X_train_scl)
    X_test_transformed = km.transform(X_test_scl)

    # save
    filename = './' + self.save_dir + '/wine_kmeans_lda_x_train.txt'
    pd.DataFrame(X_train_transformed).to_csv(filename, header=False, index=False)
    filename = './' + self.save_dir + '/wine_kmeans_lda_x_test.txt'
    pd.DataFrame(X_test_transformed).to_csv(filename, header=False, index=False)
    filename = './' + self.save_dir + '/wine_kmeans_lda_y_train.txt'
    pd.DataFrame(y_train).to_csv(filename, header=False, index=False)
    filename = './' + self.save_dir + '/wine_kmeans_lda_y_test.txt'
    pd.DataFrame(y_test).to_csv(filename, header=False, index=False)

    ##
    ## GMM
    ##
    # As in the spam variant, the original refit the K-Means model here.
    # GaussianMixture has no transform(); use soft memberships instead.
    gmm = GaussianMixture(n_components=4, covariance_type='full')
    gmm.fit(X_train_scl)
    X_train_transformed = gmm.predict_proba(X_train_scl)
    X_test_transformed = gmm.predict_proba(X_test_scl)

    # save
    filename = './' + self.save_dir + '/wine_gmm_lda_x_train.txt'
    pd.DataFrame(X_train_transformed).to_csv(filename, header=False, index=False)
    filename = './' + self.save_dir + '/wine_gmm_lda_x_test.txt'
    pd.DataFrame(X_test_transformed).to_csv(filename, header=False, index=False)
    filename = './' + self.save_dir + '/wine_gmm_lda_y_train.txt'
    pd.DataFrame(y_train).to_csv(filename, header=False, index=False)
    filename = './' + self.save_dir + '/wine_gmm_lda_y_test.txt'
    pd.DataFrame(y_test).to_csv(filename, header=False, index=False)
def kmeans(data, model_id, x_col, n_clusters):
    # |Create model, fit data, and return prediction of cluster for each row
    model = KMeans(n_clusters)
    model.fit(data.x)  # the original never fit the model before transform()

    # |Add distance to each cluster for each row to summary data
    headers = ['dist_%s' % str(i) for i in range(n_clusters)]
    dist = pd.DataFrame(model.transform(data.x), columns=headers)
    data.current_df = data.current_df.join(dist)
    data.df['kmeans']['data'] = data.df['kmeans']['data'].append(
        data.current_df, ignore_index=True)

    # |Create DataFrame with each cluster and the mean value for each input column
    df = pd.DataFrame()
    for i in range(n_clusters):
        clus = {'cluster': i}
        for j in range(len(x_col)):
            clus['%s_mean' % x_col[j]] = model.cluster_centers_[i][j]
        df = df.append(clus, ignore_index=True)
    df['model_id'] = model_id
    data.df['kmeans']['clusters'] = data.df['kmeans']['clusters'].append(
        df, ignore_index=True)
    return data, model
class KMeansClustering(Transform):
    """KMeans clustering

    Uses the KMeans implementation of sklearn.
    """
    base_name = "kmeans"

    def __init__(self, config):
        """KMeans-clustering constructor"""
        Transform.__init__(self, config)
        self.transformer = KMeans(self.dimension)
        self.process_func_train = self.fit
        self.process_func_test = self.do_transform

    def fit(self, data):
        self.transformer = self.transformer.fit(data)
        return self.do_transform(data)

    def do_transform(self, data):
        res = self.transformer.transform(data)
        return res

    def get_term_representations(self):
        """Return term-based, rather than document-based, representations"""
        return self.transformer.cluster_centers_
def components(K):
    k = []
    accuracy_train = []
    accuracy_test = []
    for i in range(1, K):
        print(i)
        agglo = KMeans(n_clusters=i)
        agglo.fit(X)
        # represent each sample by its distances to the i cluster centers
        X_reduced = agglo.transform(X)
        X_train, X_test, y_train, y_test = train_test_split(
            X_reduced, y, test_size=0.20)
        km = MLPClassifier(solver='lbfgs', alpha=1e-5,
                           hidden_layer_sizes=[8, 8, 8, 8, 8], random_state=1)
        # fit on the training split only; the original also (re)fitted on the
        # test split, leaking test data into the model, and reported the two
        # accuracies swapped
        km.fit(X_train, y_train)
        accu_train = km.score(X_train, y_train)
        accu_test = km.score(X_test, y_test)
        k.append(i)
        accuracy_train.append(accu_train)
        accuracy_test.append(accu_test)
    k = np.array(k)
    accuracy_train = np.array(accuracy_train)
    accuracy_test = np.asarray(accuracy_test)
    line3, = plt.plot(k, accuracy_train, color='r', marker='o',
                      label='train_accuracy')
    line4, = plt.plot(k, accuracy_test, color='g', marker='o',
                      label='test_accuracy')
    plt.xlabel('k')
    plt.legend()
    plt.ylabel('accuracy')
    plt.show()
    return None
def test_basic(self, Xl_blobs_easy):
    X, _ = Xl_blobs_easy

    # make it super easy to cluster
    a = DKKMeans(n_clusters=3, random_state=0)
    b = SKKMeans(n_clusters=3, random_state=0)
    a.fit(X)
    b.fit(X)
    assert_estimator_equal(
        a, b, exclude=["n_iter_", "inertia_", "cluster_centers_", "labels_"])
    assert abs(a.inertia_ - b.inertia_) < 0.01

    # order is arbitrary, so align first
    a_order = np.argsort(a.cluster_centers_, 0)[:, 0]
    b_order = np.argsort(b.cluster_centers_, 0)[:, 0]
    a_centers = a.cluster_centers_[a_order]
    b_centers = b.cluster_centers_[b_order]
    np.testing.assert_allclose(a_centers, b_centers, rtol=1e-3)
    b_labels = replace(b.labels_, [0, 1, 2],
                       a_order[b_order]).astype(b.labels_.dtype)
    assert_eq(a.labels_.compute(), b_labels)
    assert a.n_iter_

    # this is hacky
    b.cluster_centers_ = b_centers
    a.cluster_centers_ = a_centers
    assert_eq(a.transform(X), b.transform(X), rtol=1e-3)

    yhat_a = a.predict(X)
    yhat_b = b.predict(X)
    assert_eq(yhat_a.compute(), yhat_b)
def kmean_data(tune_path=None, test_path=None, cluster=3, isPCA=True):
    '''
    :param tune_path: src of a tuning data set
    :param test_path: src of a testing data set
    :return: tuning data after clustering, in the form of [indep val, depen val]
    '''
    def find_min(a):
        return a.min()

    if not tune_path:
        tune_path = "./data/ant/ant-1.4.csv"
    if not test_path:
        test_path = "./data/ant/ant-1.5.csv"
    df_tune = get_data(tune_path, "tune")
    df_test = get_data(test_path, "test")
    if isPCA:
        tune_x, tune_y = pca_analysis(df_tune)
        test_x, test_y = pca_analysis(df_test)
    else:
        tune_x, tune_y = get_xy(df_tune, normalize=True)
        test_x, test_y = get_xy(df_test, normalize=True)

    # use the testing data to do the clustering
    kmean = KMeans(n_clusters=cluster).fit(test_x)
    avg_distance = kmean.inertia_ / float(len(test_x))
    tune_distance = kmean.transform(tune_x)
    min_distance = np.apply_along_axis(find_min, 1, tune_distance)
    # keep tuning rows whose nearest cluster center is closer than twice
    # the average distance on the test data
    pick_index = min_distance < avg_distance * 2
    normal_tune_x, normal_tune_y = get_xy(df_tune, normalize=False)
    _tune_x, _tune_y = normal_tune_x[pick_index], normal_tune_y[pick_index]
    return [_tune_x, _tune_y]
def recommend(X, x, user, threshold=5):
    model = KMeans(n_clusters=2, random_state=0)
    print("X ", X)
    model.fit(X[:, 1:])
    frame = pd.DataFrame(X)
    frame['cluster'] = model.predict(X[:, 1:])
    k = model.predict(user.reshape(1, -1)[:, 1:])
    print("K", k)
    frame.columns = ['User_id', 'F1', 'F2', 'F3', 'F4', 'F5', 'F6', 'F7',
                     'F8', 'F9', 'F10', 'cluster']
    print(frame)
    data = frame[frame["cluster"] == k[0]].iloc[:, 1:11]
    print(" ", data)
    distances = model.transform(data)
    required_distances = distances[:, k].reshape(distances.shape[0])
    # ascending sort keeps the users *nearest* the cluster center; the
    # original negated sort returned the farthest users, which looks
    # unintended for a recommender
    idx = required_distances.argsort()[:threshold]
    print(required_distances)
    # return user names with the help of idx
    print(len(data.iloc[idx, :].index))
    print("X", x)
    list_recommended_users = []
    list_index = []
    for i in range(len(data.iloc[idx, :].index)):
        index = data.iloc[idx, :].index[i]
        list_index.append(index)
        print("series", frame.iloc[index, 0])
        list_recommended_users.append(int(x[index, 0]))
    print(list_recommended_users)
    return list_recommended_users, list_index
    # recommend(X, y, model_kmeans, 0, 2)
def init_cluster(word_vectors):
    print(word_vectors.vectors)
    model = KMeans(n_clusters=3, max_iter=1000, random_state=True,
                   n_init=50).fit(X=word_vectors.vectors)
    labels = model.labels_
    silhouette_score = metrics.silhouette_score(word_vectors.vectors, labels,
                                                metric='euclidean')
    print("Score (opposite of the value of X on the K-means objective, i.e. "
          "sum of distances of samples to their closest cluster center):")
    print(model.score(word_vectors.vectors))
    print("Silhouette score:")
    print(silhouette_score)
    print(word_vectors.similar_by_vector(model.cluster_centers_[0], topn=50,
                                         restrict_vocab=None))
    print(word_vectors.similar_by_vector(model.cluster_centers_[1], topn=50,
                                         restrict_vocab=None))
    print(word_vectors.similar_by_vector(model.cluster_centers_[2], topn=50,
                                         restrict_vocab=None))

    y_kmeans = model.predict(word_vectors.vectors)
    plt.scatter(word_vectors.vectors[:, 0], word_vectors.vectors[:, 1],
                c=y_kmeans, s=50, cmap='viridis')
    centers = model.cluster_centers_
    plt.scatter(centers[:, 0], centers[:, 1], c='black', s=200, alpha=0.5)
    plt.show()

    words = pd.DataFrame(word_vectors.vocab.keys())
    words.columns = ['words']
    words = words[words['words'].str.len() > 3].reset_index(drop=True)
    # word_vectors is already a KeyedVectors object, so index it directly
    # (the original used word_vectors.wv, which belongs on the full model)
    words['vectors'] = words['words'].apply(lambda x: word_vectors[x])
    words['cluster'] = words['vectors'].apply(
        lambda x: model.predict([np.array(x)]))
    words['cluster'] = words['cluster'].apply(lambda x: x[0])
    words['cluster_value'] = [1 if i == 0 else -1 if i == 1 else 0
                              for i in words['cluster']]
    # closeness = inverse distance to the nearest cluster center
    words['closeness_score'] = words.apply(
        lambda x: 1 / (model.transform([x['vectors']]).min()), axis=1)
    words['sentiment_coeff'] = words['closeness_score'] * words['cluster_value']
    words.to_csv('metrics_results\\predictive_scores_{}.csv'.format(time_stamp),
                 index=False)
    return words
def get_features(nb_peaks, X, Y):
    data = np.empty((1, 10))
    kmeans = KMeans(n_clusters=nb_peaks, random_state=1).fit(X)
    sample_silhouette_values = silhouette_samples(X, kmeans.labels_)
    # keep, for every sample, only the distance to its own cluster center
    a = kmeans.transform(X)
    a = np.take_along_axis(a, kmeans.labels_.reshape(-1, 1), axis=1)
    indexes = np.sort(np.unique(kmeans.labels_, return_index=True)[1])
    for unique, center in zip(kmeans.labels_[indexes],
                              np.sort(kmeans.cluster_centers_, axis=0).ravel()):
        group = a[kmeans.labels_ == unique]
        group_y = Y[kmeans.labels_ == unique]
        group_x = X[kmeans.labels_ == unique]
        sil = sample_silhouette_values[kmeans.labels_ == unique].mean()
        sample = np.array([
            group.std(), group.shape[0], center, group_x.mean(), sil,
            group.sum(), group_x.std(), group.mean(), group_y.mean(),
            group_y.std()
        ]).reshape(1, 10)
        data = np.concatenate((data, sample), axis=0)
    data = data[1:, :]
    centers = np.sort(kmeans.cluster_centers_.ravel()).reshape(1, -1)
    return data, centers
def sample_tag_with_kmeans(self, batch, topk_tag=1000, n_cluster=3):
    src_inputs = batch.src[0]
    src_lengths = batch.src[1].tolist()
    context, enc_states = self.model.encode(src_inputs, src_lengths)
    sampler_output = self.model.sample_tag(src_inputs, src_lengths)
    sampler_output.data[0] = -1e20
    tag_log_probs = sampler_output
    selected_tag_score, selected_tag_pos = tag_log_probs.data.topk(
        topk_tag, dim=-1)
    # selected_tag = tag_inputs[selected_tag_pos[-1]].unsqueeze(-1)
    tag_hidden = self.model.tag_encode(
        Variable(selected_tag_pos).unsqueeze(0)).squeeze(0)
    tag_c = self.model.seq2seq.decoder.cal_tag_atten(tag_hidden, context)
    tag_c = tag_c.data.cpu().numpy()

    clf = KMeans(n_clusters=n_cluster, init='k-means++', max_iter=300)
    clf.fit(tag_c)
    distance = clf.transform(tag_c)
    np_topk_tag_idx = selected_tag_pos.tolist()
    np_topk_tag_score = selected_tag_score.tolist()
    clusters = [[] for _ in range(n_cluster)]
    for i, (data, log_prob, c) in enumerate(
            zip(np_topk_tag_idx, np_topk_tag_score, tag_c)):
        clusters[clf.labels_[i]].append(
            (data, log_prob, i, distance[i][clf.labels_[i]]))
    for idx, cluster in enumerate(clusters):
        clusters[idx] = sorted(cluster, key=lambda x: -(x[3]))
    return clusters
def initiateBeta(num_of_variables, num_of_clusters, initialData):
    """initiate beta"""
    # simplex lattice design points
    weights = np.loadtxt("SLD5.txt", delimiter=" ")
    center = np.array([[1.0 / num_of_variables
                        for i in range(num_of_variables)]])
    weights = np.concatenate((weights, center), axis=0)
    weights *= num_of_variables
    sqrtWeights = np.sqrt(weights)
    ys = []
    for weight in sqrtWeights:
        estimator = KMeans(num_of_clusters)
        data = initialData * weight
        estimator.fit_predict(data)
        # squared distance of each point to its nearest center
        minDistance = np.min(estimator.transform(data), 1)
        square = np.power(minDistance, 2)
        sum_of_squares = np.sum(square)
        meanSquareDistance = sum_of_squares / (len(square) - 1)
        ys.append(meanSquareDistance)
    npys = np.array(ys)
    beta = lstsq(weights, npys)[0]
    return beta
def refine_centers(self, resultlist):
    '''
    In this step each cluster should be taken seriously: the algorithm finds
    centers which are close to each other and combines them into a single
    center. The recommended input is the output of find_states_kmeans_step2().

    :param resultlist: a list of dicts; each member describes a cluster as
        {"index", "value", "members", "average_inertia"}
    :return: the KMeans object containing the refined cluster centers
    '''
    threshold = 0.065
    tocluster = [i["value"] for i in resultlist]
    maxclusters = len(resultlist)
    minstates = 2
    if maxclusters == 1:
        minstates = 1
    for n_clusters in range(minstates, maxclusters + 1):
        clusterer = KMeans(n_clusters=n_clusters, random_state=10)
        if maxclusters == 1:
            clusterer = KMeans(n_clusters=1, random_state=10)
            clusterer.fit(tocluster)
            return clusterer
        clusterer.fit(tocluster)
        loss = clusterer.transform(tocluster)
        for i in loss:
            i.sort()
            if i[0] > threshold * i[1]:
                # the n_clusters should be updated; try the next value
                break
        else:
            return clusterer
    warnings.warn('zhai: all cluster centers are close to each other')
    return clusterer
def runKmeans(X, cluster_range):
    centroids = []
    cld = []
    clparms = []
    basedist = 0
    clustdist = []
    interias = []
    distortions = []
    silhout = []
    labelss = []
    # Run clustering for each K in cluster_range and save the results
    for i in cluster_range:
        kmeans = KMeans(n_clusters=i, n_init=30)
        # compute k-means clustering
        kmeans.fit(X)
        label = kmeans.labels_
        labelss.append(label)
        # transform X to a cluster-distance space
        alldist = kmeans.transform(X)
        clustdist.append(alldist)
        # coordinates of cluster centers
        centroid = kmeans.cluster_centers_
        centroids.append(centroid)
        # predict the closest cluster each sample in X belongs to
        labels = kmeans.predict(X)
        # sum of squared distances of samples to their closest cluster center
        interia = kmeans.inertia_
        # The silhouette score gives the average over all samples, a view of
        # the density and separation of the clusters. It is undefined for a
        # single cluster, so assume 1 when k == 1.
        if i == 1:
            silhouette_avg = 1
        else:
            silhouette_avg = silhouette_score(X, labels, metric='euclidean')
        silhout.append(silhouette_avg)
        interias.append(interia)
        if basedist == 0:
            basedist = interia
        cld.append(labels)
        clparms.append(interia / basedist)
    return (cld, clustdist, clparms, interias, distortions, silhout,
            centroids, labelss)
def KnnClassify(self, candi):
    words = self.extracAllword(candi)
    word_dict = {w: idx for idx, w in enumerate(words)}
    x = [[0 for _ in range(len(words))] for _ in range(len(candi))]
    if len(x) < 3:
        return candi
    for id, s in enumerate(candi):
        tmp = self.text_to_vector(s)
        for k, v in tmp.items():
            x[id][word_dict[k]] = float(v)
    km = KMeans(n_clusters=3)
    km.fit(x)
    samples = {}
    X_new = km.transform(x)
    for idx, l in enumerate(km.labels_):
        # group each sample's distance to its own center by cluster label
        samples.setdefault(l, {})[idx] = X_new[idx][l]
    ret = []
    for k, v in samples.items():
        sortedv = sorted(v.items(), key=operator.itemgetter(1), reverse=True)
        for it in sortedv:
            ret.append(candi[it[0]])
    return ret
def __init__(self, X, n_clusters, n_init=3, method="kmcuda"):
    if method == "kmcuda":
        self.inertia = np.inf
        for _ in range(n_init):
            centers, y_pred = kmeans_cuda(X.astype(np.float32), n_clusters)
            full_idx = np.arange(len(X))
            centroids_idxs = []
            inertia = 0
            for i in range(n_clusters):
                idx = full_idx[y_pred == i]
                if len(idx) != 0:
                    X_sub = X[idx]
                    norm = la.norm(X_sub - centers[i], axis=1)
                    min_idx = norm.argmin()
                    centroids_idxs.append(idx[min_idx])
                    inertia += np.sum(norm)
                else:
                    centroids_idxs.append(0)
            centroids_idxs = np.array(centroids_idxs)
            if inertia < self.inertia:
                # remember the best run; the original never updated
                # self.inertia, so the last run always won
                self.inertia = inertia
                self.centers = centers
                self.y_pred = y_pred
                self.centroids_idxs = centroids_idxs
    elif method == "sklearn":
        km = KMeans(n_clusters, n_init=n_init)
        self.y_pred = km.fit_predict(X)
        self.centers = km.cluster_centers_
        # for each center, the index of the nearest sample
        self.centroids_idxs = km.transform(X).argmin(axis=0)
    else:
        raise NotImplementedError
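For reference, a small sketch (toy data, assumed shapes) of the medoid-index trick used in the sklearn branch above: transform() yields an (n_samples, n_clusters) distance matrix, so argmin over axis 0 picks, for each center, the sample closest to it.

import numpy as np
from sklearn.cluster import KMeans

X = np.random.default_rng(0).normal(size=(200, 5))
km = KMeans(n_clusters=4, n_init=10, random_state=0)
y_pred = km.fit_predict(X)
# Column j holds every sample's distance to center j; the row index of the
# minimum in column j is the sample nearest that center (a medoid-like point).
centroid_idxs = km.transform(X).argmin(axis=0)
assert centroid_idxs.shape == (4,)
print(centroid_idxs, y_pred[centroid_idxs])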
def inertia_clustering_analysis(ds, max_clusters=13):
    inertia_val = np.array([])
    # cluster counts run from 2 to max_clusters + 1
    for i in np.arange(max_clusters) + 2:
        kmeans = KMeans(init='k-means++', n_clusters=i, n_init=10)
        # fit_transform: the model must be fitted before inertia_ exists
        # (the original called transform() on an unfitted estimator)
        kmeans.fit_transform(ds.samples)
        inertia_val = np.append(inertia_val, kmeans.inertia_)
    f = plt.figure()
    a = f.add_subplot(111)
    a.plot(inertia_val)
    plt.show()
    return inertia_val
def kmeanFinal(arr, K, rand_state):
    kmeans = KMeans(n_clusters=K, random_state=rand_state).fit(arr)
    kmeans_transform = kmeans.transform(arr)

    centroids = kmeans.cluster_centers_
    labels = kmeans.labels_
    inertia = kmeans.inertia_

    # iVal = total count of vector points, label = centroid index
    varianceVal = 0
    retVal = [0] * 20
    retCount = [0] * 20
    for iVal, label in enumerate(kmeans.labels_):
        sq_dist = kmeans_transform[iVal][label] ** 2
        varianceVal += sq_dist
        retVal[label] += sq_dist
        retCount[label] += 1
    iVal = len(kmeans.labels_)
    return (centroids, labels, inertia, kmeans_transform, iVal, varianceVal,
            retVal, retCount)
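As a sanity check on the bookkeeping above (a sketch on toy data, not from the original source): summing the squared transform-distance of each sample to its own label's center reproduces kmeans.inertia_.

import numpy as np
from sklearn.cluster import KMeans

X = np.random.default_rng(3).normal(size=(150, 4))
km = KMeans(n_clusters=3, n_init=10, random_state=3).fit(X)
d = km.transform(X)                          # (150, 3) distances
own = d[np.arange(len(X)), km.labels_]       # distance to own center
assert np.isclose((own ** 2).sum(), km.inertia_)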
def calculate_silhouette_score(self, best_score: float = -1,
                               k_range: Tuple[int, int] = (2, 20)) -> int:
    """
    Calculate the best number of clusters via the silhouette score method.

    @param best_score: best score to start from
    @param k_range: possible range of cluster numbers
    @return: best quantity of clusters
    """
    if k_range[0] < 2:
        # the silhouette score needs at least 2 clusters
        _temp_k_range = list(k_range)
        _temp_k_range[0] = 2
        k_range = tuple(_temp_k_range)
    for k in range(*k_range):
        model = KMeans(
            n_clusters=k,
            init="k-means++",
            max_iter=500,
            n_init=100,
            n_jobs=-1,  # note: the n_jobs parameter was removed in sklearn 1.0
            algorithm="full",
        )
        model.fit(self.matrix)
        labels = model.predict(self.matrix)
        # the silhouette is computed in the cluster-distance space here
        score = silhouette_score(model.transform(self.matrix), labels)
        if score > best_score:
            self.best_k = k
            best_score = score
        print(
            f"Current cluster: {k}, silhouette score: {score} "
            f"(current best K: {self.best_k})"
        )
    print(f"The best K number is: {self.best_k}")
    return self.best_k
def get_dist_graph(all_points, num_anchor=300):
    """
    Get the cluster centers as anchors by k-means++ and calculate the
    distance graph (n data points vs m anchors).

    :param all_points: n data points
    :param num_anchor: m anchors, default = 300
    :return: distance graph, n x m
    """
    # sample a subset to keep the clustering cheap
    num_data = np.size(all_points, 0)
    sample_rate = 3000
    ind = random.sample(range(num_data), sample_rate)
    sample_points = all_points[ind, :]
    kmeans = KMeans(n_clusters=num_anchor, random_state=0, n_jobs=16,
                    max_iter=50).fit(sample_points)
    km = kmeans.transform(all_points)
    print('dist graph done!')
    return np.asarray(km)
def post_cluster(url, id, tfidf_vec):
    from sklearn.cluster import KMeans
    kmean = KMeans(n_clusters=300)
    print("kmeans")
    kmean.fit(tfidf_vec)
    pred = kmean.transform(tfidf_vec)
    count1 = 0
    count2 = 0
    pred_str = []
    for item in pred:
        count1 += 1
        vec = ""
        for tmp in item:
            vec += str(tmp)[0:7] + "\t"
        pred_str.append(vec)
    print(len(pred_str))
    print(len(id))
    pred = kmean.predict(tfidf_vec)
    fo = open(url + "/cluster.txt", "a+")
    for i in range(len(pred)):
        count2 += 1
        fo.write(id[i] + "\t" + str(pred[i]) + "\n")
    fo.close()
    print("%d+%d" % (count1, count2))
def clustering(self, k):
    word_vectors = self.__model_p__.wv
    KM_model = KMeans(n_clusters=k, max_iter=1000, random_state=True,
                      n_init=50).fit(X=word_vectors.vectors)
    center_closest = []
    for i in range(k):
        center_closest.append([
            el[0] for el in word_vectors.similar_by_vector(
                KM_model.cluster_centers_[i], topn=15, restrict_vocab=None)
        ])
    metric_str = 'euclidean'
    score = silhouette_score(word_vectors.vectors,
                             KM_model.predict(word_vectors.vectors),
                             metric=metric_str)
    print("silhouette_score:", score)
    SVmodel = SilhouetteVisualizer(KM_model, is_fitted=True)
    SVmodel.fit(word_vectors.vectors)
    SVmodel.show()
    words = pd.DataFrame(word_vectors.vocab.keys(), columns=['words'])
    words['vectors'] = words.words.apply(lambda x: word_vectors[f'{x}'])
    words['cluster'] = words.vectors.apply(
        lambda x: KM_model.predict([np.array(x)]))
    words.cluster = words.cluster.apply(lambda x: x[0])
    # closeness = inverse distance to the nearest cluster center
    words['closeness_score'] = words.apply(
        lambda x: 1 / (KM_model.transform([x.vectors]).min()), axis=1)
    return KM_model, center_closest, score, words
def do_kmeans(X=None, n_clusters=None, articles_df=None, features=None):
    kmeans = KMeans(n_clusters=n_clusters, verbose=True)
    kmeans.fit(X)
    # the nearest center per document is the predicted cluster
    assigned_cluster = kmeans.transform(X).argmin(axis=1)
    print(assigned_cluster)
    print('kmeans_class dist:', Counter(assigned_cluster))
    articles_df['kmeans.text_' + str(n_clusters).zfill(3)] = assigned_cluster
    top_centroids = kmeans.cluster_centers_.argsort()[:, -1:-20:-1]
    print('top centroids:\n', top_centroids)
    cl = []
    for num, centroid in enumerate(top_centroids):
        cl.append([num, [", ".join(features[i] for i in centroid)]])
    l = pd.DataFrame(cl)
    l.columns = ['kmeans.text_' + str(n_clusters).zfill(3), 'features']
    print(l)
    articles_df = pd.merge(l, articles_df)
    print('n,inertia', n_clusters, kmeans.inertia_)
    writecols = ['symbol', 'gics8', 'kmeans.text_' + str(n_clusters).zfill(3)]
    articles_df[writecols].to_csv(
        '../data/kmeans_text.' + str(n_clusters).zfill(3) + '.csv',
        index=False)
    for i in range(kmeans.n_clusters):
        cluster = np.arange(0, X.shape[0])[assigned_cluster == i]
        ss = articles_df.loc[
            articles_df['kmeans.text_' + str(n_clusters).zfill(3)] == i]
        # DataFrame.sort() was removed from pandas; sort_values replaces it
        ss = ss.sort_values('mc', ascending=False)
        print("cluster {}:".format(i))
        print(ss[['name', 'mc', 'gics8']][0:10])
        print(ss.iloc[0][['features']].values.tolist())
    print(articles_df.info(verbose=True, null_counts=True))
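A note on the pattern above, as an illustrative check (synthetic data, not from the original source): transform(X).argmin(axis=1) is equivalent to predict(X), since both pick the nearest center.

import numpy as np
from sklearn.cluster import KMeans

X = np.random.default_rng(7).normal(size=(100, 6))
km = KMeans(n_clusters=4, n_init=10, random_state=7).fit(X)
# fraction of samples where the two assignment rules agree; expect 1.0
print((km.transform(X).argmin(axis=1) == km.predict(X)).mean())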
def k_means(fname, dim=3, cluster_num=5, show_img=False):
    '''
    Cluster the data into cluster_num groups and visualize them in a 3D space.

    :param fname: the path of the pca_table file
    :param dim: the number of dimensions we want to use
    :param cluster_num: the number of clusters we want to form
    :param show_img: show the k-means image or not
    :return: names, labels, and the cluster-distance matrix
    '''
    assert isinstance(fname, str)
    data, names = pca_processing(fname, dim)
    X = np.array(data)
    # "km" avoids shadowing this function's own name
    km = KMeans(n_clusters=cluster_num).fit(X)
    labels = km.labels_
    fig = plt.figure(1, figsize=(4, 3))
    ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=48, azim=134)
    ax.scatter(X[:, 1], X[:, 0], X[:, 2], c=labels.astype(float),
               edgecolor='k')
    if show_img:
        fig.show()
    distance = km.transform(X)
    return names, labels, distance
def kmeans_betacv(data, num_cluster, batch_kmeans=False, n_runs=10,
                  confidence=0.90):
    '''
    Computes the BetaCV for running KMeans on the dataset. Returns the BetaCV
    value and half of the size of the confidence interval for that value
    (BetaCV is an average over the number of runs given).

    Arguments
    ---------
    data: matrix
        A matrix of observations. If this is sparse, `batch_kmeans` must be
        True.
    num_cluster: int
        Number of clusters to run k-means for.
    batch_kmeans: bool (defaults to False)
        Whether `sklearn.cluster.MiniBatchKMeans` should be used. This is
        faster and suitable for sparse datasets, but less accurate.
    n_runs: int (default = 10)
        Number of runs to compute the BetaCV.
    confidence: double [0, 1) (default = 0.9)
        The confidence used to compute half the confidence interval size.

    Returns
    -------
    The BetaCV and half of the confidence interval size.
    '''
    if not batch_kmeans:
        algorithm = KMeans(num_cluster)
    else:
        algorithm = MiniBatchKMeans(num_cluster)

    inter_array = np.zeros(n_runs)
    intra_array = np.zeros(n_runs)
    for i in range(n_runs):
        # run KMeans
        algorithm.fit(data)
        centers = algorithm.cluster_centers_
        labels = algorithm.labels_

        # KMeans in sklearn uses euclidean distances
        dist_centers = pairwise.euclidean_distances(centers)

        # inter-cluster distance
        mean_dist_between_centers = np.mean(dist_centers)
        inter_array[i] = mean_dist_between_centers

        # intra-cluster distance: each sample to its own center
        dist_all_centers = algorithm.transform(data)
        intra_dists = []
        for doc_id, cluster in enumerate(labels):
            intra_dists.append(dist_all_centers[doc_id, cluster])
        intra_array[i] = np.mean(intra_dists)

    betacv = intra_array / inter_array
    cinterval = half_confidence_interval_size(betacv, confidence)
    return np.mean(betacv), cinterval
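A usage sketch on synthetic data (illustrative only; half_confidence_interval_size is assumed to be defined alongside the function in its module). A lower BetaCV means intra-cluster distances are small relative to the distances between centers.

import numpy as np

data = np.random.default_rng(0).normal(size=(300, 8))
betacv, half_ci = kmeans_betacv(data, num_cluster=3, n_runs=5)
print('BetaCV: %.3f +/- %.3f' % (betacv, half_ci))  # lower is better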
def cluster_encode(X_train, X_test, codebook='kmeans', k=25):
    if codebook == 'kmeans':
        cb = KMeans(k, n_init=1, init='random')
    elif codebook == 'gmm':
        # note: the old sklearn GMM class exposes no transform(); only the
        # kmeans branch supports the transform calls below
        cb = GMM(n_components=k)
    n_train = X_train.shape[0]
    X = np.vstack((X_train, X_test))
    X = StandardScaler().fit_transform(X)
    print('_' * 80)
    print('fitting codebook')
    print()
    print(cb)
    print()
    cb.fit(X)
    print('fin.')
    # transform the *scaled* splits; the original transformed the raw inputs
    # even though the codebook was fitted on standardized data
    X_train = cb.transform(X[:n_train])
    X_test = cb.transform(X[n_train:])
    return X_train, X_test
def compute_clusters(topics, match):
    recipe_topics = topics['W'][match, :]
    cluster = KMeans(n_clusters=4)
    # cluster = AffinityPropagation()
    cluster.fit(recipe_topics)
    distances = cluster.transform(recipe_topics)
    return cluster, distances
def _cluster(self, index):
    data = self.data[index]
    kmeans = KMeans(n_clusters=2, random_state=0).fit(data)
    labels = kmeans.labels_
    l_i = np.where(labels == 0)[0]
    r_i = np.where(labels == 1)[0]
    left_index = index[l_i]
    right_index = index[r_i]
    if len(right_index) - len(left_index) > 1:
        distances = kmeans.transform(data[r_i])
        left_index, right_index = self._rebalance(
            left_index, right_index, distances[:, 1])
    elif len(left_index) - len(right_index) > 1:
        distances = kmeans.transform(data[l_i])
        left_index, right_index = self._rebalance(
            right_index, left_index, distances[:, 0])
    return left_index, right_index
def test_transform():
    km = KMeans(n_clusters=n_clusters)
    km.fit(X)
    X_new = km.transform(km.cluster_centers_)
    for c in range(n_clusters):
        assert_equal(X_new[c, c], 0)
        for c2 in range(n_clusters):
            if c != c2:
                assert_greater(X_new[c, c2], 0)
def test_transform():
    # older variant of the same test: k= was later renamed n_clusters=
    k_means = KMeans(k=n_clusters)
    k_means.fit(X)
    X_new = k_means.transform(k_means.cluster_centers_)
    for c in range(n_clusters):
        assert_equal(X_new[c, c], 0)
        for c2 in range(n_clusters):
            if c != c2:
                assert_true(X_new[c, c2] > 0)
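Both tests above rely on the same invariant: transforming the fitted centers yields a square distance matrix with a zero diagonal and strictly positive off-diagonal entries. A standalone check on synthetic data (illustrative, current sklearn API):

import numpy as np
from sklearn.cluster import KMeans

X = np.random.default_rng(1).normal(size=(100, 3))
km = KMeans(n_clusters=5, n_init=10, random_state=1).fit(X)
D = km.transform(km.cluster_centers_)            # shape (5, 5)
assert np.allclose(np.diag(D), 0.0, atol=1e-5)   # each center is ~0 from itself
assert (D + np.eye(5)).min() > 0                 # off-diagonals strictly positive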
def run_k_means(df, numberclusters, geoidlabel='geoid10',
                plot_silouette=True):
    '''Uses sklearn to run kmeans.

    ARGUMENTS:
    1) df: A dataframe with a geoid column
    2) geoidlabel: the label of the geoid column.
    3) plot_silouette: whether or not to plot the silhouettes of each cluster

    OUTPUT: Returns a three part tuple:
    1) the kmeans sklearn model
    2) a dictionary with geoids as the key, and the cluster as the value
    3) a dictionary with clusters as the key, and a list of related geoids
       as the value'''
    # Use K means to cluster the dataset.
    x = df[['wkday_0', 'wkday_1', 'hrbin_morning', 'hrbin_afternoon',
            'hrbin_evening', 'hrbin_latenight', 'hrbin_dawn']].values
    kmeans = KMeans(n_clusters=numberclusters)
    kmeans.fit(X=x)
    features = df.columns.tolist()[1:]

    # store values in dictionaries
    geoid_dict = defaultdict(int)
    cluster_dict = defaultdict(list)

    # transform(x) maps x into a cluster-distance space: each column is a
    # cluster and each row holds a neighborhood block's distance to that
    # cluster. The assigned cluster is the one with the smallest distance.
    assigned_cluster = kmeans.transform(x).argmin(axis=1)

    for i in range(kmeans.n_clusters):
        cluster = np.arange(0, x.shape[0])[assigned_cluster == i]
        # look up the geoids by label; the original indexed a nonexistent
        # 'hrbin_' column via the deprecated df.ix
        geoids = [df.iloc[geoindx][geoidlabel] for geoindx in cluster]
        print(len(geoids), 'cluster #', i)
        # dictionary with cluster as the key and geoids as the list
        cluster_dict[i] = geoids
        # second dictionary to quickly look up each geoid's cluster
        for geo in geoids:
            geoid_dict[geo] = i

    if plot_silouette:
        plot_cluster_silouette_values(x, assigned_cluster, kmeans.n_clusters)

    # save the dictionaries as CSVs
    save_dictionary_as_csv(
        cluster_dict, 'data/intermediate_data/kmeans/kmeans_clusterdict.csv')
    save_dictionary_as_csv(
        geoid_dict, 'data/intermediate_data/kmeans/kmeans_geoiddict.csv')
    return kmeans, geoid_dict, cluster_dict
def cluster_documents(n_clusters, doc_term_matrix):
    kmeans = KMeans(n_clusters=n_clusters)
    kmeans = kmeans.fit(doc_term_matrix)
    distances = kmeans.transform(doc_term_matrix)
    results = distances.argmin(axis=1)
    clusters = defaultdict(list)
    for document_index, cluster in enumerate(results):
        clusters[cluster].append(
            (document_index, distances[document_index, cluster]))
    return clusters
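A hedged usage sketch for the function above: the sentences are invented, and TfidfVectorizer is just one way to build the document-term matrix, not necessarily what the original code used.

from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["the cat sat on the mat", "dogs chase cats",
        "stock prices fell today", "markets rallied on earnings"]
doc_term_matrix = TfidfVectorizer().fit_transform(docs)
clusters = cluster_documents(n_clusters=2, doc_term_matrix=doc_term_matrix)
for cluster_id, members in clusters.items():
    print(cluster_id, members)  # (document_index, distance-to-center) pairs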
def get_kmeans_features(tr_all, ts_all, n_clusters=3, normz=None, axis=0):
    tr_ids, tr_x_orig, tr_y = tr_all
    ts_ids, ts_x_orig = ts_all
    tr_x = np.copy(tr_x_orig)
    ts_x = np.copy(ts_x_orig)
    kmeans = KMeans(n_clusters)
    kmeans.fit(np.append(tr_x, ts_x, axis=0))
    tf_tr_x = kmeans.transform(tr_x)
    tf_ts_x = kmeans.transform(ts_x)
    tf_tr_x, tf_ts_x = normalize_data(tf_tr_x, tf_ts_x, normz, axis)
    return (tr_ids, tf_tr_x, tr_y), (ts_ids, tf_ts_x)
def fit(self, X, Y=None):
    if self.method == 'random':
        N = len(X)
        idx = np.random.randint(N, size=self.M)
        self.samples = X[idx]
    elif self.method == 'normal':
        # just sample from N(0,1)
        D = X.shape[1]
        self.samples = np.random.randn(self.M, D) / np.sqrt(D)
    elif self.method == 'kmeans':
        X, Y = self._subsample_data(X, Y)

        print("Fitting kmeans...")
        t0 = datetime.now()
        kmeans = KMeans(n_clusters=len(set(Y)))
        kmeans.fit(X)
        print("Finished fitting kmeans, duration:", datetime.now() - t0)

        # calculate the most ambiguous points: find the distance between
        # each point and all cluster centers, then keep the points whose
        # distances have the smallest variance
        dists = kmeans.transform(X)  # returns an N x K matrix
        variances = dists.var(axis=1)
        idx = np.argsort(variances)  # smallest to largest
        idx = idx[:self.M]
        self.samples = X[idx]
    elif self.method == 'gmm':
        X, Y = self._subsample_data(X, Y)

        print("Fitting GMM")
        t0 = datetime.now()
        gmm = GaussianMixture(
            n_components=len(set(Y)),
            covariance_type='spherical',
            reg_covar=1e-6)
        gmm.fit(X)
        print("Finished fitting GMM, duration:", datetime.now() - t0)

        # calculate the most ambiguous points
        probs = gmm.predict_proba(X)
        ent = stats.entropy(probs.T)  # N-length vector of entropies
        idx = np.argsort(-ent)  # negate since we want biggest first
        idx = idx[:self.M]
        self.samples = X[idx]
    return self
def clusterize_kmeans(self, *args, **kwargs):
    """
    Cluster hosts using the KMeans algorithm.

    n_clusters : the number of clusters to form

    Updates the self._clusters attribute.
    """
    # Launch KMeans algorithm with all available CPUs (n_jobs=-1).
    kwargs.setdefault('n_jobs', -1)
    classifier = KMeans(*args, **kwargs)
    classifier.fit(self._matrix)
    theorical_centers = classifier.cluster_centers_
    center_hosts_idx = []
    dist = classifier.transform(self._matrix)
    for i_center in range(len(theorical_centers)):
        # the host closest to each theoretical center
        center_hosts_idx.append(np.argsort(dist[:, i_center])[0])
    cluster_labels = classifier.predict(self._matrix)
    self._clusters = KmeansClusters(kwargs['n_clusters'], cluster_labels,
                                    theorical_centers, center_hosts_idx)
def run_clustering(tags):
    # vector of tags achieved from other models
    # tags = ['muslim', 'holy']
    print("Reading word2vec model")
    model = utils_word2vec.read_word2vec()
    word_vectors = model.syn0
    num_clusters = 2 * len(tags) - 1
    print(model.most_similar('iraq'))

    # Initialize a k-means object and use it to extract centroids
    print("Running K means")
    kmeans_clustering = KMeans(n_clusters=num_clusters)
    kmeanFit = kmeans_clustering.fit(word_vectors)
    idx = kmeanFit.labels_
    centers = kmeanFit.cluster_centers_

    # Create a Word / Index dictionary, mapping each vocabulary word to
    # a cluster number
    word_centroid_map = dict(zip(model.index2word, idx))

    clusterDist = kmeans_clustering.transform(word_vectors)
    print(clusterDist.shape)
    cluster_tags = []
    for i in range(0, num_clusters - 1):
        cluster_tags.append(model.index2word[np.argmax(clusterDist[:, i])])
    # ndarrays are unhashable, so use tuples of the centers as dict keys
    centroid_word_map = dict(zip(map(tuple, centers), cluster_tags))

    # Return relevant cluster tags
    top_clusters = create_bag_of_centroids(
        tags, word_centroid_map, centroid_word_map).bag_of_centers
    print("Bag of centroids tags: \n")
    print(top_clusters)
    return top_clusters
    o2o[ind_dic[pair[1]], ind_dic[pair[0]], idx] = \
        o2o[ind_dic[pair[0]], ind_dic[pair[1]], idx]

    # local prediction and blacklist generation part - this dictionary holds
    # each contributor's local blacklist
    print('Computing local predictions...')
    l_blacklists = ts.local_prediction(top_targets, train_set, i)

    # clustering part
    for n_clusters in clusters_values:
        print('Kvalue: ', n_clusters)
        estimator = KMeans(n_clusters=n_clusters)
        X_train = o2o.sum(axis=2)
        labels = estimator.fit(X_train).labels_
        distances = estimator.transform(X_train)

        # initial clusters
        clusters = [np.where(labels == f)[0] for f in range(n_clusters)]

        # keep in each cluster only those contributors that satisfy the
        # distance threshold
        c = []
        for id1, cluster in enumerate(clusters):
            b = [distances[id][id1] for id in cluster]
            threshold = np.percentile(b, 40)
            c.append([idx for idx in cluster
                      if distances[idx][id1] <= threshold])
        clusters = [top_targets[idy] for idy in c]

        conts_in_clusters = set()
        for m in clusters:
def cluster(self, page_dict_list):
    pd = PagesDataset(page_dict_list)
    self.pd = pd
    X_scaled = pd.features_lists
    print(X_scaled)
    # n_clusters= replaces the old k= keyword; precompute_distances and
    # n_jobs were removed from current sklearn
    kmeans = KMeans(n_clusters=self._K, init='k-means++', n_init=10,
                    max_iter=300, tol=0.0001, verbose=0, random_state=None,
                    copy_x=True)
    kmeans.fit(X=X_scaled, y=None)
    Y = kmeans.predict(X_scaled)
    X_new = kmeans.transform(X_scaled)
    print(Y)
    print(X_new)
    print(kmeans.cluster_centers_)

    pages_dict_list_clusters = [[], []]
    for i, (page_dict, cluster_no, distance_pair) in enumerate(
            zip(page_dict_list, Y, X_new)):
        page_dict['ecom_kmeans_dist'] = distance_pair[cluster_no]
        pages_dict_list_clusters[cluster_no].append(page_dict)
    for i, pages_dict_list_cluster in enumerate(pages_dict_list_clusters):
        pages_dict_list_clusters[i] = sorted(
            pages_dict_list_cluster,
            key=lambda dict: dict['ecom_kmeans_dist'])

    product_pages_indices_list = []
    category_pages_indices_list = []
    cluster0_label = self.__label_cluster0(kmeans.cluster_centers_)
    if cluster0_label == "cat":
        cat_no = 0
        prod_no = 1
        retval = True
    elif cluster0_label == "prod":
        cat_no = 1
        prod_no = 0
        retval = True
    else:
        retval = False

    if retval:
        self.product_pages_dict_list = pages_dict_list_clusters[prod_no]
        self.category_pages_dict_list = pages_dict_list_clusters[cat_no]
        for i, cluster_no in enumerate(Y):
            if cluster_no == prod_no:
                product_pages_indices_list.append(i)
            elif cluster_no == cat_no:
                category_pages_indices_list.append(i)
        for dic in self.product_pages_dict_list:
            dic['category'] = 'ecom_product'
        for dic in self.category_pages_dict_list:
            dic['category'] = 'ecom_category'
        self.product_cluster_center = kmeans.cluster_centers_[prod_no]
        self.category_cluster_center = kmeans.cluster_centers_[cat_no]
        self.product_cluster_50pc_dist, self.product_cluster_80pc_dist = \
            self.__get_50pc_80pc_distance(self.product_pages_dict_list)
        self.category_cluster_50pc_dist, self.category_cluster_80pc_dist = \
            self.__get_50pc_80pc_distance(self.category_pages_dict_list)
        self.product_pages_indices_list = product_pages_indices_list
        self.category_pages_indices_list = category_pages_indices_list
    return retval
def run_stack(SEED): model = "" print "Running GB, RF, ET stack." trainBase = csv_io.read_data("../train.csv", skipFirstLine = True, split = ",") test = csv_io.read_data("../test.csv", skipFirstLine = True, split = ",") avg = 0 NumFolds = 5 # 5 is good, but 10 yeilds a better mean since outliers are less significant. (note, predictions are less reliable when using 10). predicted_list = [] bootstrapLists = [] # use this for quick runs. # note RF with 150 crashes on 30 features # GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50), # GradientBoostingRegressor(learn_rate=0.02, subsample=0.2, max_depth=8, n_estimators=125), # RandomForestRegressor(n_estimators=100, n_jobs=1), #RandomForestRegressor(n_estimators=75, n_jobs=1), # clfs = [ExtraTreesRegressor(n_estimators=50, min_density=0.2, n_jobs=1, bootstrap=True, random_state=1), # SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=2**-5.5, kernel='rbf', probability=False, shrinking=True, tol=0.001, verbose=False) # ] #knn 5 at 3.45 #knn 15 at 3.31 #knn 25 at 3.30 #knn 40 at 3.31 # KNeighborsRegressor(n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30, warn_on_equidistant=False, p=2), # KNeighborsRegressor(n_neighbors=15, weights='uniform', algorithm='auto', leaf_size=30, warn_on_equidistant=False, p=2), # KNeighborsRegressor(n_neighbors=25, weights='uniform', algorithm='auto', leaf_size=30, warn_on_equidistant=False, p=2), # LinearRegression at 3.77 # Ridge at 3.77 # SGD 4.23 #Gauss at 13 # LinearRegression(fit_intercept=True, normalize=False, copy_X=True), # Ridge(alpha=1.0, fit_intercept=True, normalize=False, copy_X=True, tol=0.001), # SGDRegressor(loss='squared_loss', penalty='l2', alpha=0.0001, rho=0.84999999999999998, fit_intercept=True, n_iter=5, shuffle=False, verbose=0, epsilon=0.10000000000000001, p=None, seed=0, learning_rate='invscaling', eta0=0.01, power_t=0.25, warm_start=False), # GaussianNB() # clfs = [KNeighborsRegressor(n_neighbors=15, weights='uniform', algorithm='auto', leaf_size=30, warn_on_equidistant=False, p=2), # KNeighborsRegressor(n_neighbors=25, weights='uniform', algorithm='auto', leaf_size=30, warn_on_equidistant=False, p=2),KNeighborsRegressor(n_neighbors=35, weights='uniform', algorithm='auto', leaf_size=30, warn_on_equidistant=False, p=2) # ] # GB, 125 est is minimum, score is bad below this, explore higher and other dimensions. ****************** # clfs = [GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=100, random_state=166), # GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=8, n_estimators=100, random_state=166), # GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=10, n_estimators=100, random_state=166), # GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=200, random_state=166), # GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=8, n_estimators=200, random_state=166),GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=10, n_estimators=200, random_state=166) # ] # about 1 hour run time, and 3.10 score. 
    # GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=400, random_state=166)  # about 2 hours run time at 3.05
    # GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=8, n_estimators=400, random_state=166)  # about 2 hours run time at 3.06
    # GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=800, random_state=166)  # about 4 hours run time at 3.06
    # GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=8, n_estimators=800, random_state=166)

    # http://stackoverflow.com/questions/15150339/python-memory-error-sklearn-huge-input-data
    # For high-dimensional sparse data and many samples, LinearSVC, LogisticRegression,
    # PassiveAggressiveClassifier or SGDClassifier can be much faster to train for comparable predictive accuracy:
    # LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None)
    # LinearSVC(penalty='l2', loss='l2', dual=True, tol=0.0001, C=1.0, multi_class='ovr', fit_intercept=True, intercept_scaling=1, class_weight=None, verbose=0, random_state=None)
    # PassiveAggressiveClassifier(C=1.0, fit_intercept=True, n_iter=5, shuffle=False, verbose=0, loss='hinge', n_jobs=1, random_state=None, warm_start=False)
    # SGDClassifier(loss='hinge', penalty='l2', alpha=0.0001, l1_ratio=0.15, fit_intercept=True, n_iter=5,
    #               shuffle=False, verbose=0, epsilon=0.1, n_jobs=1, random_state=None, learning_rate='optimal',
    #               eta0=0.0, power_t=0.5, class_weight=None, warm_start=False, rho=None, seed=None)

    clfs = [RandomForestClassifier(n_estimators=500, n_jobs=1, criterion='gini')]

    # best: SVC(C=1000000.0, kernel='rbf', degree=3, gamma=0.0, coef0=0.0, shrinking=True, probability=False,
    #           tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1),
    # best: LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1000.0, fit_intercept=True,
    #                          intercept_scaling=1, class_weight=None, random_state=None),
    # SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0, kernel='rbf',
    #     max_iter=-1, probability=False, shrinking=True, tol=0.001, verbose=False)

    # use this for quick runs:
    # clfs = ([GradientBoostingClassifier(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50, random_state=166),
    #          GradientBoostingClassifier(learn_rate=0.02, subsample=0.2, max_depth=8, n_estimators=125, random_state=551)]
    #         # the (max_depth, n_estimators) sweep, one fixed seed per config
    #         + [GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=d, n_estimators=n, random_state=r)
    #            for (d, n), r in zip([(2, 80), (3, 80), (4, 80),
    #                                  (2, 120), (3, 120), (4, 120),
    #                                  (2, 160), (3, 160), (4, 160),
    #                                  (1, 200), (2, 200), (3, 200), (4, 200),
    #                                  (1, 240), (2, 240), (3, 240),
    #                                  (1, 280), (2, 280), (3, 280),
    #                                  (1, 320), (2, 320)],
    #                                 [441, 331, 221, 91, 81, 71, 61, 51, 41,
    #                                  31, 21, 10, 19, 18, 17, 16, 15, 14, 13, 12, 11])]
    #         + [RandomForestClassifier(n_estimators=100, n_jobs=1, criterion='gini'),
    #            RandomForestClassifier(n_estimators=100, n_jobs=1, criterion='entropy'),
    #            ExtraTreesClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='gini', bootstrap=True, random_state=1),
    #            ExtraTreesClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='entropy', bootstrap=True, random_state=5)])
    # use this for quick runs; reduced estimators to 50.
    # clfs = [SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3,
    #             gamma=2**-5.5, kernel='rbf', probability=False, shrinking=True,
    #             tol=0.001, verbose=False)]
    # GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50),
    # ExtraTreesRegressor(n_estimators=50, min_density=0.2, n_jobs=1, bootstrap=True, random_state=1)

    # same (max_depth, n_estimators) sweep as above, without fixed seeds on the boosters:
    # clfs = ([ExtraTreesClassifier(n_estimators=150, min_density=md, n_jobs=1, criterion=c, bootstrap=True, random_state=r)
    #          for md, c, r in [(0.2, 'gini', 1), (0.09, 'gini', 2), (0.2, 'entropy', 5), (0.09, 'entropy', 6)]]
    #         + [GradientBoostingClassifier(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50),
    #            GradientBoostingClassifier(learn_rate=0.02, subsample=0.2, max_depth=8, n_estimators=125)]
    #         + [GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=d, n_estimators=n)
    #            for d, n in [(2, 80), (3, 80), (4, 80), (2, 120), (3, 120), (4, 120),
    #                         (2, 160), (3, 160), (4, 160), (1, 200), (2, 200), (3, 200), (4, 200),
    #                         (1, 240), (2, 240), (3, 240), (1, 280), (2, 280), (3, 280),
    #                         (1, 320), (2, 320)]]
    #         + [RandomForestClassifier(n_estimators=150, min_density=md, n_jobs=1, criterion=c, bootstrap=True, random_state=r)
    #            for md, c, r in [(0.2, 'gini', 1), (0.09, 'gini', 2), (0.2, 'entropy', 5), (0.05, 'entropy', 7)]])
    # full algorithm stack:
    # clfs = ([ExtraTreesClassifier(n_estimators=150, min_density=md, n_jobs=1, criterion=c, bootstrap=True, random_state=r)
    #          for md, c, r in [(0.2, 'gini', 1), (0.09, 'gini', 2), (0.05, 'gini', 3), (0.02, 'gini', 4),
    #                           (0.2, 'entropy', 5), (0.09, 'entropy', 6), (0.05, 'entropy', 7), (0.02, 'entropy', 8)]]
    #         + [GradientBoostingClassifier(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50),
    #            GradientBoostingClassifier(learn_rate=0.02, subsample=0.2, max_depth=8, n_estimators=125)]
    #         # ('lesarn_rate' typo in the original fixed to 'learn_rate')
    #         + [GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=d, n_estimators=n)
    #            for d, n in [(2, 80), (3, 80), (4, 80), (2, 120), (3, 120), (4, 120),
    #                         (2, 160), (3, 160), (4, 160), (1, 200), (2, 200), (3, 200), (4, 200),
    #                         (1, 240), (2, 240), (3, 240), (1, 280), (2, 280), (3, 280),
    #                         (1, 320), (2, 320)]]
    #         + [RandomForestClassifier(n_estimators=150, min_density=md, n_jobs=1, criterion=c, bootstrap=True, random_state=r)
    #            for md, c, r in [(0.2, 'gini', 1), (0.09, 'gini', 2), (0.05, 'gini', 3), (0.02, 'gini', 4),
    #                             (0.2, 'entropy', 5), (0.09, 'entropy', 6), (0.05, 'entropy', 7), (0.02, 'entropy', 8)]])
    print "Data size: ", len(trainBase), len(test)

    dataset_blend_train = np.zeros((len(trainBase), len(clfs)))
    dataset_blend_test = np.zeros((len(test), len(clfs)))

    trainNew = []
    trainTestNew = []
    testNew = []
    trainNewSelect = []
    trainTestNewSelect = []
    testNewSelect = []

    print "Scaling"
    # targetPre = [x[0] for x in trainBase]
    # trainPre = [x[1:] for x in trainBase]
    # trainPreTemp = [x[1:] for x in trainBase]
    # testPre = [x[1:] for x in test]
    targetPre = [int(x[0]) for x in trainBase]
    trainPre = [[int(i) for i in x[1:]] for x in trainBase]
    trainPreTemp = [[int(i) for i in x[1:]] for x in trainBase]
    testPre = [[int(i) for i in x[1:]] for x in test]

    print "unique: ", len(list(set([x[1] for x in trainBase])))

    # enc = OneHotEncoder()
    # trainPreTemp.extend(testPre)
    # enc.fit(trainPreTemp)
    # print enc.n_values_
    # print enc.feature_indices_
    # trainPre = enc.transform(trainPre)  # .toarray()
    # testPre = enc.transform(testPre)    # .toarray()

    # cluster the training rows; km.transform() will later replace each row
    # by its vector of distances to these 10 centroids
    km = KMeans(n_clusters=10, init='k-means++', n_init=100, max_iter=300, tol=0.0001,
                precompute_distances=True, verbose=0, random_state=None,
                copy_x=True, n_jobs=1).fit(trainPre)

    # scaler = preprocessing.Scaler().fit(trainPre)
    # trainScaled = scaler.transform(trainPre)
    # testScaled = scaler.transform(testPre)
    # print scaler.mean_
    # print scaler.std_

    print "Begin Training"

    for ExecutionIndex, clf in enumerate(clfs):
        print clf
        avg = 0
        predicted_list = []

        dataset_blend_test_set = np.zeros((len(test), NumFolds))
        foldCount = 0

        # Stratified for classification: [trainBase[i][0] for i in range(len(trainBase))]
        # Folds = cross_validation.StratifiedKFold(targetPre, n_folds=NumFolds, indices=True)
        Folds = cross_validation.KFold(len(trainBase), n_folds=NumFolds, indices=True)
        for train_index, test_index in Folds:
            target = [targetPre[i] for i in train_index]
            train = [trainPre[i] for i in train_index]
            # train = trainPre.tocsr()[train_index, :]
            targetTest = [targetPre[i] for i in test_index]
            trainTest = [trainPre[i] for i in test_index]
            # trainTest = trainPre.tocsr()[test_index, :]

            print
            print "Iteration: ", foldCount

            # replace raw features by distances to the 10 KMeans centroids
            train = km.transform(train)
            trainTest = km.transform(trainTest)

            clf.fit(train, target)

            print "Predict"
            prob = clf.predict_proba(trainTest)

            print "Score"
            dataset_blend_train[test_index, ExecutionIndex] = prob[:, 1]

            probSum = 0
            weightSum = 0
            # totalOffByHalf = 0
            # totalPositive = 0
            # totalPositiveOffByHalf = 0
            # totalPositivePredictions = 0

            fpr, tpr, thresholds = metrics.roc_curve(targetTest, prob[:, 1], pos_label=1)
            auc = metrics.auc(fpr, tpr)
            print "Score: ", auc
            # for i in range(0, len(prob)):
            #     probX = prob[i]
            #     probSum += weights[test_index[i]][0] * math.fabs(targetTest[i] - probX)
            #     weightSum += weights[test_index[i]][0]
            #     print "Weight", weights[test_index[i]][0], "Index: ", i, "Test_Index: ", test_index[i], "Actual: ", targetTest[i], "Predicted: ", probX
            #     # log loss calc
            #     probSum += int(targetTest[i]) * log(probX) + (1 - int(targetTest[i])) * log(1 - probX)
            #     if math.fabs(probX - int(targetTest[i])) > 0.5:
            #         totalOffByHalf = totalOffByHalf + 1
            #     if int(targetTest[i]) == 1:
            #         totalPositive = totalPositive + 1
            #     if int(targetTest[i]) == 1 and probX < 0.5:
            #         totalPositiveOffByHalf = totalPositiveOffByHalf + 1
            #     if probX > 0.5:
            #         totalPositivePredictions = totalPositivePredictions + 1
            # print
            # print "Stats:"
            # print "Total Off By > 0.5 ", totalOffByHalf
            # print "Total Positive ", totalPositive
            # print "Total Positive Off By Half ", totalPositiveOffByHalf
            # print "Total Positive Predictions ", totalPositivePredictions
            # print -probSum / len(prob)
            # print "Score: ", probSum / weightSum

            avg += auc / NumFolds

            # the classifier was fit on km-transformed features, so the test set
            # must be transformed the same way (the original passed raw testPre,
            # whose feature count does not match what clf was trained on)
            predicted_probs = clf.predict_proba(km.transform(testPre))
            # predicted_list.append([x[1] for x in predicted_probs])
            dataset_blend_test_set[:, foldCount] = predicted_probs[:, 1]

            foldCount = foldCount + 1
            break  # NOTE: exits after the first fold (quick-run shortcut)

        dataset_blend_test[:, ExecutionIndex] = dataset_blend_test_set.mean(1)

        # print "Saving NP"
        # np.savetxt('temp/dataset_blend_test_set.txt', dataset_blend_test_set)
        # np.savetxt('temp/dataset_blend_test_set.mean.txt', dataset_blend_test_set.mean(1))
        # np.savetxt('temp/dataset_blend_test.txt', dataset_blend_test)
        # print "Done Saving NP"

        now = datetime.datetime.now()
        csv_io.write_delimited_file_single_plus_index(
            "../predictions/Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv",
            dataset_blend_test_set.mean(1))
        csv_io.write_delimited_file_single(
            "../predictions/Target_Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv",
            dataset_blend_train[:, ExecutionIndex])
        csv_io.write_delimited_file("../predictions/RunLog.csv",
                                    [now.strftime("%Y %m %d %H %M %S"), str(avg), str(clf),
                                     str(NumFolds), model, "", ""],
                                    filemode="a", delimiter=",")

        print "------------------------Average: ", avg

    # np.savetxt('temp/dataset_blend_train.txt', dataset_blend_train)
    return dataset_blend_train, dataset_blend_test
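The stack above uses a fitted KMeans as a feature map: each row becomes its vector of distances to the 10 centroids before the forest sees it. Because KMeans exposes fit/transform, the same idea can be written as a scikit-learn Pipeline. A sketch under assumptions: it uses the modern model_selection API and synthetic data rather than the script's csv_io inputs.

import numpy as np
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=500, n_features=20, random_state=0)

pipe = make_pipeline(
    KMeans(n_clusters=10, n_init=10, random_state=0),    # rows -> 10 centroid distances
    RandomForestClassifier(n_estimators=100, random_state=0))
print(cross_val_score(pipe, X, y, cv=5, scoring='roc_auc').mean())

The Pipeline also guarantees that held-out folds pass through the same transform, which is exactly the bug fixed above where predict_proba was called on the raw test features.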
def _find_accuracy(home, appliance, feature="Monthly", num_homes=5):
    # relies on module-level globals: df, all_homes, feature_map, K_min, K_max, F_min, F_max
    np.random.seed(42)
    appliance_df = df.ix[all_homes[appliance]]
    if appliance == "hvac":
        start, stop = 5, 11
    else:
        start, stop = 1, 13

    test_homes = [home]
    train_d = appliance_df[~appliance_df.index.isin([home])]
    train_d_index = train_d[['%s_%d' % (appliance, i) for i in range(start, stop)]].dropna().index
    train_d_feature = train_d.ix[train_d_index][feature_map[feature]].dropna()

    from sklearn.cluster import KMeans
    c = KMeans(n_clusters=num_homes)
    c.fit(train_d_feature)
    to_use = []
    for i in range(num_homes):
        # column i of transform() holds every home's distance to centroid i;
        # note that [::-1] sorts descending, so this picks the homes FARTHEST
        # from the centroid, not the closest
        d = c.transform(train_d_feature)[:, i]
        ind = np.argsort(d)[::-1][:num_homes]
        flag = False
        start_index = 0
        while flag is False:
            if train_d_feature.index.values[ind[start_index]] not in to_use:
                to_use.append(train_d_feature.index.values[ind[start_index]])
                flag = True
            else:
                start_index = start_index + 1
    train_homes = np.array(to_use)
    all_home_appliance = deepcopy(all_homes)
    all_home_appliance[appliance] = train_homes

    # Cross validation on the inner loop to find the best feature set and K
    train_size = len(train_homes)
    l = LeaveOneOut(train_size)
    out = OrderedDict()
    for cv_train, cv_test in l:
        cv_train_home = appliance_df.ix[train_homes[cv_train]]
        cv_train_index = cv_train_home[['%s_%d' % (appliance, i) for i in range(start, stop)]].dropna().index
        cv_train_home = cv_train_home.ix[cv_train_index]
        cv_test_home = appliance_df.ix[train_homes[cv_test]]
        test_home_name = cv_test_home.index.values[0]
        out[test_home_name] = {}

        # Sum energy across the start..stop months to get the Y used to rank features
        Y = cv_train_home[['%s_%d' % (appliance, i) for i in range(start, stop)]].sum(axis=1).values
        forest = ExtraTreesRegressor(n_estimators=250, random_state=0)
        forest.fit(cv_train_home[feature_map[feature]], Y)
        importances = forest.feature_importances_
        indices = np.argsort(importances)[::-1]

        # Now vary K and the number of top-N features
        for K in range(K_min, K_max):
            out[test_home_name][K] = {}
            for top_n in range(F_min, F_max):
                out[test_home_name][K][top_n] = []
                top_n_features = cv_train_home[feature_map[feature]].columns[indices][:top_n]
                # Fit one KNN per month on the selected features
                for month in range(start, stop):
                    clf = KNeighborsRegressor(n_neighbors=K)
                    clf.fit(cv_train_home[top_n_features], cv_train_home['%s_%d' % (appliance, month)])
                    out[test_home_name][K][top_n].append(clf.predict(cv_test_home[top_n_features]))

    # Find the (K, top_n) combination with the best accuracy on the CV test homes
    accur = {}
    for K in range(K_min, K_max):
        accur[K] = {}
        for top_n in range(F_min, F_max):
            temp = {}
            for h in out.iterkeys():
                pred = pd.DataFrame(out[h][K][top_n]).T
                pred.index = [h]
                pred.columns = [['%s_%d' % (appliance, i) for i in range(start, stop)]]
                gt = appliance_df.ix[h][['%s_%d' % (appliance, i) for i in range(start, stop)]]
                error = (pred - gt).abs().div(gt).mul(100)
                mean_error = error.mean().mean()
                a = 100 - mean_error
                if a < 0:
                    a = 0
                temp[h] = a
            accur[K][top_n] = pd.Series(temp).mean()

    accur_df = pd.DataFrame(accur)
    accur_max = accur_df.max().max()
    max_ac_df = accur_df[accur_df == accur_max]
    F_best = cv_train_home[feature_map[feature]].columns[indices][:max_ac_df.mean(axis=1).dropna().index.values[0]].tolist()
    K_best = max_ac_df.mean().dropna().index.values[0]

    # Now predict for the held-out test home
    train_overall = appliance_df.ix[appliance_df[~appliance_df.index.isin([home])].index]
    test_overall = appliance_df[appliance_df.index.isin([home])]
    pred_test = {}
    gt_test = {}
    for month in range(start, stop):
        clf = KNeighborsRegressor(n_neighbors=K_best)
        clf.fit(train_overall[F_best], train_overall['%s_%d' % (appliance, month)])
        pred_test[month] = clf.predict(test_overall[F_best])
        gt_test[month] = test_overall['%s_%d' % (appliance, month)]

    json.dump({'f': F_best, 'k': K_best, 'accuracy': accur_max},
              open(os.path.expanduser("~/main-out-new-larger-num-homes/%d_%s_%s_%d.json"
                                      % (num_homes, appliance, feature, home)), "w"))

    pred_df = pd.DataFrame(pred_test)
    pred_mean = df.ix[train_homes][['%s_%d' % (appliance, month) for month in range(start, stop)]].mean()
    pred_df.index = [home]
    # gt_df = pd.DataFrame(gt_test)
    # error = (gt_df - pred_df).abs().div(gt_df).mul(100)
    # accuracy_test = 100 - error
    # accuracy_test[accuracy_test < 0] = 0
    gt_df = df.ix[home][['%s_%d' % (appliance, month) for month in range(start, stop)]]
    gt_df.index = pred_df.columns
    pred_mean.index = pred_df.columns
    # return accuracy_test.squeeze()
    return pred_df, pred_mean, gt_df
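The home-selection loop in _find_accuracy ranks samples against one centroid by taking a single column of transform(). The sort direction matters: ascending argsort picks the cluster's most central samples, while the [::-1] used above picks the farthest ones. A small sketch contrasting both directions (synthetic data; names are illustrative):

import numpy as np
from sklearn.cluster import KMeans

X = np.random.RandomState(42).rand(60, 4)
km = KMeans(n_clusters=5, random_state=42).fit(X)

dists = km.transform(X)  # (60, 5): column i = distance of every sample to centroid i
for i in range(5):
    closest = np.argsort(dists[:, i])[:3]          # most representative samples
    farthest = np.argsort(dists[:, i])[::-1][:3]   # what the [::-1] variant selects
    print("centroid %d: closest %s, farthest %s" % (i, closest, farthest))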
# assumes train_fd/train_label, test_fd/test_label and the feature matrix fn
# are defined upstream, and that dd is collections.defaultdict
rf = RFC(n_estimators=100, criterion='entropy')
svm = SVC(kernel='rbf', probability=True)
lr = LR()
bl = [rf, lr, svm]  # set of base learners
for b in bl:
    b.fit(train_fd, train_label)  # train each base classifier

test_fn = fn
label = test_label
class_ = np.unique(train_label)
n_class = 32 / 2
c = KMeans(init='k-means++', n_clusters=n_class, n_init=10)
c.fit(test_fn)
dist = np.sort(c.transform(test_fn))  # row-wise sort: column 0 is the distance to the nearest centroid

ex = dd(list)     # per cluster: [example id, distance to centroid] pairs
ex_id = dd(list)  # per cluster: example ids
for i, j, k in zip(c.labels_, xrange(len(test_fn)), dist):
    ex[i].append([j, k[0]])
    ex_id[i].append(int(j))
for i, j in ex.items():
    ex[i] = sorted(j, key=lambda x: x[-1])  # sort examples in each cluster by distance to centroid

nb_c = dd()
for exx in ex_id.values():
    exx = np.asarray(exx)
    for e in exx:
        nb_c[e] = exx[exx != e]  # for each example, its cluster neighbours

nb_f = [dd(), dd(), dd()]
for b, n in zip(bl, nb_f):
    preds = b.predict(test_fd)
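The np.sort idiom above keeps only column 0 of a row-wise sorted distance matrix, which is the distance to the nearest centroid; transform(X).min(axis=1) computes the same value without sorting each full row. A quick check on synthetic data:

import numpy as np
from sklearn.cluster import KMeans

X = np.random.RandomState(1).rand(50, 8)
km = KMeans(n_clusters=4, random_state=1).fit(X)

nearest_sorted = np.sort(km.transform(X), axis=1)[:, 0]  # the idiom above
nearest_min = km.transform(X).min(axis=1)                # the direct form
assert np.allclose(nearest_sorted, nearest_min)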
# In[25]:

data = df_clean_d.values
data

# # D) Plot and analysis of LifeMale and LifeFemale
# ## Application of KMeans to the data

# In[177]:

clust = KMeans(n_clusters=3, n_init=10, init='k-means++', verbose=0)
Ckm = clust.fit_predict(data[:, 3:5])   # compute cluster centers and predict the cluster index of each sample
data_d = clust.transform(data[:, 3:5])  # to get the distance from each sample to each cluster center

# In[179]:

color = ('g', 'b', 'r')
label = ('First cluster', 'Second cluster', 'Third cluster')
country0 = data[Ckm == 0, 0]
country1 = data[Ckm == 1, 0]
country2 = data[Ckm == 2, 0]

# ## Visualisation of the result of KMeans applied to lifeMale and lifeFemale

# In[339]:
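The cell above combines fit_predict (one call that fits and returns labels) with transform (distances). A tiny self-contained equivalent, with random 2-D data standing in for the life-expectancy columns:

import numpy as np
from sklearn.cluster import KMeans

data = np.random.RandomState(0).rand(40, 2)
clust = KMeans(n_clusters=3, n_init=10, init='k-means++')
Ckm = clust.fit_predict(data)   # fits and returns the cluster index per row
data_d = clust.transform(data)  # (40, 3) distances to the three centers
print(Ckm[:5])
print(data_d[:2])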
class ContentRecommend(object):
    create_date = datetime.utcnow()
    days = 15
    training_end = datetime.utcnow()
    db = None
    n_components = 20  # number of dimensions for TruncatedSVD
    account = ''
    svd = None
    normalizer = None
    svdX = None
    vectorizer = None  # was misspelled 'vectorizor', which the methods never referenced
    training_docs = None
    threshold = 0.25
    k_means = None
    sil_score = -1.0
    cluster_count = 0
    range_n_clusters = [3, 4, 5, 6, 7, 8]
    missionId = ''

    def __init__(self, mission_id, db_name='plover_development', db_port=27017, db_host='localhost'):
        self.missionId = mission_id
        config.LOGGER.info('Instantiating recommender')
        self.connect(db_name, self.missionId, db_port=db_port, db_host=db_host)
        config.LOGGER.debug("Loading NLTK stopword list for English")

    def connect(self, db_name="plover_development", mission_id="", db_port=27017, db_host='localhost'):
        config.LOGGER.info('Instantiating recommender object for mission %s', mission_id)
        config.LOGGER.debug('Using database %s, host %s and port %s', db_name, db_host, db_port)
        try:
            client = MongoClient(db_host, db_port)
            self.db = client[db_name]
            profile = self.db.socialProfile.find_one({'mission': ObjectId(self.missionId)})
            self.account = self.db.linkedAccount.find_one({'_id': profile['account']})
            if self.account is None:
                config.LOGGER.debug('No such account id')
            self.setup_training(days=30)
        except Exception as ex:
            config.LOGGER.error("Error %s opening mission _id=%s", ex.message, self.missionId)

    def get_updates(self, maximum=100, conditions={}):
        documents = []
        config.LOGGER.info('Getting timeline updates for mission %s', self.missionId)
        config.LOGGER.debug('  query condition: %s', json.dumps(conditions, default=json_util.default))
        try:
            if self.account is None:
                config.LOGGER.debug('No account id')
            else:
                projection = {'keywords': 1, 'text': 1, 'externalID': 1, 'postTime': 1,
                              'sender': 1, 'quotedStatus': 1}
                updates = self.db.statusUpdate.find(conditions, projection) \
                    .sort('postTime', pymongo.DESCENDING).limit(maximum)
                for tw in updates:
                    if 'quotedStatus' in tw:
                        tw['text'] += " QT " + tw['quotedStatus']['text']
                        for keyword in tw['quotedStatus']['keywords']:
                            tw['keywords'].append(keyword)
                    smu = self.db.socialMediaUser.find_one({'_id': tw['sender']}, {'screenNameLC': 1})
                    if smu is not None:
                        tw['keywords'].append(smu['screenNameLC'])
                    documents.append(tw)
        except Exception as ex:
            config.LOGGER.error("Error %s getting updates from timeline for mission %s", ex.message, self.missionId)
        config.LOGGER.debug('Found %d updates in timeline', len(documents))
        return documents

    def topics(self, n_components, n_out=7, n_weight=5, topic=None):
        config.LOGGER.info('Get topics timeline for %s', self.account['profile']['preferredUsername'])
        results = []
        terms = self.vectorizer.get_feature_names()
        if topic is None:
            for k in range(n_components):
                idx = {i: abs(j) for i, j in enumerate(self.svd.components_[k])}
                sorted_idx = sorted(idx.items(), key=operator.itemgetter(1), reverse=True)
                weight = np.mean([item[1] for item in sorted_idx[0:n_weight]])
                for item in sorted_idx[0:n_out - 1]:
                    results.append({'term': terms[item[0]], 'weight': item[1]})
        else:
            m = max(self.svd.components_[topic])
            idx = {i: abs(j) for i, j in enumerate(self.svd.components_[topic])}  # was bare 'svd.', a NameError
            sorted_idx = sorted(idx.items(), key=operator.itemgetter(1), reverse=True)
            weight = np.mean([item[1] for item in sorted_idx[0:n_weight]])
            for item in sorted_idx[0:n_out - 1]:
                results.append({'term': terms[item[0]], 'weight': item[1]})
        return results  # the original fell off the end without returning

    def get_componentCount(self, min_weight=.05):  # renamed from 'min', which shadowed the builtin
        count = 0
        for k in range(len(self.svd.components_)):
            idx = {i: abs(j) for i, j in enumerate(self.svd.components_[k])}
            sorted_idx = sorted(idx.items(), key=operator.itemgetter(1), reverse=True)
            kcount = 0
            for entry in sorted_idx:
                if entry[1] > min_weight:
                    kcount += 1
                else:
                    break
            if kcount > count:
                count = kcount
        return count

    def setup_training(self, end_time=None, days=15, maximum=1000):
        try:
            # a default of datetime.utcnow() would be evaluated once at import
            # time, so resolve it at call time instead
            if end_time is None:
                end_time = datetime.utcnow()
            start = end_time - timedelta(minutes=days * 24 * 60)
            condition = {'missions': ObjectId(self.missionId),
                         '$or': [{'favorited': True}, {'sentByMe': True}],
                         'postTime': {'$gt': start, '$lte': end_time},
                         '$nor': [{'keywords': {'$exists': False}},
                                  {'keywords': {'$size': 1}},
                                  {'keywords': {'$size': 2}}]}
            self.training_docs = self.get_updates(conditions=condition, maximum=10000)
            config.LOGGER.info('Train model for %s', self.account['profile']['preferredUsername'])
            if len(self.training_docs) > 50:
                config.LOGGER.debug('Found %d updates for training from %s', len(self.training_docs),
                                    self.account['profile']['preferredUsername'])
                self.training_end = end_time
                self.days = days
                trainingRaw = [' '.join(doc['keywords']) for doc in self.training_docs]
                # trainingRaw = [tw['text'] for tw in self.training_docs]
                self.vectorizer = TfidfVectorizer(max_df=0.6, min_df=2, max_features=500,
                                                  use_idf=True, strip_accents='ascii')
                X = self.vectorizer.fit_transform(trainingRaw)
                if X.shape[1] <= self.n_components:
                    self.n_components = X.shape[1] - 1
                config.LOGGER.debug('%d components found for SVD', self.n_components)
                self.svd = TruncatedSVD(self.n_components, algorithm='arpack')
                self.svdX = self.svd.fit_transform(X)
                # self.n_components = self.get_componentCount(self.threshold)
                # self.svd = TruncatedSVD(self.n_components, random_state=10)
                # self.svdX = self.svd.fit_transform(X)
                self.normalizer = Normalizer().fit(self.svdX)
                self.svdX = self.normalizer.transform(self.svdX)

                # Clustering: pick the cluster count with the best silhouette score
                config.LOGGER.debug('Determining cluster count')
                for n_clusters in self.range_n_clusters:
                    self.k_means = KMeans(n_clusters=n_clusters, init='k-means++', max_iter=300,
                                          n_init=10, verbose=False, random_state=10)
                    self.k_means.fit(self.svdX)
                    score = metrics.silhouette_score(self.svdX, self.k_means.labels_)
                    if score > self.sil_score:
                        self.sil_score = score
                        self.cluster_count = n_clusters
                config.LOGGER.debug('Cluster count is %d, Silhouette Coefficient is %0.3f',
                                    self.cluster_count, self.sil_score)
                self.k_means = KMeans(n_clusters=self.cluster_count, init='k-means++', max_iter=100,
                                      n_init=4, verbose=False, random_state=10)
                self.k_means.fit(self.svdX)

                # now get the top tweets for each cluster
                x_transform = self.k_means.transform(self.svdX)
                x_predict = self.k_means.predict(self.svdX)
                self.all_cluster_dist = []
                for i in range(self.cluster_count):
                    cluster_distance = []
                    for j in range(len(x_predict)):
                        if x_predict[j] == i and sum(self.svdX[j]) != 0.0:
                            cluster_distance.append(
                                {'index': j, 'cluster': i,
                                 'dist': np.sqrt(sum([y * y for y in x_transform[j]]))})
                    newlist = sorted(cluster_distance, key=operator.itemgetter('dist'), reverse=False)
                    self.all_cluster_dist.append(newlist)
                # now verify this
                self.self_test()
            else:
                config.LOGGER.info('Too few training updates from user timeline')
                self.svd = None
        except Exception as ex:
            config.LOGGER.exception("Error %s computing SVD and kmeans from user history for mission %s",
                                    ex.message, self.missionId)

    def self_test(self):
        try:
            config.LOGGER.info("Beginning self test. Better if it were cross-validation, but there is not enough data for that")
            results = self.find_recommendations(self.training_docs, top=10, quality=.001, min_examples=1)
            config.LOGGER.info("Self test found %d recommendations", len(results))
            for rec in results:
                if rec['text'] != rec['samples_svd'][0]:
                    config.LOGGER.error("Error training SVD for mission %s in tweet %s", self.missionId, rec['text'])
        except Exception as ex:
            # the original format string had one placeholder for two arguments
            config.LOGGER.error("Error %s in self test building training for mission %s", ex.message, self.missionId)

    def find_recommendations(self, tweets=[], top=10, quality=.1, min_examples=1):
        working_list = []
        result_list = []
        try:
            config.LOGGER.info('Generating content recommendations for user %s',
                               self.account['profile']['preferredUsername'])
            if self.svd is not None:
                if len(tweets) < top:
                    config.LOGGER.debug("Too few tweets passed for recommendation")
                    return []
                # tokenized_tweets = [' '.join(doc['newKeys']) for doc in tweets]
                # tweetText = [tw['text'] for tw in tweets]
                tweetText = [' '.join(tw['keywords']) for tw in tweets]
                Y = self.vectorizer.transform(tweetText)
                svdY = self.svd.transform(Y)
                svdY = self.normalizer.transform(svdY)
                y_transform = self.k_means.transform(svdY)
                # terms = self.vectorizer.get_feature_names()
                selected_updates = []
                y_predict = self.k_means.predict(svdY)
                for i in range(self.cluster_count):
                    cluster_distance = []
                    for j in range(len(y_predict)):
                        if y_predict[j] == i and sum(svdY[j]) != 0.0:
                            cluster_distance.append(
                                {'index': j, 'cluster': i,
                                 'dist': np.sqrt(sum([y * y for y in y_transform[j]]))})
                    newlist = sorted(cluster_distance, key=operator.itemgetter('dist'), reverse=False)
                    selected_updates.append(newlist)
                # interleave the clusters and take the closest candidates overall
                temp = [entry for entry in it.izip_longest(*selected_updates)]
                clean_list = filter(lambda x: x is not None,
                                    [entry for tuple_ in temp for entry in tuple_])[0:top]
                clean_list_svdY = [svdY[entry['index']] for entry in clean_list]
                config.LOGGER.debug("Found %i possible matches in topic clusters" % len(clean_list_svdY))
                neigh = NearestNeighbors()
                neigh.fit(self.svdX)
                if len(clean_list_svdY) > 0:
                    distances, svd_neighbors = neigh.radius_neighbors(X=clean_list_svdY, radius=quality)
                else:
                    svd_neighbors = []
                examples = []
                for idx, entry in enumerate(svd_neighbors):
                    if len(entry) >= min_examples:
                        config.LOGGER.debug("Suggested tweet has %d examples" % len(entry))
                        original = tweets[clean_list[idx]['index']]['text']
                        for jdx, neighbor in enumerate(entry):
                            examples.append({'text': self.training_docs[neighbor]['text'],
                                             'dist': distances[idx][jdx]})
                        sorted_examples = sorted(examples, key=operator.itemgetter('dist'), reverse=False)
                        min_examples = [item['text'] for item in sorted_examples][:min_examples]
                        t1 = self.training_docs[self.all_cluster_dist[clean_list[idx]['cluster']][0]['index']]['text']
                        t2 = self.training_docs[self.all_cluster_dist[clean_list[idx]['cluster']][1]['index']]['text']
                        working_list.append({"dist": sorted_examples[0]['dist'],
                                             "text": original,
                                             "id": str(tweets[clean_list[idx]['index']]['_id']),
                                             "sender": str(tweets[clean_list[idx]['index']]['sender']),
                                             'samples_svd': min_examples,
                                             'samples_cluster': [t1, t2]})
                result_list = sorted(working_list, key=operator.itemgetter('dist'), reverse=False)
            return result_list[:top]
        except Exception as ex:
            config.LOGGER.error("Error %s computing recommendations for mission %s", ex.message, self.missionId)
            return []

    def recommend_from_timeline(self, end_time=None, minutes_prior=15, top=10, quality=.1, min_examples=1):
        try:
            if end_time is None:  # same call-time default as setup_training
                end_time = datetime.utcnow()
            config.LOGGER.info("generating content recommendation from timeline for %s" %
                               self.account['profile']['preferredUsername'])
            results = []
            if self.svd is not None:
                start = end_time - timedelta(minutes=minutes_prior)
                condition = {'missions': ObjectId(self.missionId),
                             '$or': [{'favorited': False}, {'sentByMe': False},
                                     {'mentionsMe': False}, {'retweetOfMe': False}],
                             'postTime': {'$gt': start, '$lte': end_time},
                             '$nor': [{'keywords': {'$exists': False}},
                                      {'keywords': {'$size': 1}},
                                      {'keywords': {'$size': 2}}]}
                tweets = self.get_updates(maximum=10000, conditions=condition)
                config.LOGGER.debug('%d updates from account timeline read from database', len(tweets))
                results = self.find_recommendations(tweets, top=top, quality=quality, min_examples=min_examples)
                # the original logged len(tweets) here, which is the candidate count, not the result count
                config.LOGGER.debug('%d recommendations found for mission %s', len(results), self.missionId)
            return results[:top]
        except Exception as ex:
            config.LOGGER.error("Error %s computing recommendations for mission %s", ex.message, self.missionId)
            return []
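setup_training picks the cluster count by refitting KMeans for each candidate k and keeping the best silhouette score. A compact standalone sketch of that search (synthetic data; names are illustrative):

import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

X = np.random.RandomState(10).rand(200, 20)
best_k, best_score = None, -1.0
for k in [3, 4, 5, 6, 7, 8]:
    km = KMeans(n_clusters=k, init='k-means++', n_init=10, random_state=10).fit(X)
    score = silhouette_score(X, km.labels_)
    if score > best_score:
        best_k, best_score = k, score
km = KMeans(n_clusters=best_k, random_state=10).fit(X)  # final model at the chosen k
print("k=%d silhouette=%.3f" % (best_k, best_score))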
# # Reduce dimensions and apply clustering

# In[6]:

pca = PCA(n_components=50)
X = pca.fit_transform(array(image_features))

# In[7]:

kmeans = KMeans(n_clusters=num_clusters, n_init=100, n_jobs=1)
kmeans.fit(X)
clusters = kmeans.predict(X)          # cluster label per image
clusters_space = kmeans.transform(X)  # distance of each image to every centroid

# In[8]:

image_paths_hack = list(hog_features["image_paths"])
image_paths_rel = [image_path_hack.split("/")[-2] + "/" + image_path_hack.split("/")[-1]
                   for image_path_hack in image_paths_hack]

# In[9]:

res_df = pd.DataFrame({'file_names': image_paths_rel})

# In[10]:
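One convenient use of the transform() output in this PCA-then-cluster setup is picking a representative image per cluster: the row minimizing column i is the most central member of cluster i. A sketch with random features standing in for the HOG descriptors:

import numpy as np
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

feats = np.random.RandomState(0).rand(300, 512)   # stand-in for the image features
X = PCA(n_components=50).fit_transform(feats)
km = KMeans(n_clusters=8, n_init=10, random_state=0).fit(X)

representatives = km.transform(X).argmin(axis=0)  # one row index per cluster
print(representatives)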
def cluster_driver(a_driver):
    # print a_driver['DStats']
    # sys.stdout = open('a_projpath' + 'output.txt', 'w')

    X = StandardScaler().fit_transform(a_driver['DStats'])
    # features: ['AvgDistDel', 'AvgACosDel', 'SDevDistDel', 'SDevACosDel',
    #            'TotalTime', 'SkewDistDel', 'SkewACosDel']

    pca = PCA(n_components=3)
    Xpca = pca.fit(X).transform(X)
    if plotflag:
        fig = scatterplot_matrix(np.transpose(Xpca),
                                 ['PC1', 'PC2', 'PC3'],
                                 linestyle='none', marker='o', color='black', mfc='none')
        fig.suptitle('Simple Scatterplot Matrix')
        plt.show()

    db = KMeans(n_clusters=3, n_jobs=-1).fit(Xpca)  # named 'db' as a leftover from an earlier DBSCAN version
    minDist = db.transform(Xpca).min(axis=1)        # distance of each point to its nearest cluster center
    zMinDist = sp.stats.mstats.zscore(minDist)      # z-score, i.e. normalize the distances
    # zMinDist = abs(sp.stats.mstats.zscore(minDist))
    # plt.hist(zMinDist, range=(-3, 3))
    # print sp.stats.kstest(zMinDist, "expon", (0, .715))
    # print sp.stats.kstest(zMinDist, "expon", (0, .72))

    tef = sp.stats.norm.fit(zMinDist)
    # tef = sp.stats.expon.fit(zMinDist)
    # plt.hist(sp.stats.expon.rvs(0, tef[1], size=200))

    # find the probability of each normalized distance under the fitted distribution
    # probZMinDist = sp.stats.expon.pdf(zMinDist.round(2), loc=tef[0], scale=tef[1])
    probZMinDist = sp.stats.expon.pdf(zMinDist, scale=1 / tef[1])

    # XpcaMean = Xpca.mean(axis=0)
    # XpcaDistFromMean = np.array([], float)
    # for i in range(0, len(Xpca)):
    #     XpcaDistFromMean = np.append(XpcaDistFromMean, np.linalg.norm(Xpca[i] - XpcaMean))
    # temp = (1 - sp.stats.norm.cdf(sp.stats.mstats.zscore(XpcaDistFromMean))).round(2)
    # plt.hist(temp, range=(0, 1))
    # plt.plot(temp, 'b^')
    # plt.show()

    # db = DBSCAN(eps=0.7).fit(Xpca)
    # db = AgglomerativeClustering(n_clusters=5).fit(Xpca)
    # core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    # core_samples_mask[db.core_sample_indices_] = True

    labels = db.labels_
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    print('Estimated number of clusters: %d' % n_clusters_)
    # print 'Count of Predicts::', len(X)
    # print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(Xpca, labels))
    print("%% Variance Explained: %0.3f" % sum(pca.explained_variance_ratio_))
    return probZMinDist
## KMeans z-scores
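The driver-clustering snippet above turns KMeans.transform into an anomaly score: the z-scored distance to the nearest centroid. A minimal standalone sketch of that idea (synthetic data; scipy.stats.zscore in place of the mstats variant):

import numpy as np
from scipy import stats
from sklearn.cluster import KMeans

X = np.random.RandomState(0).randn(200, 3)
km = KMeans(n_clusters=3, random_state=0).fit(X)

min_dist = km.transform(X).min(axis=1)  # distance to the nearest centroid
z = stats.zscore(min_dist)              # normalize the distances
outliers = np.where(z > 2.0)[0]         # flag points unusually far from every centroid
print(len(outliers))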
print("\nTopics in LDA model:") tf_feature_names = tf_vectorizer.get_feature_names() print_top_words(lda, tf_feature_names, n_top_words) #number of clusters clusters = 2 km = KMeans(n_clusters=clusters, init='k-means++', max_iter=100, n_init=10,verbose=opts.verbose) result = csr_matrix(result) km.fit(result) if not os.path.exists('../../data/result'): os.mkdir('../../data/result') dest = '../../data/result/raw_result' if not os.path.exists(dest): os.mkdir(dest) else: shutil.rmtree(dest) os.mkdir(dest) #order_centroids = km.cluster_centers_.argsort()[:, ::-1] for i in range(clusters): print("Cluster %d:" % i, end='') d = km.transform(result)[:, i] ind = np.argsort(d)[::-1][:10] f = open(dest +'/cluster' + str(i),'w') for index in ind: print(' %s' % index, end='\n') f.write(dataset.filenames[index] + '\n')
"cc_team_score_frame": cc_team_score_frame, "to_team_score_frame": to_team_score_frame, } ) return df_result # return (cc_score_frame,to_score_frame,to_team_score_frame) if __name__ == "__main__": t = main(50000) # t = main() t = t.fillna(0) kclust = KMeans(n_clusters=4) kclust.fit(t) clustered = kclust.transform(t) X = t fignum = 1 fig = plt.figure(fignum, figsize=(4, 3)) plt.clf() ax = Axes3D(fig, rect=[0, 0, 0.95, 1], elev=48, azim=134) plt.cla() labels = kclust.labels_ ax.scatter(X[:, 3], X[:, 0], X[:, 2], c=labels.astype(np.float)) ax.w_xaxis.set_ticklabels([]) ax.w_yaxis.set_ticklabels([]) ax.w_zaxis.set_ticklabels([])