def test_vbgmm_no_modify_alpha():
    """Check that fitting a VBGMM does not mutate the `alpha` the user passed.

    The constructor must store `alpha` untouched; only after `fit` is the
    normalized Dirichlet weight exposed as `alpha_` (= alpha / n_components).
    """
    user_alpha = 2.
    n_comp = 3
    X, _unused_labels = make_blobs(random_state=1)
    estimator = VBGMM(n_components=n_comp, alpha=user_alpha, n_iter=1)
    # Constructor leaves the public hyper-parameter exactly as given.
    assert_equal(estimator.alpha, user_alpha)
    # fit() returns self; the fitted attribute is the per-component share.
    fitted = estimator.fit(X)
    assert_equal(fitted.alpha_, float(user_alpha) / n_comp)
def _fit_vbgmm(self, x):
    """Fit one VBGMM per (component count, repeat) pair and record results.

    For every candidate component count in ``self.crange`` the model is fit
    ``self.repeats`` times; labels, means, total log-likelihood and a
    goodness-of-fit score are stored per run at a flat index
    ``idx = c * self.repeats + r``.

    :param x: data matrix to cluster — presumably (n_samples, n_features);
        TODO confirm against caller.
    """
    # clustering
    for c in xrange(len(self.crange)):
        k = self.crange[c]  # candidate number of mixture components
        for r in xrange(self.repeats):
            # info — trailing comma keeps the debug line open (no newline)
            if self.debug is True:
                print '\t[%s][c:%d][r:%d]' % (self.clus_type,
                                              self.crange[c], r + 1),
            # flat index for this (component count, repeat) run
            idx = c * self.repeats + r

            # fit and evaluate model
            # Only forward hyper-parameters the caller actually supplied;
            # otherwise VBGMM's own defaults apply.
            model_kwargs = {}
            if 'alpha' in self.clus_kwargs:
                model_kwargs.update(alpha=self.clus_kwargs['alpha'])
            if 'conv_thresh' in self.clus_kwargs:
                model_kwargs.update(thresh=self.clus_kwargs['conv_thresh'])
            model = VBGMM(n_components=k, covariance_type=self.cvtype,
                          **model_kwargs)
            # NOTE(review): n_features is set directly on the estimator —
            # presumably required by this VBGMM version before fit; confirm.
            model.n_features = self.input_dim
            fit_kwargs = {}
            if 'max_iter' in self.clus_kwargs:
                fit_kwargs.update(n_iter=self.clus_kwargs['max_iter'])
            # 'wmc' = update/init weights, means and covariances
            model.fit(x, params='wmc', init_params='wmc', **fit_kwargs)
            self._labels[idx] = model.predict(x)
            self._parameters[idx] = model.means
            # score() returns per-sample log-likelihoods; sum for the total
            self._ll[idx] = model.score(x).sum()

            # evaluate goodness of fit
            self._gof[idx] = self.gof(x, self._ll[idx], k)

            # debug
            if self.debug is True:
                print self._gof[idx], model.converged_
def _fit_vbgmm(self, x): # clustering for c in xrange(len(self.crange)): k = self.crange[c] for r in xrange(self.repeats): # info if self.debug is True: print '\t[%s][c:%d][r:%d]' % ( self.clus_type, self.crange[c], r + 1), idx = c * self.repeats + r # fit and evaluate model model_kwargs = {} if 'alpha' in self.clus_kwargs: model_kwargs.update(alpha=self.clus_kwargs['alpha']) if 'conv_thresh' in self.clus_kwargs: model_kwargs.update(thresh=self.clus_kwargs['conv_thresh']) model = VBGMM(n_components=k, covariance_type=self.cvtype, **model_kwargs) model.n_features = self.input_dim fit_kwargs = {} if 'max_iter' in self.clus_kwargs: fit_kwargs.update(n_iter=self.clus_kwargs['max_iter']) model.fit(x, params='wmc', init_params='wmc', **fit_kwargs) self._labels[idx] = model.predict(x) self._parameters[idx] = model.means self._ll[idx] = model.score(x).sum() # evaluate goodness of fit self._gof[idx] = self.gof(x, self._ll[idx], k) # debug if self.debug is True: print self._gof[idx], model.converged_
def main(method,cluster_num=30,alpha=.5): f ='/Users/davidgreenfield/Downloads/features_csv_tmp.csv' #f ='/Users/davidgreenfield/Downloads/features_f500.csv' cols=range(1,4096) feats =np.loadtxt(open(f,"rb"),delimiter=",",skiprows=1,usecols=(cols)) asins = np.loadtxt(open(f,"rb"),delimiter=",",skiprows=1,usecols=([0]),dtype=str) if method == 'kmeans': k_means=cluster.KMeans(n_clusters=cluster_num) k_means.fit(feats) y = k_means.labels_ if MAKE_GRAPH==1: print "hello 1" create_graph(k_means) elif method == 'GMM_VB': gmm_vb = VBGMM.fit(feats,n_components=50,alpha=.5) y = gmm_vb.predict(feats) cluster_no = len(np.unique(y)) elif method == 'GMM_DP': gmm_dp = DPGMM(n_components=50,alpha=alpha) gmm_dp.fit(feats) y = gmm_dp.predict(feats) cluster_no = len(np.unique(y)) clusters=[] groups={} data=load_data('./data/boots_aws.csv') for i in range(0,cluster_num): groups[i]=np.where(y==i) ids=asins[groups[i]] clusters.append(ids) links=[data[x]['url'] for x in ids] create_html(links,"templates/groups/group"+str(i)+".html") output_clusters(clusters,"outputs/clusters.csv")
def do_model(self, **kwds):
    """Build a non-verbose VBGMM, forwarding all keyword arguments.

    Note: passing ``verbose`` in ``kwds`` raises TypeError (duplicate
    keyword), which is the intended guard — verbosity is forced off here.
    """
    model = VBGMM(verbose=False, **kwds)
    return model
# List every sklearn estimator exposing a `predict_proba` attribute,
# then hand-pick the classifiers of interest below.
estimators = all_estimators()
proba_capable = (est_name for est_name, est_cls in estimators
                 if hasattr(est_cls, 'predict_proba'))
for est_name in proba_capable:
    print(est_name)

# Curated selection: maps an estimator *instance* to its display label.
estimators = {
    AdaBoostClassifier(): 'AdaBoost',
    BayesianGaussianMixture(): 'BayesianGaussianMixture',
    BernoulliNB(): 'BernoulliNB',
    DPGMM(): 'DPGMM',
    ExtraTreesClassifier(): 'ExtraTreesClassifier',
    GMM(): 'GMM',
    GaussianNB(): 'GaussianNB',
    GaussianProcessClassifier(): 'GaussianProcessClassifier',
    GradientBoostingClassifier(): 'GradientBoostingClassifier',
    KNeighborsClassifier(): 'KNeighborsClassifier',
    LabelPropagation(): 'LabelPropagation',
    LabelSpreading(): 'LabelSpreading',
    LinearDiscriminantAnalysis(): 'LinearDiscriminantAnalysis',
    LogisticRegression(): 'LogisticRegression',
    MLPClassifier(): 'MLPClassifier',
    NuSVC(): 'NuSVC',
    QuadraticDiscriminantAnalysis(): 'QuadraticDiscriminantAnalysis',
    RandomForestClassifier(): 'RandomForestClassifier',
    SGDClassifier(): 'SGDClassifier',
    SVC(): 'SVC',
    VBGMM(): 'VBGMM'
}
def run_selected_classifiers(X_train, X_test, y_train, y_test,
                             print_output_scores_to_csv=False,
                             output_scores_csv_file_suffix='',
                             print_only_table=False):
    """Fit and evaluate a curated list of sklearn classifiers.

    NOTE(review): this function was previously also named
    ``run_all_classifiers`` and was silently shadowed (dead code) by the
    second definition of that name below; it has been renamed so both are
    callable. The module-level name ``run_all_classifiers`` still resolves
    to the second definition, as it did before.

    Args:
        X_train, X_test, y_train, y_test: The train and test datasets.
        print_output_scores_to_csv: If True the Precision, Recall, F1-Score
            and Support for both classes will be printed to a file with the
            current date and time.
        output_scores_csv_file_suffix: Suffix added to the csv file name
            just before the .csv extension, describing the run.
        print_only_table: Forwarded to fit_predict_plot.

    Returns:
        Output scores dataset.
    """
    assert isinstance(X_train, pd.core.frame.DataFrame)
    assert isinstance(X_test, pd.core.frame.DataFrame)
    assert isinstance(y_train, pd.core.frame.Series)
    assert isinstance(y_test, pd.core.frame.Series)
    assert isinstance(print_output_scores_to_csv, bool)
    assert isinstance(output_scores_csv_file_suffix, object)

    import time

    # https://stackoverflow.com/questions/42160313/how-to-list-all-classification-regression-clustering-algorithms-in-scikit-learn
    #from sklearn.utils.testing import all_estimators
    #estimators = all_estimators()
    #for name, class_ in estimators:
    #    log_print(name)

    from sklearn.calibration import CalibratedClassifierCV
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
    from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
    from sklearn.ensemble import AdaBoostClassifier
    from sklearn.ensemble import BaggingClassifier
    from sklearn.ensemble import ExtraTreesClassifier
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.gaussian_process import GaussianProcessClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.linear_model import LogisticRegressionCV
    from sklearn.linear_model import SGDClassifier
    from sklearn.mixture import BayesianGaussianMixture
    from sklearn.mixture import DPGMM
    from sklearn.mixture import GaussianMixture
    from sklearn.mixture import GMM
    from sklearn.mixture import VBGMM
    from sklearn.naive_bayes import BernoulliNB
    from sklearn.naive_bayes import GaussianNB
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.neural_network import MLPClassifier
    from sklearn.semi_supervised import LabelPropagation
    from sklearn.semi_supervised import LabelSpreading
    from sklearn.svm import SVC
    from sklearn.tree import DecisionTreeClassifier
    #from xgboost import XGBClassifier

    models = []
    models.append(('AdaBoostClassifier', AdaBoostClassifier()))
    models.append(('BaggingClassifier', BaggingClassifier()))
    models.append(('BayesianGaussianMixture', BayesianGaussianMixture()))
    models.append(('BernoulliNB', BernoulliNB()))
    models.append(('CalibratedClassifierCV', CalibratedClassifierCV()))
    models.append(('DPGMM', DPGMM()))
    models.append(('DecisionTreeClassifier', DecisionTreeClassifier(random_state=SEED)))
    models.append(('ExtraTreesClassifier', ExtraTreesClassifier(random_state=SEED)))
    models.append(('GMM', GMM()))
    models.append(('GaussianMixture', GaussianMixture()))
    models.append(('GaussianNB', GaussianNB()))
    models.append(('GaussianProcessClassifier', GaussianProcessClassifier()))
    models.append(('GradientBoostingClassifier', GradientBoostingClassifier()))
    models.append(('KNeighborsClassifier', KNeighborsClassifier()))
    models.append(('LabelPropagation', LabelPropagation()))
    models.append(('LabelSpreading', LabelSpreading()))
    models.append(('LinearDiscriminantAnalysis', LinearDiscriminantAnalysis()))
    models.append(('LogisticRegression', LogisticRegression()))
    models.append(('LogisticRegressionCV', LogisticRegressionCV()))
    models.append(('MLPClassifier', MLPClassifier()))
    #models.append(('MultinomialNB', MultinomialNB()))
    #models.append(('NuSVC', NuSVC()))
    models.append(('QuadraticDiscriminantAnalysis', QuadraticDiscriminantAnalysis()))
    models.append(('RandomForestClassifier', RandomForestClassifier(random_state=SEED)))
    models.append(('SGDClassifier', SGDClassifier()))
    models.append(('SVC', SVC()))
    models.append(('VBGMM', VBGMM()))
    #models.append(('XGBClassifier', XGBClassifier()))

    output_scores_df = fit_predict_plot(X_train, X_test, y_train, y_test,
                                        models, print_only_table)

    if print_output_scores_to_csv:
        # BUG FIX: a closing parenthesis was missing here, making the
        # whole module a syntax error.
        output_scores_df.to_csv(time.strftime(
            'output_scores' + str(output_scores_csv_file_suffix) + '.csv'))

    return output_scores_df

def run_all_classifiers(X_train, X_test, y_train, y_test, print_details=True):
    """Run all sklearn classifiers (via ``all_estimators``).

    Args:
        X_train, X_test, y_train, y_test: The train and test datasets.
        print_details: if True, print details of all models and save a csv
            table; if False, print only a table summarizing the models.

    Returns:
        Output scores dataset.
    """
    assert isinstance(X_train, pd.core.frame.DataFrame)
    assert isinstance(X_test, pd.core.frame.DataFrame)
    assert isinstance(y_train, pd.core.frame.Series)
    assert isinstance(y_test, pd.core.frame.Series)
    assert isinstance(print_details, bool)

    log_method_execution_time(log_funcname())

    from sklearn.utils.testing import all_estimators
    import sklearn.metrics
    import time
    from src.util.acq_util import RANDOM_SEED

    # https://stackoverflow.com/questions/42160313/how-to-list-all-classification-regression-clustering-algorithms-in-scikit-learn
    #from xgboost import XGBClassifier
    #models.append(('XGBClassifier', XGBClassifier()))

    models = all_estimators(type_filter='classifier')
    output_scores_dataset = pd.DataFrame(
        index=['Precision 0', 'Recall 0', 'F1-Score 0', 'Support 0',
               'Precision 1', 'Recall 1', 'F1-Score 1', 'Support 1'],
        columns=list(zip(*models))[0])

    for name, model in models:
        if print_details is True:
            print('------------------------------------------------------------------------------')
            print(name)
            print('------------------------------------------------------------------------------')

        # Skip estimators known to fail or be unusable on this data.
        if (name == 'MultinomialNB' or name == 'NuSVC'
                or name == 'RadiusNeighborsClassifier'
                or name == 'GaussianProcessClassifier'):
            continue

        model = model()
        if 'random_state' in model.get_params():
            model.random_state = SEED

        # Fitting the model.
        model.fit(X_train, y_train)

        # Measuring accuracy.
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)

        output_scores_dataset = class_compute_accuracy(
            y_train, y_train_pred, output_scores_dataset,
            ['Accuracy on the train set', name], print_details)
        output_scores_dataset = class_compute_accuracy(
            y_test, y_test_pred, output_scores_dataset,
            ['Accuracy on the test set', name], print_details)

        # Plotting confusion matrix.
        output_scores_dataset = class_compute_plot_confusion_matrix(
            y_test, y_test_pred, output_scores_dataset, name, print_details)

        # Showing classification report.
        if print_details is True:
            print(sklearn.metrics.classification_report(y_test, y_test_pred))

        # Printing scores to output dataset.
        output_scores_dataset = class_compute_recall_precision_f1(
            y_test, y_test_pred, output_scores_dataset, name)

    # idxmax/apply with axis=1 find the best value across classifiers per row.
    output_scores_dataset['Max Value'] = output_scores_dataset.apply(max, axis=1)
    #output_scores_dataset['Max Classifier'] = output_scores_dataset.idxmax(axis=1)

    if print_details is True:
        output_scores_dataset.to_csv('output_scores' + '.csv')

    return output_scores_dataset

def train_test_split_for_classification(dataset, label, test_size, random_state=SEED):
    """Select X and y (the column named ``label``) and split train/test.

    Args:
        dataset: input DataFrame containing features and the label column.
        label: name of the label column.
        test_size: fraction of the data to reserve for the test split.
        random_state: seed for the stratified split.

    Returns:
        (X_train, X_test, y_train, y_test)
    """
    from sklearn.model_selection import train_test_split

    assert isinstance(dataset, pd.core.frame.DataFrame)
    assert isinstance(test_size, float)
    assert isinstance(random_state, int)

    X = dataset.loc[:, dataset.columns != label]
    # BUG FIX: previously read dataset[g_label] — an undefined global —
    # instead of the `label` parameter.
    y = dataset[label]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y)
    log_print('X_train: {}'.format(X_train.shape))
    log_print('y_train: {}'.format(y_train.shape))
    log_print('X_test: {}'.format(X_test.shape))
    log_print('y_test: {}'.format(y_test.shape))
    return(X_train, X_test, y_train, y_test)