Example #1
# Imports needed to run this snippet (VBGMM only exists in scikit-learn < 0.20,
# and assert_equal lived in sklearn.utils.testing in releases of that era):
from sklearn.datasets import make_blobs
from sklearn.mixture import VBGMM
from sklearn.utils.testing import assert_equal


def test_vbgmm_no_modify_alpha():
    # Fitting must not mutate the user-supplied alpha; the derived
    # Dirichlet prior is stored separately as alpha_.
    alpha = 2.
    n_components = 3
    X, y = make_blobs(random_state=1)
    vbgmm = VBGMM(n_components=n_components, alpha=alpha, n_iter=1)
    assert_equal(vbgmm.alpha, alpha)
    assert_equal(vbgmm.fit(X).alpha_, float(alpha) / n_components)
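VBGMM was deprecated in scikit-learn 0.18 and removed in 0.20. A minimal sketch of the analogous check against the modern BayesianGaussianMixture follows; note that, on my reading of the current API (this is not part of the original example), the fitted weight_concentration_prior_ equals the value passed in rather than being divided by n_components as VBGMM's alpha_ was:

# Sketch only: BayesianGaussianMixture is the modern replacement for VBGMM.
from sklearn.datasets import make_blobs
from sklearn.mixture import BayesianGaussianMixture


def test_bgmm_no_modify_prior():
    prior = 2.0
    X, _ = make_blobs(random_state=1)
    bgmm = BayesianGaussianMixture(n_components=3,
                                   weight_concentration_prior=prior,
                                   max_iter=1)  # may emit a ConvergenceWarning
    bgmm.fit(X)
    # The constructor argument is never mutated; the checked prior is stored
    # separately on the fitted attribute.
    assert bgmm.weight_concentration_prior == prior
    assert bgmm.weight_concentration_prior_ == prior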
Example #2
    def _fit_vbgmm(self, x):
        # clustering: sweep the candidate model orders in self.crange,
        # repeating each fit self.repeats times
        for c in range(len(self.crange)):
            k = self.crange[c]
            for r in range(self.repeats):
                # info
                if self.debug is True:
                    print('\t[%s][c:%d][r:%d]' % (self.clus_type,
                                                  self.crange[c], r + 1),
                          end=' ')
                idx = c * self.repeats + r

                # fit and evaluate model
                model_kwargs = {}
                if 'alpha' in self.clus_kwargs:
                    model_kwargs.update(alpha=self.clus_kwargs['alpha'])
                if 'conv_thresh' in self.clus_kwargs:
                    model_kwargs.update(thresh=self.clus_kwargs['conv_thresh'])
                model = VBGMM(n_components=k,
                              covariance_type=self.cvtype,
                              **model_kwargs)
                model.n_features = self.input_dim
                fit_kwargs = {}
                if 'max_iter' in self.clus_kwargs:
                    fit_kwargs.update(n_iter=self.clus_kwargs['max_iter'])
                model.fit(x, params='wmc', init_params='wmc', **fit_kwargs)
                self._labels[idx] = model.predict(x)
                self._parameters[idx] = model.means
                self._ll[idx] = model.score(x).sum()

                # evaluate goodness of fit
                self._gof[idx] = self.gof(x, self._ll[idx], k)

                # debug
                if self.debug is True:
                    print(self._gof[idx], model.converged_)
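The self.gof method called above is not shown in this snippet. A plausible BIC-style implementation, given that self._ll[idx] holds the summed log-likelihood, might look like the following (a hypothetical sketch, not the project's actual method):

import numpy as np

def gof_bic(x, log_likelihood, k):
    """Hypothetical BIC-style goodness of fit (lower is better)."""
    n, d = x.shape
    # Rough free-parameter count for a diagonal-covariance mixture:
    # (k - 1) weights, k * d means, k * d variances.
    n_params = (k - 1) + 2 * k * d
    return -2.0 * log_likelihood + n_params * np.log(n)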
Example #3
def main(method, cluster_num=30, alpha=.5):
    f = '/Users/davidgreenfield/Downloads/features_csv_tmp.csv'
    #f = '/Users/davidgreenfield/Downloads/features_f500.csv'
    cols = range(1, 4096)  # feature columns; column 0 holds the ASIN ids
    feats = np.loadtxt(f, delimiter=",", skiprows=1, usecols=cols)
    asins = np.loadtxt(f, delimiter=",", skiprows=1, usecols=[0], dtype=str)
    if method == 'kmeans':
        k_means = cluster.KMeans(n_clusters=cluster_num)
        k_means.fit(feats)
        y = k_means.labels_
        if MAKE_GRAPH == 1:
            print("hello 1")
        create_graph(k_means)
    elif method == 'GMM_VB':
        # fit() is an instance method, so build the estimator first, and
        # pass the alpha argument through instead of hard-coding 0.5
        gmm_vb = VBGMM(n_components=50, alpha=alpha).fit(feats)
        y = gmm_vb.predict(feats)
        cluster_no = len(np.unique(y))  # number of clusters actually used
    elif method == 'GMM_DP':
        gmm_dp = DPGMM(n_components=50, alpha=alpha)
        gmm_dp.fit(feats)
        y = gmm_dp.predict(feats)
        cluster_no = len(np.unique(y))

    clusters = []
    groups = {}
    data = load_data('./data/boots_aws.csv')

    # group the product ids by cluster label and emit one html page per group
    for i in range(cluster_num):
        groups[i] = np.where(y == i)
        ids = asins[groups[i]]
        clusters.append(ids)
        links = [data[x]['url'] for x in ids]
        create_html(links, "templates/groups/group" + str(i) + ".html")

    output_clusters(clusters, "outputs/clusters.csv")
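On scikit-learn >= 0.20 the 'GMM_VB' and 'GMM_DP' branches above collapse into a single estimator, with only the weight prior type differing. A sketch (mapping alpha onto weight_concentration_prior is an approximation, not the original code):

from sklearn.mixture import BayesianGaussianMixture

def make_bayesian_gmm(method, n_components=50, alpha=0.5):
    # 'dirichlet_distribution' ~ finite VBGMM, 'dirichlet_process' ~ DPGMM
    prior_type = ('dirichlet_distribution' if method == 'GMM_VB'
                  else 'dirichlet_process')
    return BayesianGaussianMixture(
        n_components=n_components,
        weight_concentration_prior_type=prior_type,
        weight_concentration_prior=alpha)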
Example #4
def do_model(self, **kwds):
    return VBGMM(verbose=False, **kwds)
Example #5
# Useful for seeing all sklearn estimators that have a `predict_proba` method
from sklearn.utils.testing import all_estimators

estimators = all_estimators()
for name, class_ in estimators:
    if hasattr(class_, 'predict_proba'):
        print(name)

# Now pick and choose the ones you like
estimators = {
    AdaBoostClassifier(): 'AdaBoost',
    BayesianGaussianMixture(): 'BayesianGaussianMixture',
    BernoulliNB(): 'BernoulliNB',
    DPGMM(): 'DPGMM',
    ExtraTreesClassifier(): 'ExtraTreesClassifier',
    GMM(): 'GMM',
    GaussianNB(): 'GaussianNB',
    GaussianProcessClassifier(): 'GaussianProcessClassifier',
    GradientBoostingClassifier(): 'GradientBoostingClassifier',
    KNeighborsClassifier(): 'KNeighborsClassifier',
    LabelPropagation(): 'LabelPropagation',
    LabelSpreading(): 'LabelSpreading',
    LinearDiscriminantAnalysis(): 'LinearDiscriminantAnalysis',
    LogisticRegression(): 'LogisticRegression',
    MLPClassifier(): 'MLPClassifier',
    NuSVC(): 'NuSVC',
    QuadraticDiscriminantAnalysis(): 'QuadraticDiscriminantAnalysis',
    RandomForestClassifier(): 'RandomForestClassifier',
    SGDClassifier(): 'SGDClassifier',
    SVC(): 'SVC',
    VBGMM(): 'VBGMM'
}
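One way a mapping like this might be consumed, scoring each estimator under its display name (a hypothetical helper, not part of the original; note that several keys above, e.g. GMM, DPGMM and VBGMM, only instantiate on scikit-learn < 0.20):

from sklearn.model_selection import cross_val_score

def score_all(estimators, X, y, cv=5):
    # estimators maps estimator instance -> display name, as above
    for est, name in estimators.items():
        scores = cross_val_score(est, X, y, cv=cv)
        print('%s: %.3f +/- %.3f' % (name, scores.mean(), scores.std()))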
def run_all_classifiers(X_train, X_test, y_train, y_test,
                        print_output_scores_to_csv=False,
                        output_scores_csv_file_suffix='',
                        print_only_table=False):
    """
    The list of all classifiers was generated by running the following commented code.

    Args:
        a_X_train, a_X_test, a_y_train, a_y_test: The train and tests datasets.
        a_print_output_scores_to_csv: If True the Precision, Recall, F1-Score and Support for both classes will
        be printed to a file with the current date and time.
        a_output_scores_csv_file_suffix: Suffix to be added to the csv file just before the .csv extension. Normally
        describing the run that is being performed.

    Returns:
        dataset: Returns output scores dataset.

    """
    assert isinstance(X_train, pd.DataFrame)
    assert isinstance(X_test,  pd.DataFrame)
    assert isinstance(y_train, pd.Series)
    assert isinstance(y_test,  pd.Series)
    assert isinstance(print_output_scores_to_csv, bool)
    assert isinstance(output_scores_csv_file_suffix, str)

    import time

    # https://stackoverflow.com/questions/42160313/how-to-list-all-classification-regression-clustering-algorithms-in-scikit-learn
    #from sklearn.utils.testing import all_estimators
    #estimators = all_estimators()
    #for name, class_ in estimators:
    #    log_print(name)

    from sklearn.calibration           import CalibratedClassifierCV
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
    from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
    from sklearn.ensemble              import AdaBoostClassifier
    from sklearn.ensemble              import BaggingClassifier
    from sklearn.ensemble              import ExtraTreesClassifier
    from sklearn.ensemble              import GradientBoostingClassifier
    from sklearn.ensemble              import RandomForestClassifier
    from sklearn.gaussian_process      import GaussianProcessClassifier
    from sklearn.linear_model          import LogisticRegression
    from sklearn.linear_model          import LogisticRegressionCV
    from sklearn.linear_model          import SGDClassifier

    from sklearn.mixture               import BayesianGaussianMixture
    from sklearn.mixture               import DPGMM
    from sklearn.mixture               import GaussianMixture
    from sklearn.mixture               import GMM
    from sklearn.mixture               import VBGMM
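    # NOTE: DPGMM, GMM and VBGMM above were deprecated in scikit-learn 0.18
    # and removed in 0.20; on modern releases these three imports raise
    # ImportError, and BayesianGaussianMixture replaces all of them.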
    from sklearn.naive_bayes           import BernoulliNB
    from sklearn.naive_bayes           import GaussianNB
    from sklearn.neighbors             import KNeighborsClassifier
    from sklearn.neural_network        import MLPClassifier
    from sklearn.semi_supervised       import LabelPropagation
    from sklearn.semi_supervised       import LabelSpreading
    from sklearn.svm                   import SVC
    from sklearn.tree                  import DecisionTreeClassifier
    #from xgboost                       import XGBClassifier

    models = []
    models.append(('AdaBoostClassifier',            AdaBoostClassifier()))
    models.append(('BaggingClassifier',             BaggingClassifier()))
    models.append(('BayesianGaussianMixture',       BayesianGaussianMixture()))
    models.append(('BernoulliNB',                   BernoulliNB()))
    models.append(('CalibratedClassifierCV',        CalibratedClassifierCV()))
    models.append(('DPGMM',                         DPGMM()))
    models.append(('DecisionTreeClassifier',        DecisionTreeClassifier(random_state=SEED)))
    models.append(('ExtraTreesClassifier',          ExtraTreesClassifier(random_state=SEED)))
    models.append(('GMM',                           GMM()))
    models.append(('GaussianMixture',               GaussianMixture()))
    models.append(('GaussianNB',                    GaussianNB()))
    models.append(('GaussianProcessClassifier',     GaussianProcessClassifier()))
    models.append(('GradientBoostingClassifier',    GradientBoostingClassifier()))
    models.append(('KNeighborsClassifier',          KNeighborsClassifier()))
    models.append(('LabelPropagation',              LabelPropagation()))
    models.append(('LabelSpreading',                LabelSpreading()))
    models.append(('LinearDiscriminantAnalysis',    LinearDiscriminantAnalysis()))
    models.append(('LogisticRegression',            LogisticRegression()))
    models.append(('LogisticRegressionCV',          LogisticRegressionCV()))
    models.append(('MLPClassifier',                 MLPClassifier()))
    #models.append(('MultinomialNB', MultinomialNB()))
    #models.append(('NuSVC', NuSVC()))
    models.append(('QuadraticDiscriminantAnalysis', QuadraticDiscriminantAnalysis()))
    models.append(('RandomForestClassifier',        RandomForestClassifier(random_state=SEED)))
    models.append(('SGDClassifier',                 SGDClassifier()))
    models.append(('SVC',                           SVC()))
    models.append(('VBGMM',                         VBGMM()))
    #models.append(('XGBClassifier',                 XGBClassifier()))
    
    output_scores_df = fit_predict_plot(X_train, X_test, y_train, y_test, models, print_only_table)

    if print_output_scores_to_csv:
        output_scores_df.to_csv(time.strftime(
            'output_scores' + str(output_scores_csv_file_suffix) + '.csv'))

    return output_scores_df

def run_all_classifiers(X_train, X_test, y_train, y_test, print_details=True):
    """
    Run all classifiers of sklearn

    Args:
        X_train, X_test, y_train, y_test: The train and tests datasets.
        print_details: if true, print details of all models and save csv table ;
                       if false, print only table with summary of the models
    Returns:
        dataset: Returns output scores dataset.

    """
    assert isinstance(X_train, pd.DataFrame)
    assert isinstance(X_test, pd.DataFrame)
    assert isinstance(y_train, pd.Series)
    assert isinstance(y_test, pd.Series)
    assert isinstance(print_details, bool)

    log_method_execution_time(log_funcname())

    from sklearn.utils.testing import all_estimators
    import sklearn.metrics
    import time
    from src.util.acq_util import RANDOM_SEED

    # https://stackoverflow.com/questions/42160313/how-to-list-all-classification-regression-clustering-algorithms-in-scikit-learn
    #from xgboost import XGBClassifier
    #models.append(('XGBClassifier', XGBClassifier()))

    models = all_estimators(type_filter='classifier')
    output_scores_dataset = pd.DataFrame(index=['Precision 0', 'Recall 0', 'F1-Score 0', 'Support 0',
                                                'Precision 1', 'Recall 1', 'F1-Score 1', 'Support 1'],
                                         columns=list(zip(*models))[0])

    for name, model in models:
        if print_details is True:
            print('------------------------------------------------------------------------------')
            print(name)
            print('------------------------------------------------------------------------------')

        if name in ('MultinomialNB', 'NuSVC', 'RadiusNeighborsClassifier',
                    'GaussianProcessClassifier'):
            continue

        model = model()
        if 'random_state' in model.get_params():
            model.random_state = SEED

        #Fitting the model.
        model.fit(X_train, y_train)

        #Measuring accuracy.
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)

        output_scores_dataset = class_compute_accuracy(y_train, y_train_pred, output_scores_dataset,
                                                       ['Accuracy on the train set', name], print_details)
        output_scores_dataset = class_compute_accuracy(y_test, y_test_pred, output_scores_dataset,
                                                       ['Accuracy on the test set', name], print_details)

        #Plotting confusion matrix.
        output_scores_dataset = class_compute_plot_confusion_matrix(y_test, y_test_pred, output_scores_dataset, name, print_details)

        #Showing classification report.
        if print_details is True:
            print(sklearn.metrics.classification_report(y_test, y_test_pred))

        # Printing scores to output dataset.
        output_scores_dataset = class_compute_recall_precision_f1(y_test, y_test_pred, output_scores_dataset, name)

    # Can use idxmax with axis=1 to find the column with the greatest value on each row.
    output_scores_dataset['Max Value'] = output_scores_dataset.apply(max, axis=1)
    #output_scores_dataset['Max Classifier'] = output_scores_dataset.idxmax(axis=1)

    if print_details is True:
        output_scores_dataset.to_csv('output_scores' + '.csv')

    return output_scores_dataset

def train_test_split_for_classification(dataset, label, test_size, random_state=SEED):
    """
    Selects X and y, considering that y has been renamed to label.
    """
    from sklearn.model_selection import train_test_split

    assert isinstance(dataset, pd.DataFrame)
    assert isinstance(test_size, float)
    assert isinstance(random_state, int)

    X = dataset.loc[:, dataset.columns != label]
    y = dataset[label]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state, stratify=y)
    log_print('X_train: {}'.format(X_train.shape))
    log_print('y_train: {}'.format(y_train.shape))
    log_print('X_test:  {}'.format(X_test.shape))
    log_print('y_test:  {}'.format(y_test.shape))
    return X_train, X_test, y_train, y_test
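
Hypothetical end-to-end usage of the two helpers above (the DataFrame df and its 'target' label column are assumptions for illustration):

X_train, X_test, y_train, y_test = train_test_split_for_classification(
    df, label='target', test_size=0.3)
scores = run_all_classifiers(X_train, X_test, y_train, y_test,
                             print_details=False)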