def featureSelectAndParamTuning(nmf_components):
    print("trying with NMF n_components=", 50 * nmf_components)
    model = NMF(n_components=50 * nmf_components,
                init='random',
                random_state=0,
                verbose=0)
    boost_input = model.fit_transform(data, y=y)
    #print("Finished NMF")
    print("Shape of NMF output is:", boost_input.shape)

    X_train, X_test, y_train, y_test = train_test_split(boost_input,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=42)

    model = XGBClassifier(silent=True, seed=42)
    param_grid = {
        #'subsample':[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0],
        #'scale_pos_weight':[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0],
        #'colsample_bytree':[0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0],
        'colsample_bylevel': [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    }

    start_time = timeit.default_timer()
    kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)

    grid_search = GridSearchCV(model, param_grid, n_jobs=-1, cv=kfold)
    grid_result = grid_search.fit(X_train, y_train)

    # Get the estimator
    print(grid_result.best_estimator_)
    print('CV accuracy of best parameters: %.3f' % grid_result.best_score_)

    model = grid_result.best_estimator_
    y_pred = model.predict(X_test)

    print "Accuracy Rate, which is calculated by accuracy_score() is: %f" % accuracy_score(
        y_test, y_pred)
    #accuracy_score(y_test, y_pred)
    elapsed = timeit.default_timer() - start_time
    print elapsed
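# A minimal usage sketch (not part of the original snippet; it assumes `data`
# and `y` are already loaded, e.g. a feature matrix and its labels): sweep a
# few NMF sizes with the function above.
for n in (1, 2, 3):  # tries n_components = 50, 100, 150
    featureSelectAndParamTuning(n)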
tf_idf = tf_idf_vectorizor.fit_transform(tf_data)
tf_idf_norm = normalize(tf_idf)
tf_idf_array = tf_idf_norm.toarray()
pd.DataFrame(tf_idf_array,
             columns=tf_idf_vectorizor.get_feature_names()).head()

KM = KMeans(4)
NNMF = NMF(4)

fittedKM = KM.fit(tf_idf_array)
fittedNNMF = NNMF.fit(tf_idf_array)
featuresNNMF = NNMF.transform(tf_idf_array)
print(featuresNNMF)

predictionKM = KM.predict(tf_idf_array)
# NMF has no predict(); assign each document to its dominant component instead
predictionNNMF = featuresNNMF.argmax(axis=1)

plt.scatter(tf_idf_array[:, 2],
            tf_idf_array[:, 3],
            c=predictionKM,
            s=50,
            cmap='viridis')

centers2 = fittedKM.cluster_centers_
# plot the centers in the same two feature dimensions as the points above
plt.scatter(centers2[:, 2], centers2[:, 3], c='black', s=300, alpha=0.6)

number_clusters = range(1, 7)

kmeans = [KMeans(n_clusters=i, max_iter=600) for i in number_clusters]
kmeans
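# Sketch of the elbow check these models are presumably built for (assumes the
# tf_idf_array computed above): fit each KMeans and plot inertia against k.
score = [km.fit(tf_idf_array).inertia_ for km in kmeans]
plt.plot(number_clusters, score)
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.show()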
print('val acc: ', hist.history['val_accuracy'][-1])

# %matplotlib inline
import matplotlib.pyplot as plt

plt.plot(hist.history['val_loss'])
plt.xlabel('epoch')
plt.ylabel('loss')
plt.show()
"""- 모델 평가(Test Model)"""

test_loss = model.evaluate([test_df.userId, test_df.movieId], test_df.rating)

print('test loss: ', test_loss)
"""- ## 학습된 머신을 활용한 예측"""

pd.options.display.float_format = '{:.2f}'.format  # set float display formatting
ratings_df[(ratings_df['userId'] == 249) & (ratings_df['movieId'] == 70)]
movies_df['movieId'].head(575)
ratings_df.loc[7000]

userId = 31  # 1 ~ 610
movieId = 165  # 1 ~ 193609; the IDs are sparse and not all of them appear in ratings_df
movie_title = list(movies_df[movies_df['movieId'] == movieId].title)[0]

user_v = np.expand_dims(userid2idx[userId], 0)
movie_v = np.expand_dims(movieid2idx[movieId], 0)
predict = model.predict([user_v, movie_v])

print('Predicted rating for the movie "{}" by user ID {}: {:.1f}'.format(
    movie_title, userId, predict[0][0]))
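# Hedged extension of the single prediction above (assumes the objects from the
# preceding cells): score a handful of candidate movieIds for the same user,
# reusing the exact call pattern that worked for one movie. The candidate IDs
# below are hypothetical and must exist in movieid2idx / movies_df.
candidate_ids = [1, 47, 296]
for mid in candidate_ids:
    title = list(movies_df[movies_df['movieId'] == mid].title)[0]
    u_v = np.expand_dims(userid2idx[userId], 0)
    m_v = np.expand_dims(movieid2idx[mid], 0)
    print('{}: predicted rating {:.1f}'.format(title, model.predict([u_v, m_v])[0][0]))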
Example #4
class mlexplorer:
    """use machine learning algorithms from scikit learn to explore spectroscopic datasets

    Performs automatic scaling and train/test split before NMF or PCA fit.

    Attributes
    ----------
    x : {array-like, sparse matrix}, shape = (n_samples, n_features)
        Spectra; n_features = n_frequencies.
    X_test : {array-like, sparse matrix}, shape = (n_samples, n_features)
        spectra organised in rows (1 row = one spectrum) that you want to use as a testing dataset. Those spectra should not be present in the x (training) dataset. The spectra should share a common X axis.
    algorithm : String,
        "PCA", "NMF", default = "PCA"
    scaling : Bool
        True or False. If True, data will be scaled prior to fitting (see below),
    scaler : String
        the type of scaling performed. Choose between "MinMaxScaler" and "StandardScaler"; see http://scikit-learn.org/stable/modules/preprocessing.html for details. Default = "MinMaxScaler".
    test_size : float
        the fraction of the dataset to use as a testing dataset; only used if X_test and y_test are not provided.
    rand_state : Float64
        the random seed that is used for reproducibility of the results. Default = 42.
    model : Scikit learn model
        A Scikit Learn object model, see scikit learn library documentation.

    Remarks
    -------

    For details on the hyperparameters of each algorithm, please consult the scikit-learn documentation directly at:

    http://scikit-learn.org/stable/

    Results for machine learning algorithms can vary from run to run. A way to solve that is to fix the random_state.

    Example
    -------

    Given an array X of n samples by m frequencies, and Y an array of n x 1 concentrations

    >>> explo = rampy.mlexplorer(X) # X is an array of signals built by mixing two partial components
    >>> explo.algorithm = 'NMF' # using Non-Negative Matrix factorization
    >>> explo.nb_compo = 2 # number of components to use
    >>> explo.test_size = 0.3 # size of test set
    >>> explo.scaler = "MinMax" # scaler
    >>> explo.fit() # fitting!
    >>> W = explo.model.transform(explo.X_train_sc) # getting the mixture array
    >>> H = explo.X_scaler.inverse_transform(explo.model.components_) # components in the original space
    >>> plt.plot(X,H.T) # plot the two components

    """
    def __init__(self, x, **kwargs):
        """
        Parameters
        ----------
        x : array{Float64}
            the spectra organised in rows (1 row = one spectrum). The spectra should share a common X axis.

        """
        self.x = x
        #
        # Kwargs extractions
        #
        self.X_test = kwargs.get("X_test", [0.0])
        self.algorithm = kwargs.get("algorithm", "PCA")
        self.test_size = kwargs.get("test_size", 0.3)
        self.scaling = kwargs.get("scaling", True)
        self.scaler = kwargs.get("scaler", "MinMaxScaler")
        self.rand_state = kwargs.get("rand_state", 42)
        self.nb_compo = kwargs.get("n_components", 2)

        if len(self.X_test) == 1:
            self.X_train, self.X_test = sklearn.model_selection.train_test_split(
                self.x, test_size=self.test_size, random_state=self.rand_state)
        elif self.X_test.shape[1] == self.x.shape[1]:
            self.X_train = np.copy(self.x)
        else:
            raise ValueError(
                "You tried to provide a testing dataset that has a different number of features (in columns) than the training set. Please correct this."
            )

        # initialising the preprocessor scaler
        if self.scaler == "StandardScaler":
            self.X_scaler = sklearn.preprocessing.StandardScaler()
        elif self.scaler == "MinMaxScaler":
            self.X_scaler = sklearn.preprocessing.MinMaxScaler()
        else:
            raise ValueError(
                "Choose the scaler between MinMaxScaler and StandardScaler")

        # fitting scaler
        self.X_scaler.fit(self.X_train)

        # scale the data in all cases; the scaled copies simply go unused if scaling is disabled
        self.X_train_sc = self.X_scaler.transform(self.X_train)
        self.X_test_sc = self.X_scaler.transform(self.X_test)

    def fit(self):
        """Train the model with the indicated algorithm.

        Do not forget to tune the hyperparameters.

        """
        if self.algorithm == "PCA":
            self.model = PCA(n_components=self.nb_compo)
        elif self.algorithm == "NMF":
            self.model = NMF(n_components=self.nb_compo, init="nndsvd")

        if self.scaling == True:
            self.model.fit(self.X_train_sc)
        else:
            self.model.fit(self.X_train)

    def refit(self):
        """Train the model with the indicated algorithm.

        Do not forget to tune the hyperparameters.

        """
        if self.scaling == True:
            self.model.fit(self.X_train_sc)
        else:
            self.model.fit(self.X_train)

    def predict(self, X):
        """Predict using the model.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = (n_samples, n_features)
            Samples.

        Returns
        -------
        C : array, shape = (n_samples, n_components)
            Returns the components (loadings) of X in the fitted decomposition.

        Remark
        ------
        If self.scaling is True, the scaler fitted on the training data is applied to X first.
        """
        if self.scaling == True:
            X_sc = self.X_scaler.transform(X)
            # PCA/NMF expose transform() rather than predict(); return the loadings
            return self.model.transform(X_sc)
        else:
            return self.model.transform(X)
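# Hedged end-to-end sketch based on the docstring Example above, using the class
# defined here directly (assumes numpy as np and the sklearn modules used by the
# class are importable). The two-component mixture X is synthetic.
import numpy as np

x_axis = np.linspace(0, 100, 500)
comp1 = np.exp(-0.5 * ((x_axis - 40) / 5) ** 2)   # first partial component
comp2 = np.exp(-0.5 * ((x_axis - 60) / 8) ** 2)   # second partial component
fractions = np.random.rand(20, 1)
X = fractions * comp1 + (1 - fractions) * comp2   # 20 mixed signals

explo = mlexplorer(X, algorithm='NMF', n_components=2)
explo.fit()
W = explo.model.transform(explo.X_train_sc)                     # mixing coefficients
H = explo.X_scaler.inverse_transform(explo.model.components_)   # end-member spectra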
class Topicmodel():
    '''
    Wrapper class for different topic models
    
    '''
    def __init__(self, folder='model', modeltype='kmeans', topics=100, topwords=10):
        # the classifier, which also contains the trained BoW transformer
        import pickle
        self.bow = pickle.load(open(folder + '/BoW_transformer.pickle', 'rb'))
        self.folder = folder
        self.modeltype = modeltype
        self.topics = topics
        self.topwords = topwords
        if self.modeltype == 'kmeans':
            from sklearn.cluster import KMeans
            self.model = KMeans(n_clusters=topics, n_init=50)
        if self.modeltype == 'kpcakmeans':
            from sklearn.cluster import KMeans
            from sklearn.decomposition import KernelPCA
            self.model = {'kpca': KernelPCA(kernel='rbf', gamma=.1),
                          'kmeans': KMeans(n_clusters=topics, n_init=50)}
        if self.modeltype == 'nmf':
            from sklearn.decomposition import NMF
            self.model = NMF(n_components=topics)

    def fit(self,X):
        '''
        fits a topic model

        INPUT
        X   list of strings
        '''

        # transform list of strings into sparse BoW matrix
        X = self.bow['tfidf_transformer'].fit_transform(\
            self.bow['count_vectorizer'].fit_transform(X))

        # transform word to BoW index into reverse lookup table
        words = self.bow['count_vectorizer'].vocabulary_.values()
        wordidx = self.bow['count_vectorizer'].vocabulary_.keys()
        self.idx2word = dict(zip(words,wordidx))         

        # depending on the model, train
        if self.modeltype == 'kmeans':
            Xc = self.model.fit_predict(X)
        if self.modeltype == 'kpcakmeans':
            Xc = self.model['kpca'].fit_transform(X)
            Xc = self.model['kmeans'].fit_predict(Xc)
        if self.modeltype == 'nmf':
            # assign each document to its dominant NMF component (argmax over components)
            Xc = self.model.fit_transform(X).argmax(axis=1)
        # for each cluster/topic compute covariance of word with cluster label
        # this measure is indicative of the importance of the word for the topic
        ass = zeros(self.topics)
        self.topicstats = []
        for cluster in range(self.topics): 
            # this is a binary vector, true if a data point was in this cluster
            y = double(Xc==cluster)
            # this is the covariance of the data with the cluster label
            Xcov = X.T.dot(y)
            # find the most strongly covarying (with the cluster label) words
            wordidx = reversed(Xcov.argsort()[-self.topwords:])
            topicwords = dict([(self.idx2word[idx], Xcov[idx]) for idx in wordidx])
            self.topicstats.append({'assignments': y.sum(), 'clusterid': cluster,
                                    'words': topicwords})

            print('Topic %d: %3d Assignments ' % (cluster, y.sum())
                  + 'Topwords: ' + ' '.join(list(topicwords.keys())[:10]))

        datestr = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
        fn = self.folder+'/topicmodel-%s-'%self.modeltype +datestr+'.json'
        print "Saving model stats to "+fn
        open(fn,'wb').write(json.dumps(self.topicstats))

    def predict(self,X):
        '''
        predicts cluster assignment from list of strings
        
        INPUT
        X   list of strings
        '''
        if not isinstance(X, list):
            X = [X]
        X = self.bow['tfidf_transformer'].transform(
            self.bow['count_vectorizer'].transform(X))
        if self.modeltype == 'kmeans':
            return self.model.predict(X)
        if self.modeltype == 'kpcakmeans':
            return self.model['kmeans'].predict(self.model['kpca'].transform(X))
        if self.modeltype == 'nmf':
            return self.model.transform(X).argmax(axis=1)
Example #7
            color = 'm->'

        W = data_column
        sz = W.shape

        train = W[:int(sz[0] * 0.2)]
        test = W[int(sz[0] * 0.2):]

        y = class_column[:int(sz[0] * 0.2)]
        X = train

        test_Y = class_column[int(sz[0] * 0.2):]
        test_X = test

        model = model.fit(X, y)
        y_hat = model.predict(test_X)
        scores = cross_val_score(model, X, y, cv=10)
        dim = np.arange(len(scores))

        ax.plot(scores, color, mfc='none')
        #ax.tight_layout()

        #print ("Accuracy Rate, which is calculated by accuracy_score() is: %f" % accuracy_score(test_Y, y_hat))
        #print ("Hamming loss: ", hamming_loss(test_Y, y_hat))
        #print ("Average precision score:", precision_score(test_Y, y_hat, average='macro'))

        #print ("Confusion matrix:\n", confusion_matrix(test_Y, y_hat))
    ax.legend(['LR', 'LDA', 'NB', 'MLP', 'SVM'])
    ax.set_xlabel('Iteration')
    ax.set_ylabel('Precision')
    ax.set_xticks(dim)