# Thin wrapper around a scikit-learn estimator. SKLModel is assumed to be an
# alias for the wrapped class, e.g. `from sklearn.decomposition import NMF as SKLModel`.
class NMFImpl():

    def __init__(self, n_components=None, init=None, solver='cd',
                 beta_loss='frobenius', tol=0.0001, max_iter=200,
                 random_state=None, alpha=0.0, l1_ratio=0.0, verbose=0,
                 shuffle=False):
        self._hyperparams = {
            'n_components': n_components,
            'init': init,
            'solver': solver,
            'beta_loss': beta_loss,
            'tol': tol,
            'max_iter': max_iter,
            'random_state': random_state,
            'alpha': alpha,
            'l1_ratio': l1_ratio,
            'verbose': verbose,
            'shuffle': shuffle}

    def fit(self, X, y=None):
        self._sklearn_model = SKLModel(**self._hyperparams)
        if y is not None:
            self._sklearn_model.fit(X, y)
        else:
            self._sklearn_model.fit(X)
        return self

    def transform(self, X):
        return self._sklearn_model.transform(X)

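# Minimal usage sketch (illustrative, not from the original source): assumes
# SKLModel aliases sklearn.decomposition.NMF and a scikit-learn version whose
# NMF still accepts the `alpha` parameter (it was removed in 1.2).
import numpy as np
from sklearn.decomposition import NMF as SKLModel

X = np.abs(np.random.RandomState(0).rand(6, 4))  # NMF needs non-negative input
model = NMFImpl(n_components=2, init='random', random_state=0).fit(X)
W = model.transform(X)  # (6, 2) latent representation of the samples
print(W.shape)
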
import numpy
from sklearn.decomposition import NMF


def a():
    # Demo: compare an external factorize() helper (nimfa-based, per the
    # print label) with scikit-learn's NMF on a toy user-item rating matrix.
    # Users 0-3 rate the first three items, users 4-6 the last two;
    # 2.5 acts as a neutral filler value.
    data_matrix = numpy.ones((7, 5)) * 2.5
    data_matrix[0, 0] = data_matrix[0, 1] = data_matrix[0, 2] = 1
    data_matrix[1, 0] = data_matrix[1, 1] = data_matrix[1, 2] = 3
    data_matrix[2, 0] = data_matrix[2, 1] = data_matrix[2, 2] = 4
    data_matrix[3, 0] = data_matrix[3, 1] = data_matrix[3, 2] = 5
    data_matrix[4, 3] = data_matrix[4, 4] = 4
    data_matrix[4, 1] = 2
    data_matrix[5, 3] = data_matrix[5, 4] = 5
    data_matrix[6, 3] = data_matrix[6, 4] = 2
    print(data_matrix)

    W, H = factorize(data_matrix)  # external helper, defined elsewhere
    print('nimfa', numpy.dot(W, H))
    # numpy.ones (not zeros) so the 2.5 filler survives the multiplication;
    # this mirrors the sklearn block below.
    rating_vector = numpy.ones((5, 1)) * 2.5
    rating_vector[0, 0] = 4
    # Project the rating vector into latent space and back.
    result = numpy.dot(H, rating_vector)
    result = numpy.dot(numpy.transpose(H), result)
    print(result)

    # Same experiment with scikit-learn's NMF.
    nmf = NMF(n_components=2)
    W = nmf.fit_transform(data_matrix)
    H = nmf.components_
    print(numpy.dot(W, H))
    rating_vector = numpy.ones((5, 1)) * 2.5
    rating_vector[0, 0] = 4
    result = numpy.dot(H, rating_vector)
    result = numpy.dot(numpy.transpose(H), result)
    print(result)
    print(result.max())

import numpy as np
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.decomposition import NMF
from tqdm import tqdm


class NMFRecommender(BaseEstimator, RegressorMixin):

    def __init__(self, n_components):
        # Use the constructor argument; the original hard-coded
        # n_components=2 and silently ignored the parameter.
        self.nmf = NMF(n_components=n_components, init='random', random_state=0)
        self.user_ids_dict = {}
        self.book_isbns_dict = {}

    def fit(self, X, y=None):
        self.sparse_matrix = X['sparse_matrix']
        self.user_ids_dict = X['user_ids_dict']
        self.book_isbns_dict = X['book_isbns_dict']
        self.nmf.fit(X['sparse_matrix'])
        return self  # scikit-learn convention: fit returns self

    def predict(self, X, y=None):
        ratings = X['ratings']
        user_representations = self.nmf.transform(self.sparse_matrix)
        book_representations = self.nmf.components_
        estimations = []
        for i in tqdm(range(len(ratings))):
            # Reconstruct the (user, book) cell of the rating matrix:
            # the user's row of W dotted with H, indexed at the book's column.
            estimation = np.dot(
                user_representations[self.user_ids_dict[ratings.iloc[i]['User-ID']]],
                book_representations)[self.book_isbns_dict[ratings.iloc[i]['ISBN']]]
            estimations.append(estimation)
        return estimations

    def fit_predict(self, X, y=None):
        self.fit(X, y)
        return self.predict(X, y)

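# Hedged end-to-end sketch: the dict layout ('sparse_matrix', 'user_ids_dict',
# 'book_isbns_dict', 'ratings') follows the fit/predict code above; the toy
# data itself is invented for illustration.
import pandas as pd

matrix = np.array([[5.0, 1.0],
                   [4.0, 2.0],
                   [1.0, 5.0]])  # 3 users x 2 books
X = {
    'sparse_matrix': matrix,
    'user_ids_dict': {'u1': 0, 'u2': 1, 'u3': 2},  # User-ID -> row index
    'book_isbns_dict': {'b1': 0, 'b2': 1},         # ISBN -> column index
    'ratings': pd.DataFrame({'User-ID': ['u1', 'u3'], 'ISBN': ['b2', 'b1']}),
}
print(NMFRecommender(n_components=2).fit_predict(X))  # one estimate per pair
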
# Method of a larger recommender class; relies on module-level imports of
# os, numpy, and sklearn.decomposition.NMF.
def factorize_data_matrix(self):
    self.logger.info("Going to factorize matrix")
    DM_W_FILE = '../../resources/scikit_dm_W.npy'
    DM_H_FILE = '../../resources/scikit_dm_H.npy'
    if os.path.isfile(DM_W_FILE) and os.path.isfile(DM_H_FILE):
        # Reuse cached factors so repeated runs skip the factorization.
        self.W = numpy.load(DM_W_FILE)
        self.H = numpy.load(DM_H_FILE)
    else:
        nmf = NMF(n_components=25)
        self.W = nmf.fit_transform(self.data_matrix)
        self.H = nmf.components_
        self.rmse()
        numpy.save(DM_W_FILE, self.W)
        numpy.save(DM_H_FILE, self.H)
    return self.W, self.H

# Variant of the wrapper __init__ that eagerly builds the wrapped operator;
# Op is assumed to be the underlying estimator class supplied elsewhere
# (e.g. sklearn.decomposition.NMF).
def __init__(self, n_components=None, init=None, solver='cd',
             beta_loss='frobenius', tol=0.0001, max_iter=200,
             random_state=None, alpha=0.0, l1_ratio=0.0, verbose=0,
             shuffle=False):
    self._hyperparams = {
        'n_components': n_components,
        'init': init,
        'solver': solver,
        'beta_loss': beta_loss,
        'tol': tol,
        'max_iter': max_iter,
        'random_state': random_state,
        'alpha': alpha,
        'l1_ratio': l1_ratio,
        'verbose': verbose,
        'shuffle': shuffle}
    self._wrapped_model = Op(**self._hyperparams)

import numpy as np
from sklearn.decomposition import NMF


def nmf(data, components):
    nmf = NMF(n_components=components, init="random", random_state=0)
    W = nmf.fit_transform(data)  # sample embedding
    H = nmf.components_          # component loadings
    R = np.dot(W, H)             # reconstruction; computed but unused here
    return W

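# Illustrative call (toy data, not from the original source):
data = np.abs(np.random.RandomState(0).rand(10, 6))  # non-negative input
W = nmf(data, components=3)
print(W.shape)  # (10, 3): one 3-dimensional embedding per row
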
# Excerpt from a name -> default-constructed estimator mapping.
'MDS': MDS(),
'MLPClassifier': MLPClassifier(),
'MLPRegressor': MLPRegressor(),
'MaxAbsScaler': MaxAbsScaler(),
'MeanShift': MeanShift(),
'MinCovDet': MinCovDet(),
'MinMaxScaler': MinMaxScaler(),
'MiniBatchDictionaryLearning': MiniBatchDictionaryLearning(),
'MiniBatchKMeans': MiniBatchKMeans(),
'MiniBatchSparsePCA': MiniBatchSparsePCA(),
'MultiTaskElasticNet': MultiTaskElasticNet(),
'MultiTaskElasticNetCV': MultiTaskElasticNetCV(),
'MultiTaskLasso': MultiTaskLasso(),
'MultiTaskLassoCV': MultiTaskLassoCV(),
'MultinomialNB': MultinomialNB(),
'NMF': NMF(),
'NearestCentroid': NearestCentroid(),
'NearestNeighbors': NearestNeighbors(),
'Normalizer': Normalizer(),
'NuSVC': NuSVC(),
'NuSVR': NuSVR(),
'Nystroem': Nystroem(),
'OAS': OAS(),
'OneClassSVM': OneClassSVM(),
'OrthogonalMatchingPursuit': OrthogonalMatchingPursuit(),
'OrthogonalMatchingPursuitCV': OrthogonalMatchingPursuitCV(),
'PCA': PCA(),
'PLSCanonical': PLSCanonical(),
'PLSRegression': PLSRegression(),
'PLSSVD': PLSSVD(),
'PassiveAggressiveClassifier': PassiveAggressiveClassifier(),

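# Hypothetical use of such a mapping (trimmed to two entries here):
# look estimators up by name and fit them on shared toy data.
import numpy as np
from sklearn.decomposition import NMF, PCA

estimators = {'NMF': NMF(n_components=2), 'PCA': PCA(n_components=2)}
X = np.random.RandomState(0).rand(8, 4)  # values in [0, 1), so NMF accepts them
for name, est in estimators.items():
    print(name, est.fit_transform(X).shape)
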
import gc
from collections import defaultdict
from time import time

import numpy as np
from sklearn.datasets import make_low_rank_matrix
from sklearn.decomposition import NMF


def benchmark(samples_range, features_range, rank=50, tolerance=1e-5):
    timeset = defaultdict(lambda: [])
    err = defaultdict(lambda: [])
    for n_samples in samples_range:
        for n_features in features_range:
            print("%2d samples, %2d features" % (n_samples, n_features))
            print('=======================')
            # NMF requires non-negative input, hence the np.abs().
            X = np.abs(make_low_rank_matrix(n_samples, n_features,
                                            effective_rank=rank,
                                            tail_strength=0.2))

            gc.collect()
            print("benchmarking nndsvd-nmf: ")
            tstart = time()
            m = NMF(n_components=30, tol=tolerance, init='nndsvd').fit(X)
            tend = time() - tstart
            timeset['nndsvd-nmf'].append(tend)
            err['nndsvd-nmf'].append(m.reconstruction_err_)
            report(m.reconstruction_err_, tend)

            gc.collect()
            print("benchmarking nndsvda-nmf: ")
            tstart = time()
            m = NMF(n_components=30, init='nndsvda', tol=tolerance).fit(X)
            tend = time() - tstart
            timeset['nndsvda-nmf'].append(tend)
            err['nndsvda-nmf'].append(m.reconstruction_err_)
            report(m.reconstruction_err_, tend)

            gc.collect()
            print("benchmarking nndsvdar-nmf: ")
            tstart = time()
            m = NMF(n_components=30, init='nndsvdar', tol=tolerance).fit(X)
            tend = time() - tstart
            timeset['nndsvdar-nmf'].append(tend)
            err['nndsvdar-nmf'].append(m.reconstruction_err_)
            report(m.reconstruction_err_, tend)

            gc.collect()
            print("benchmarking random-nmf")
            tstart = time()
            m = NMF(n_components=30, init='random', max_iter=1000,
                    tol=tolerance).fit(X)
            tend = time() - tstart
            timeset['random-nmf'].append(tend)
            err['random-nmf'].append(m.reconstruction_err_)
            report(m.reconstruction_err_, tend)

            gc.collect()
            print("benchmarking alt-random-nmf")
            tstart = time()
            # alt_nnmf and report are helpers defined elsewhere in the benchmark.
            W, H = alt_nnmf(X, r=30, init='random', tol=tolerance)
            tend = time() - tstart
            timeset['alt-random-nmf'].append(tend)
            err['alt-random-nmf'].append(np.linalg.norm(X - np.dot(W, H)))
            report(np.linalg.norm(X - np.dot(W, H)), tend)

    return timeset, err

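# Illustrative invocation (small ranges keep the runtime manageable);
# assumes the report() and alt_nnmf() helpers referenced above are defined.
samples_range = np.linspace(50, 150, 3).astype(int)
features_range = np.linspace(50, 150, 3).astype(int)
timeset, err = benchmark(samples_range, features_range, rank=20)
for name, times in timeset.items():
    print(name, times)
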
# Earlier variant of the benchmark above; relies on the same imports and
# on the alt_nnmf helper.
def compute_bench(samples_range, features_range, rank=50, tolerance=1e-7):
    it = 0
    timeset = defaultdict(lambda: [])
    err = defaultdict(lambda: [])
    max_it = len(samples_range) * len(features_range)
    for n_samples in samples_range:
        for n_features in features_range:
            it += 1
            print('====================')
            print('Iteration %03d of %03d' % (it, max_it))
            print('====================')
            X = np.abs(make_low_rank_matrix(n_samples, n_features,
                                            effective_rank=rank,
                                            tail_strength=0.2))

            gc.collect()
            print("benching nndsvd-nmf: ")
            tstart = time()
            m = NMF(n_components=30, tol=tolerance, init='nndsvd').fit(X)
            tend = time() - tstart
            timeset['nndsvd-nmf'].append(tend)
            err['nndsvd-nmf'].append(m.reconstruction_err_)
            print(m.reconstruction_err_, tend)

            gc.collect()
            print("benching nndsvda-nmf: ")
            tstart = time()
            m = NMF(n_components=30, init='nndsvda', tol=tolerance).fit(X)
            tend = time() - tstart
            timeset['nndsvda-nmf'].append(tend)
            err['nndsvda-nmf'].append(m.reconstruction_err_)
            print(m.reconstruction_err_, tend)

            gc.collect()
            print("benching nndsvdar-nmf: ")
            tstart = time()
            m = NMF(n_components=30, init='nndsvdar', tol=tolerance).fit(X)
            tend = time() - tstart
            timeset['nndsvdar-nmf'].append(tend)
            err['nndsvdar-nmf'].append(m.reconstruction_err_)
            print(m.reconstruction_err_, tend)

            gc.collect()
            print("benching random-nmf")
            tstart = time()
            m = NMF(n_components=30, init=None, max_iter=1000,
                    tol=tolerance).fit(X)
            tend = time() - tstart
            timeset['random-nmf'].append(tend)
            err['random-nmf'].append(m.reconstruction_err_)
            print(m.reconstruction_err_, tend)

            gc.collect()
            print("benching alt-random-nmf")
            tstart = time()
            W, H = alt_nnmf(X, r=30, R=None, tol=tolerance)
            tend = time() - tstart
            timeset['alt-random-nmf'].append(tend)
            err['alt-random-nmf'].append(np.linalg.norm(X - np.dot(W, H)))
            print(np.linalg.norm(X - np.dot(W, H)), tend)

    return timeset, err

# Assumes an older scikit-learn (RandomizedPCA was removed in 0.20);
# singleLabelScore and fs.FeatureDeleter are project-local helpers.
from sklearn import svm
from sklearn.decomposition import NMF, PCA, SparsePCA, KernelPCA, RandomizedPCA
from sklearn.ensemble import (ExtraTreesClassifier, GradientBoostingRegressor,
                              RandomForestRegressor)
from sklearn.feature_selection import (SelectKBest, SelectPercentile,
                                       VarianceThreshold, f_classif)
from sklearn.metrics import make_scorer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler

scorer = make_scorer(score_func=singleLabelScore, greater_is_better=False)

# PREPROCESSING
# SCALING
minMaxScaler = MinMaxScaler(feature_range=(0.0, 1.0))
#normalizer = skprep.Normalizer()
columnDeleter = fs.FeatureDeleter()

# FEATURE SELECTION
varianceThresholdSelector = VarianceThreshold(threshold=(0))
percentileSelector = SelectPercentile(score_func=f_classif, percentile=20)
kBestSelector = SelectKBest(f_classif, k=1000)

# FEATURE EXTRACTION
#rbmPipe = skpipe.Pipeline(steps=[('scaling', minMaxScaler), ('rbm', rbm)])
nmf = NMF(n_components=150)
pca = PCA(n_components=80)
sparse_pca = SparsePCA(n_components=700, max_iter=3, verbose=2)
kernel_pca = KernelPCA(n_components=150)  # Costs huge amounts of RAM
randomized_pca = RandomizedPCA(n_components=500)

# REGRESSORS
random_forest_regressor = RandomForestRegressor(n_estimators=256)
gradient_boosting_regressor = GradientBoostingRegressor(n_estimators=60)
support_vector_regressor = svm.SVR()

# CLASSIFIERS
support_vector_classifier = svm.SVC(probability=True, verbose=True)
linear_support_vector_classifier = svm.LinearSVC(dual=False)
nearest_neighbor_classifier = KNeighborsClassifier()
extra_trees_classifier = ExtraTreesClassifier(n_estimators=256)
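
# None of the estimators above are wired together yet; a minimal sketch
# (step names invented) of one way to combine them in a Pipeline:
from sklearn.pipeline import Pipeline

model = Pipeline(steps=[
    ('scale', minMaxScaler),               # map features into [0, 1]
    ('nmf', nmf),                          # non-negative factorization
    ('clf', nearest_neighbor_classifier),  # classify in the latent space
])
# model.fit(X_train, y_train); predictions = model.predict(X_test)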