def check_transformer_pickle(name, Transformer):
    X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
                      random_state=0, n_features=2, cluster_std=0.1)
    n_samples, n_features = X.shape
    X = StandardScaler().fit_transform(X)
    X -= X.min()
    # catch deprecation warnings
    with warnings.catch_warnings(record=True):
        transformer = Transformer()
    if not hasattr(transformer, 'transform'):
        return
    set_random_state(transformer)
    set_fast_parameters(transformer)

    # fit
    if name in CROSS_DECOMPOSITION:
        random_state = np.random.RandomState(seed=12345)
        y_ = np.vstack([y, 2 * y + random_state.randint(2, size=len(y))])
        y_ = y_.T
    else:
        y_ = y

    transformer.fit(X, y_)
    X_pred = transformer.fit(X, y_).transform(X)
    pickled_transformer = pickle.dumps(transformer)
    unpickled_transformer = pickle.loads(pickled_transformer)
    pickled_X_pred = unpickled_transformer.transform(X)

    assert_array_almost_equal(pickled_X_pred, X_pred)
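For reference, a minimal, self-contained sketch of the same pickle round-trip idea using only public scikit-learn API; StandardScaler stands in here for an arbitrary transformer and all names below are illustrative.

import pickle

import numpy as np
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler

X_demo, _ = make_blobs(n_samples=30, random_state=0)
scaler = StandardScaler().fit(X_demo)
X_before = scaler.transform(X_demo)
# Serialize the fitted transformer, restore it, and check the output is unchanged
restored = pickle.loads(pickle.dumps(scaler))
np.testing.assert_array_almost_equal(restored.transform(X_demo), X_before)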
def check_classifiers_classes(name, Classifier):
    X, y = make_blobs(n_samples=30, random_state=0, cluster_std=0.1)
    X, y = shuffle(X, y, random_state=7)
    X = StandardScaler().fit_transform(X)
    # We need to make sure that we have non negative data, for things
    # like NMF
    X -= X.min() - .1
    y_names = np.array(["one", "two", "three"])[y]

    for y_names in [y_names, y_names.astype('O')]:
        if name in ["LabelPropagation", "LabelSpreading"]:
            # TODO some complication with -1 label
            y_ = y
        else:
            y_ = y_names

        classes = np.unique(y_)
        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            classifier = Classifier()
        if name == 'BernoulliNB':
            classifier.set_params(binarize=X.mean())
        set_fast_parameters(classifier)
        # fit
        classifier.fit(X, y_)

        y_pred = classifier.predict(X)
        # training set performance
        assert_array_equal(np.unique(y_), np.unique(y_pred))
        if np.any(classifier.classes_ != classes):
            print("Unexpected classes_ attribute for %r: "
                  "expected %s, got %s" %
                  (classifier, classes, classifier.classes_))
def PCA_f(X, soglia):
    """Transform a data set via principal component analysis.

    Inputs:
    X -- the data set as a DataFrame
    soglia -- the minimum fraction of the original variance that the
    reduced data set must still explain
    Outputs:
    X_df_transformed -- DataFrame made up of the principal components;
    the number of components kept is determined by the soglia
    (threshold) parameter given as input
    """
    X_std = StandardScaler().fit_transform(X)
    # Rescale the standardized data to the [0, 1] range
    X_std = (X_std - X_std.min(axis=0)) / (X_std.max(axis=0) -
                                           X_std.min(axis=0))
    X = pd.DataFrame(data=X_std, index=None, columns=X.columns)
    pca = PCA(n_components=len(X.columns))  # keep all components for now
    pca.fit(X)
    # Project the data onto the principal components
    X_transformed = pca.transform(X)
    # Cumulative explained-variance ratio after 1, 2, ..., n components
    var_cumulata = np.array([
        pca.explained_variance_ratio_[:i].sum()
        for i in range(1, len(X.columns) + 1)
    ]).round(2)
    # Smallest number of components whose cumulative ratio reaches the threshold
    n_ok = int(np.argmax(var_cumulata >= soglia)) + 1
    pca_names = ["PCA " + str(i + 1) for i in range(n_ok)]
    X_df_transformed = pd.DataFrame(data=X_transformed[:, :n_ok],
                                    index=None,
                                    columns=pca_names)
    return X_df_transformed
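A hypothetical usage sketch for PCA_f: the DataFrame and the 0.90 threshold below are made up for illustration, and the function's own imports (numpy, pandas, StandardScaler, PCA) are assumed to be in place.

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
df_demo = pd.DataFrame(rng.rand(100, 5),
                       columns=['f%d' % i for i in range(5)])
# Keep just enough principal components to explain at least 90% of the variance
reduced = PCA_f(df_demo, soglia=0.90)
print(reduced.shape)              # (100, number of retained components)
print(reduced.columns.tolist())   # e.g. ['PCA 1', 'PCA 2', ...]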
Example #4
def check_classifiers_classes(name, classifier_orig):
    X_multiclass, y_multiclass = make_blobs(n_samples=30,
                                            random_state=0,
                                            cluster_std=0.1)
    X_multiclass, y_multiclass = shuffle(X_multiclass,
                                         y_multiclass,
                                         random_state=7)
    X_multiclass = StandardScaler().fit_transform(X_multiclass)
    # We need to make sure that we have non negative data, for things
    # like NMF
    X_multiclass -= X_multiclass.min() - .1

    X_binary = X_multiclass[y_multiclass != 2]
    y_binary = y_multiclass[y_multiclass != 2]

    X_binary = pairwise_estimator_convert_X(X_binary, classifier_orig)

    labels_binary = ["one", "two"]

    y_names_binary = np.take(labels_binary, y_binary)

    for X, y, y_names in [(X_binary, y_binary, y_names_binary)]:
        for y_names_i in [y_names, y_names.astype('O')]:
            y_ = choose_check_classifiers_labels(name, y, y_names_i)
            check_classifiers_predictions(X, y_, name, classifier_orig)

    labels_binary = [-1, 1]
    y_names_binary = np.take(labels_binary, y_binary)
    y_binary = choose_check_classifiers_labels(name, y_binary, y_names_binary)
    check_classifiers_predictions(X_binary, y_binary, name, classifier_orig)
Example #5
def test_transformers_data_not_an_array():
    # test if transformers do something sensible on training set
    # also test all shapes / shape errors
    transformers = all_estimators(type_filter='transformer')
    X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
                      random_state=0, n_features=2, cluster_std=0.1)
    X = StandardScaler().fit_transform(X)
    # We need to make sure that we have non negative data, for things
    # like NMF
    X -= X.min() - .1

    for name, Transformer in transformers:
        # XXX: some transformers are transforming the input
        # data. This is a bug that we'll fix later. Right now we copy
        # the data each time
        this_X = NotAnArray(X.copy())
        this_y = NotAnArray(np.asarray(y))
        if name in dont_test:
            continue
        # these don't actually fit the data:
        if name in ['AdditiveChi2Sampler', 'Binarizer', 'Normalizer']:
            continue
        # And these want multivariate output
        if name in ('PLSCanonical', 'PLSRegression', 'CCA', 'PLSSVD'):
            continue
        yield check_transformer, name, Transformer, this_X, this_y
def check_transformer_general(name, Transformer):
    X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
                      random_state=0, n_features=2, cluster_std=0.1)
    X = StandardScaler().fit_transform(X)
    X -= X.min()
    _check_transformer(name, Transformer, X, y)
    _check_transformer(name, Transformer, X.tolist(), y.tolist())
Example #7
def check_transformer_pickle(name, Transformer):
    X, y = make_blobs(n_samples=30,
                      centers=[[0, 0, 0], [1, 1, 1]],
                      random_state=0,
                      n_features=2,
                      cluster_std=0.1)
    n_samples, n_features = X.shape
    X = StandardScaler().fit_transform(X)
    X -= X.min()
    # catch deprecation warnings
    with warnings.catch_warnings(record=True):
        transformer = Transformer()
    if not hasattr(transformer, 'transform'):
        return
    set_random_state(transformer)
    set_fast_parameters(transformer)

    # fit
    if name in CROSS_DECOMPOSITION:
        random_state = np.random.RandomState(seed=12345)
        y_ = np.vstack([y, 2 * y + random_state.randint(2, size=len(y))])
        y_ = y_.T
    else:
        y_ = y

    transformer.fit(X, y_)
    X_pred = transformer.fit(X, y_).transform(X)
    pickled_transformer = pickle.dumps(transformer)
    unpickled_transformer = pickle.loads(pickled_transformer)
    pickled_X_pred = unpickled_transformer.transform(X)

    assert_array_almost_equal(pickled_X_pred, X_pred)
Example #8
def check_transformer_general(name, Transformer):
    X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
                      random_state=0, n_features=2, cluster_std=0.1)
    X = StandardScaler().fit_transform(X)
    X -= X.min()
    _check_transformer(name, Transformer, X, y)
    _check_transformer(name, Transformer, X.tolist(), y.tolist())
Example #9
def check_classifiers_classes(name, Classifier):
    X, y = make_blobs(n_samples=30, random_state=0, cluster_std=0.1)
    X, y = shuffle(X, y, random_state=7)
    X = StandardScaler().fit_transform(X)
    # We need to make sure that we have non negative data, for things
    # like NMF
    X -= X.min() - .1
    y_names = np.array(["one", "two", "three"])[y]

    for y_names in [y_names, y_names.astype('O')]:
        if name in ["LabelPropagation", "LabelSpreading"]:
            # TODO some complication with -1 label
            y_ = y
        else:
            y_ = y_names

        classes = np.unique(y_)
        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            classifier = Classifier()
        if name == 'BernoulliNB':
            classifier.set_params(binarize=X.mean())
        set_fast_parameters(classifier)
        # fit
        classifier.fit(X, y_)

        y_pred = classifier.predict(X)
        # training set performance
        assert_array_equal(np.unique(y_), np.unique(y_pred))
        if np.any(classifier.classes_ != classes):
            print("Unexpected classes_ attribute for %r: "
                  "expected %s, got %s" %
                  (classifier, classes, classifier.classes_))
Example #10
def test_transformers_pickle():
    # test if transformers do something sensible on training set
    # also test all shapes / shape errors
    transformers = all_estimators(type_filter='transformer')
    X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
                      random_state=0, n_features=2, cluster_std=0.1)
    n_samples, n_features = X.shape
    X = StandardScaler().fit_transform(X)
    X -= X.min()

    succeeded = True

    for name, Transformer in transformers:
        if name in dont_test:
            continue
        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            transformer = Transformer()
        if not hasattr(transformer, 'transform'):
            continue
        set_random_state(transformer)
        if hasattr(transformer, 'compute_importances'):
            transformer.compute_importances = True

        if name == "SelectKBest":
            # SelectKBest has a default of k=10
            # which is more features than we have.
            transformer.k = 1
        elif name in ['GaussianRandomProjection', 'SparseRandomProjection']:
            # Due to the jl lemma and very few samples, the number
            # of components of the random matrix projection will be greater
            # than the number of features.
            # So we impose a smaller number (avoid "auto" mode)
            transformer.n_components = 1

        # fit
        if name in ('PLSCanonical', 'PLSRegression', 'CCA',
                    'PLSSVD'):
            random_state = np.random.RandomState(seed=12345)
            y_ = np.vstack([y, 2 * y + random_state.randint(2, size=len(y))])
            y_ = y_.T
        else:
            y_ = y

        transformer.fit(X, y_)
        X_pred = transformer.fit(X, y_).transform(X)
        pickled_transformer = pickle.dumps(transformer)
        unpickled_transformer = pickle.loads(pickled_transformer)
        pickled_X_pred = unpickled_transformer.transform(X)

        try:
            assert_array_almost_equal(pickled_X_pred, X_pred)
        except Exception as exc:
            succeeded = False
            print ("Transformer %s doesn't predict the same value "
                   "after pickling" % name)
            raise exc

    assert_true(succeeded)
def check_transformer_data_not_an_array(name, Transformer):
    X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]], random_state=0, n_features=2, cluster_std=0.1)
    X = StandardScaler().fit_transform(X)
    # We need to make sure that we have non negative data, for things
    # like NMF
    X -= X.min() - 0.1
    this_X = NotAnArray(X)
    this_y = NotAnArray(np.asarray(y))
    _check_transformer(name, Transformer, this_X, this_y)
Example #12
def analyze_regression(X, y, ax, cPoint, cSLine, metrics):
	try:
		_X = StandardScaler().fit_transform(X.values.reshape(-1, 1))
		_y = StandardScaler().fit_transform(y.values.reshape(-1, 1))

		_linear = LinearRegression().fit(_X, _y)
		_predict = _linear.predict(_X)
		ax.scatter(_X, _y, color = cPoint, marker = '.', alpha = .6, label = 'Dispersión')
		_pn, _px = _X.tolist().index(_X.min()), _X.tolist().index(_X.max())
		ax.plot((_X.min(), _X.max()), (_predict[_pn], _predict[_px]), color = cSLine, label = 'Recta de Regresión')

		ax.set_title('Regresión Lineal')
		ax.set_xticks(()); ax.set_yticks(())
		ax.legend()

		if metrics: _print_regressionMetrics(_linear, _X, _y, _predict)
	except Exception as e:
		return e
Example #13
def check_transformer_data_not_an_array(name, Transformer):
    X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
                      random_state=0, n_features=2, cluster_std=0.1)
    X = StandardScaler().fit_transform(X)
    # We need to make sure that we have non negative data, for things
    # like NMF
    X -= X.min() - .1
    this_X = NotAnArray(X)
    this_y = NotAnArray(np.asarray(y))
    _check_transformer(name, Transformer, this_X, this_y)
Example #14
def qt_analysis(res):
    qt = res.drop(res.columns.intersection(['N2', 'dt', 'f']), axis=1)
    qt.hist()
    plt.show()
    qt_std = StandardScaler().fit_transform(qt)

    max = qt_std.max(0)
    min = qt_std.min(0)

    schew = sum(qt_std > 1) / qt_std.shape[0] + \
            sum(qt_std < -1) / qt_std.shape[0]

    return schew
Example #15
def test_transformers_pickle():
    # test if transformers do something sensible on training set
    # also test all shapes / shape errors
    transformers = all_estimators(type_filter="transformer")
    X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]], random_state=0, n_features=2, cluster_std=0.1)
    n_samples, n_features = X.shape
    X = StandardScaler().fit_transform(X)
    X -= X.min()

    for name, Transformer in transformers:
        if name in dont_test:
            continue
        yield check_transformer_pickle, name, Transformer, X, y
Example #16
def analyze_classification(X, y, ax, cContourf, cInlier, cOutlier, gamma, metrics):
	try:
		_X = StandardScaler().fit_transform(X.values.reshape(-1, 1))
		_y = StandardScaler().fit_transform(y.values.reshape(-1, 1))

		_padding = CONTOURF_CLASSIFICATION_MINING_PADDING
		_mesh_sted_size = CONTOURF_CLASSIFICATION_MINING_MESH_STEP_SIZE
		_X_min, _X_max = _X.min() - _padding, _X.max() + _padding
		_y_min, _y_max = _y.min() - _padding, _y.max() + _padding
		_mapx, _mapy = meshgrid(arange(_X_min, _X_max, _mesh_sted_size),
			arange(_y_min, _y_max, _mesh_sted_size))

		if gamma == 0: gamma = 'auto'

		_classifier = OneClassSVM(kernel = 'rbf', gamma = gamma,
			random_state = 0).fit(c_[_X, _y])

		_Z = _classifier.decision_function(c_[_mapx.ravel(), _mapy.ravel()])
		_predict = _classifier.predict(c_[_X, _y])

		ax.contourf(_mapx, _mapy, _Z.reshape(_mapx.shape), cmap = cContourf, alpha = .7)
		_sub_XIn, _sub_XOut = list(), list()
		_sub_yIn, _sub_yOut = list(), list()
		for i in range(_predict.size):
			if _predict[i] == 1:
				_sub_XIn.append(_X[i]); _sub_yIn.append(_y[i])
			else:
				_sub_XOut.append(_X[i]); _sub_yOut.append(_y[i])
		ax.scatter(_sub_XIn, _sub_yIn, c = cInlier, marker = '.', alpha = .6, label = 'Inliers')
		ax.scatter(_sub_XOut, _sub_yOut, c = cOutlier, marker = '.', alpha = .6, label = 'OutLiers')

		ax.set_title('SVM')
		ax.set_xticks(()); ax.set_yticks(())
		ax.legend()

		if metrics: _print_classificationMetrics(_classifier, _predict)
	except Exception as e:
		return e
Example #17
def test_32_64_decomposition_shape():
    """ Test that the decomposition is similar for 32 and 64 bits data """
    # see https://github.com/scikit-learn/scikit-learn/issues/18146
    X, y = make_blobs(n_samples=30,
                      centers=[[0, 0, 0], [1, 1, 1]],
                      random_state=0,
                      cluster_std=0.1)
    X = StandardScaler().fit_transform(X)
    X -= X.min()

    # Compare the shapes (corresponds to the number of non-zero eigenvalues)
    kpca = KernelPCA()
    assert (kpca.fit_transform(X).shape == kpca.fit_transform(
        X.astype(np.float32)).shape)
Example #18
def test_transformers_pickle():
    # test if transformers do something sensible on training set
    # also test all shapes / shape errors
    transformers = all_estimators(type_filter='transformer')
    X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
                      random_state=0, n_features=2, cluster_std=0.1)
    n_samples, n_features = X.shape
    X = StandardScaler().fit_transform(X)
    X -= X.min()

    for name, Transformer in transformers:
        if name in dont_test:
            continue
        yield check_transformer_pickle, name, Transformer, X, y
def check_transformer(name, Transformer):
    if name in ('CCA', 'LocallyLinearEmbedding', 'KernelPCA') and _is_32bit():
        # Those transformers yield non-deterministic output when executed on
        # a 32bit Python. The same transformers are stable on 64bit Python.
        # FIXME: try to isolate a minimalistic reproduction case only depending
        # on numpy & scipy and/or maybe generate a test dataset that does not
        # cause such unstable behaviors.
        msg = name + ' is non deterministic on 32bit Python'
        raise SkipTest(msg)

    X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
                      random_state=0, n_features=2, cluster_std=0.1)
    X = StandardScaler().fit_transform(X)
    X -= X.min()
    _check_transformer(name, Transformer, X, y)
Example #20
def test_transformers():
    # test if transformers do something sensible on training set
    # also test all shapes / shape errors
    transformers = all_estimators(type_filter="transformer")
    X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]], random_state=0, n_features=2, cluster_std=0.1)
    X = StandardScaler().fit_transform(X)
    X -= X.min()

    for name, Transformer in transformers:
        if name in dont_test:
            continue
        # these don't actually fit the data:
        if name in ["AdditiveChi2Sampler", "Binarizer", "Normalizer"]:
            continue
        yield check_transformer, name, Transformer, X, y
Example #21
def test_transformers():
    # test if transformers do something sensible on training set
    # also test all shapes / shape errors
    transformers = all_estimators(type_filter='transformer')
    X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
                      random_state=0, n_features=2, cluster_std=0.1)
    X = StandardScaler().fit_transform(X)
    X -= X.min()

    for name, Transformer in transformers:
        if name in dont_test:
            continue
        # these don't actually fit the data:
        if name in ['AdditiveChi2Sampler', 'Binarizer', 'Normalizer']:
            continue
        yield check_transformer, name, Transformer, X, y
def check_transformer_pickle(name, Transformer):
    X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
                      random_state=0, n_features=2, cluster_std=0.1)
    n_samples, n_features = X.shape
    X = StandardScaler().fit_transform(X)
    X -= X.min()
    # catch deprecation warnings
    with warnings.catch_warnings(record=True):
        transformer = Transformer()
    if not hasattr(transformer, 'transform'):
        return
    set_random_state(transformer)
    if hasattr(transformer, 'compute_importances'):
        transformer.compute_importances = True

    if name == "SelectKBest":
        # SelectKBest has a default of k=10
        # which is more features than we have.
        transformer.k = 1
    elif name in ['GaussianRandomProjection', 'SparseRandomProjection']:
        # Due to the jl lemma and very few samples, the number
        # of components of the random matrix projection will be greater
        # than the number of features.
        # So we impose a smaller number (avoid "auto" mode)
        transformer.n_components = 1

    if "n_iter" in transformer.get_params():
        # speed up some estimators
        transformer.set_params(n_iter=5)

    # fit
    if name in CROSS_DECOMPOSITION:
        random_state = np.random.RandomState(seed=12345)
        y_ = np.vstack([y, 2 * y + random_state.randint(2, size=len(y))])
        y_ = y_.T
    else:
        y_ = y

    transformer.fit(X, y_)
    X_pred = transformer.fit(X, y_).transform(X)
    pickled_transformer = pickle.dumps(transformer)
    unpickled_transformer = pickle.loads(pickled_transformer)
    pickled_X_pred = unpickled_transformer.transform(X)

    assert_array_almost_equal(pickled_X_pred, X_pred)
def check_transformer(name, Transformer):
    if name in ('CCA', 'LocallyLinearEmbedding', 'KernelPCA') and _is_32bit():
        # Those transformers yield non-deterministic output when executed on
        # a 32bit Python. The same transformers are stable on 64bit Python.
        # FIXME: try to isolate a minimalistic reproduction case only depending
        # on numpy & scipy and/or maybe generate a test dataset that does not
        # cause such unstable behaviors.
        msg = name + ' is non deterministic on 32bit Python'
        raise SkipTest(msg)

    X, y = make_blobs(n_samples=30,
                      centers=[[0, 0, 0], [1, 1, 1]],
                      random_state=0,
                      n_features=2,
                      cluster_std=0.1)
    X = StandardScaler().fit_transform(X)
    X -= X.min()
    _check_transformer(name, Transformer, X, y)
    _check_transformer(name, Transformer, X.tolist(), y.tolist())
Example #24
def standardize(array, name):
    """Recieves a dataFrame or Series (from pandas) and returns a numpy array with zero mean and unit variance."""
    # Transform to numpy array
    nparray = array.as_matrix().reshape(array.shape[0],1).astype('float32')
    print('------------')
    print(name)
    print('Different values before:', np.unique(nparray).shape[0])

    # Standardize the data
    nparray = StandardScaler().fit_transform(nparray)

    # Print some information
    print('Mean:', nparray.mean())
    print('Max:', nparray.max())
    print('Min:', nparray.min())
    print('Std:', nparray.std())
    print('Different values after:', np.unique(nparray).shape[0])

    return nparray
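A hypothetical usage sketch for standardize; the Series below is made up, and the function is assumed to be importable together with its numpy/StandardScaler dependencies.

import pandas as pd

prices = pd.Series([10.0, 12.5, 9.0, 11.0])
scaled = standardize(prices, 'price')
# `scaled` is an (n, 1) float32 array with (approximately) zero mean and unit variance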
Example #25
def test_transformers():
    # test if transformers do something sensible on training set
    # also test all shapes / shape errors
    transformers = all_estimators(type_filter='transformer')
    X, y = make_blobs(n_samples=30,
                      centers=[[0, 0, 0], [1, 1, 1]],
                      random_state=0,
                      n_features=2,
                      cluster_std=0.1)
    n_samples, n_features = X.shape
    X = StandardScaler().fit_transform(X)
    X -= X.min()

    succeeded = True

    for name, Trans in transformers:
        trans = None

        if name in dont_test:
            continue
        # these don't actually fit the data:
        if Trans in [AdditiveChi2Sampler, Binarizer, Normalizer]:
            continue
        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            trans = Trans()
        set_random_state(trans)
        if hasattr(trans, 'compute_importances'):
            trans.compute_importances = True

        if Trans is SelectKBest:
            # SelectKBest has a default of k=10
            # which is more features than we have.
            trans.k = 1
        elif Trans in [GaussianRandomProjection, SparseRandomProjection]:
            # Due to the jl lemma and very few samples, the number
            # of components of the random matrix projection will be greater
            # than the number of features.
            # So we impose a smaller number (avoid "auto" mode)
            trans.n_components = 1

        # fit

        if Trans in (_PLS, PLSCanonical, PLSRegression, CCA, PLSSVD):
            random_state = np.random.RandomState(seed=12345)
            y_ = np.vstack([y, 2 * y + random_state.randint(2, size=len(y))])
            y_ = y_.T
        else:
            y_ = y

        try:
            trans.fit(X, y_)
            X_pred = trans.fit_transform(X, y=y_)
            if isinstance(X_pred, tuple):
                for x_pred in X_pred:
                    assert_equal(x_pred.shape[0], n_samples)
            else:
                assert_equal(X_pred.shape[0], n_samples)
        except Exception as e:
            print(trans)
            print(e)
            print()
            succeeded = False
            continue

        if hasattr(trans, 'transform'):
            if Trans in (_PLS, PLSCanonical, PLSRegression, CCA, PLSSVD):
                X_pred2 = trans.transform(X, y_)
                X_pred3 = trans.fit_transform(X, y=y_)
            else:
                X_pred2 = trans.transform(X)
                X_pred3 = trans.fit_transform(X, y=y_)
            if isinstance(X_pred, tuple) and isinstance(X_pred2, tuple):
                for x_pred, x_pred2, x_pred3 in zip(X_pred, X_pred2, X_pred3):
                    assert_array_almost_equal(
                        x_pred, x_pred2, 2,
                        "fit_transform not correct in %s" % Trans)
                    assert_array_almost_equal(
                        x_pred3, x_pred2, 2,
                        "fit_transform not correct in %s" % Trans)
            else:
                assert_array_almost_equal(
                    X_pred, X_pred2, 2,
                    "fit_transform not correct in %s" % Trans)
                assert_array_almost_equal(
                    X_pred3, X_pred2, 2,
                    "fit_transform not correct in %s" % Trans)

            # raises error on malformed input for transform
            assert_raises(ValueError, trans.transform, X.T)
    assert_true(succeeded)
Example #26
def test_transformers():
    # test if transformers do something sensible on training set
    # also test all shapes / shape errors
    estimators = all_estimators()
    transformers = [(name, E) for name, E in estimators
                    if issubclass(E, TransformerMixin)]
    X, y = make_blobs(n_samples=30,
                      centers=[[0, 0, 0], [1, 1, 1]],
                      random_state=0,
                      n_features=2,
                      cluster_std=0.1)
    n_samples, n_features = X.shape
    X = StandardScaler().fit_transform(X)
    X -= X.min()

    succeeded = True

    for name, Trans in transformers:
        if Trans in dont_test or Trans in meta_estimators:
            continue
        # these don't actually fit the data:
        if Trans in [AdditiveChi2Sampler, Binarizer, Normalizer]:
            continue
        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            trans = Trans()

        if hasattr(trans, 'compute_importances'):
            trans.compute_importances = True

        if Trans is SelectKBest:
            # SelectKBest has a default of k=10
            # which is more features than we have.
            trans.k = 1

        # fit

        if Trans in (_PLS, PLSCanonical, PLSRegression, CCA, PLSSVD):
            y_ = np.vstack([y, 2 * y + np.random.randint(2, size=len(y))])
            y_ = y_.T
        else:
            y_ = y

        try:
            trans.fit(X, y_)
            X_pred = trans.fit_transform(X, y=y_)
            if isinstance(X_pred, tuple):
                for x_pred in X_pred:
                    assert_equal(x_pred.shape[0], n_samples)
            else:
                assert_equal(X_pred.shape[0], n_samples)
        except Exception as e:
            print(trans)
            print(e)
            print()
            succeeded = False

        if hasattr(trans, 'transform'):
            if Trans in (_PLS, PLSCanonical, PLSRegression, CCA, PLSSVD):
                X_pred2 = trans.transform(X, y_)
            else:
                X_pred2 = trans.transform(X)
            if isinstance(X_pred, tuple) and isinstance(X_pred2, tuple):
                for x_pred, x_pred2 in zip(X_pred, X_pred2):
                    assert_array_almost_equal(
                        x_pred, x_pred2, 2,
                        "fit_transform not correct in %s" % Trans)
            else:
                assert_array_almost_equal(
                    X_pred, X_pred2, 2,
                    "fit_transform not correct in %s" % Trans)

            # raises error on malformed input for transform
            assert_raises(ValueError, trans.transform, X.T)
    assert_true(succeeded)
Example #27
def test_transformers():
    # test if transformers do something sensible on training set
    # also test all shapes / shape errors
    estimators = all_estimators()
    transformers = [(name, E) for name, E in estimators if issubclass(E,
        TransformerMixin)]
    X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
            random_state=0, n_features=2, cluster_std=0.1)
    n_samples, n_features = X.shape
    X = StandardScaler().fit_transform(X)
    X -= X.min()

    succeeded = True

    for name, Trans in transformers:
        if Trans in dont_test or Trans in meta_estimators:
            continue
        # these don't actually fit the data:
        if Trans in [AdditiveChi2Sampler, Binarizer, Normalizer]:
            continue
        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            trans = Trans()
        set_random_state(trans)
        if hasattr(trans, 'compute_importances'):
            trans.compute_importances = True

        if Trans is SelectKBest:
            # SelectKBest has a default of k=10
            # which is more features than we have.
            trans.k = 1

        # fit

        if Trans in (_PLS, PLSCanonical, PLSRegression, CCA, PLSSVD):
            y_ = np.vstack([y, 2 * y + np.random.randint(2, size=len(y))])
            y_ = y_.T
        else:
            y_ = y

        try:
            trans.fit(X, y_)
            X_pred = trans.fit_transform(X, y=y_)
            if isinstance(X_pred, tuple):
                for x_pred in X_pred:
                    assert_equal(x_pred.shape[0], n_samples)
            else:
                assert_equal(X_pred.shape[0], n_samples)
        except Exception as e:
            print(trans)
            print(e)
            print()
            succeeded = False
            continue

        if hasattr(trans, 'transform'):
            if Trans in (_PLS, PLSCanonical, PLSRegression, CCA, PLSSVD):
                X_pred2 = trans.transform(X, y_)
            else:
                X_pred2 = trans.transform(X)
            if isinstance(X_pred, tuple) and isinstance(X_pred2, tuple):
                for x_pred, x_pred2 in zip(X_pred, X_pred2):
                    assert_array_almost_equal(x_pred, x_pred2, 2,
                        "fit_transform not correct in %s" % Trans)
            else:
                assert_array_almost_equal(X_pred, X_pred2, 2,
                    "fit_transform not correct in %s" % Trans)

            # raises error on malformed input for transform
            assert_raises(ValueError, trans.transform, X.T)
    assert_true(succeeded)
        final_df.loc[indicesToKeep, "LDA 2"],
        c=color,
        s=50,
    )
ax.legend(targets)
ax.grid()
# -

# <a name="3-3"></a>
# ## 3.3 Non-Negative Matrix Factorization (NMF)
# Source: [Scikit-learn](https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.NMF.html)
#
# Find two non-negative matrices (W, H) whose product approximates the non-negative matrix X. This factorization can be used, for example, for dimensionality reduction, source separation or topic extraction.

print(X.max())
print(X.min())

# In order to use NMF, our data cannot contain negative values. For that reason, we will use `MinMaxScaler` from sklearn, which scales the data to a given range, for example (0, 1).
#
# `MinMaxScaler` is equivalent to the code below:
#
# ```python
# X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
# X_scaled = X_std * (max - min) + min
# ```

# +
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(X)
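# A minimal sketch of the next step, assuming `X` is the feature matrix scaled
# above and an arbitrary, illustrative choice of 2 components:

X_scaled = scaler.transform(X)                 # all values now lie in [0, 1]

from sklearn.decomposition import NMF

nmf = NMF(n_components=2, init='nndsvda', random_state=0, max_iter=500)
W = nmf.fit_transform(X_scaled)                # samples x components
H = nmf.components_                            # components x features
print(W.shape, H.shape)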
    bad_inds = np.where(totalPVs > bad_perc)[0]
    bad_inds1 = np.where(totalPVs <= 5)[0]
    bad_inds = np.union1d(bad_inds, bad_inds1)

    very_active_inds = np.setdiff1d(va_inds, bad_inds)
    print(va_inds.shape, bad_inds.shape, very_active_inds.shape)

    featMatrix = featMatrix[very_active_inds, :]
    print('Teenagers', featMatrix.sum(axis=0))


    featMatrixNormalized = Normalizer(norm='l2').fit_transform(featMatrix)
    featMatrixSTD = StandardScaler().fit_transform(featMatrix)
    featMatrixSTD = featMatrixSTD  # +np.abs(featMatrixSTD.min())+1.e-15
    print(featMatrixSTD.min())
    # featMatrix = RobustScaler(with_centering=False).fit_transform(featMatrix)

    nmfTrf = TruncatedSVD(n_components=10)
    nmfFeats = nmfTrf.fit_transform(featMatrixSTD)
    dfTest = paDataFrame(featMatrixSTD[:, :10])

    corr = np.dot(featMatrix, featMatrix.T)
    print(corr.shape)

    bandwidth = estimate_bandwidth(featMatrix, quantile=0.2, n_samples=500)
    ms = MeanShift(bandwidth=bandwidth * 0.7, bin_seeding=True)
    print('bandwidth', bandwidth)
    labels = ms.fit_predict(featMatrix)

Example #30
    QuadraticDiscriminantAnalysis()
]

X = data_pca_tsne

figure = plt.figure(figsize=(27, 9))
i = 1
# iterate over datasets

# preprocess dataset, split into training and test part

X = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=.4, random_state=42)

# mesh bounds come from the two feature columns, not from the labels
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# just plot the dataset first
cm_bright = ListedColormap(['#FF0000', '#0000FF'])
ax = plt.subplot(len(datasets), len(classifiers) + 1, i)

ax.set_title("Input data")
# Plot the training points
ax.scatter(X_train[:, 0],
           X_train[:, 1],
           c=y_train,
           cmap=cm_bright,
           edgecolors='k')
# and testing points
Example #31
def residual_plot(model, X, Y):
    """This function plots residual-plot for a regressor.
	
	X, y : np.ndarray
	model : estimator object. Should have 'fit' and 'predict' methods.
	"""
    x_train, x_test, y_train, y_test = split_data(X, Y)
    model.fit(x_train, y_train)
    y_pred_train = model.predict(x_train)
    y_pred_test = model.predict(x_test)
    res_train = y_train - y_pred_train
    res_test = y_test - y_pred_test
    
    fig, [ax0, ax1] = plt.subplots(2, 1, figsize=(14, 10))
    tableau20 = [(31, 119, 180), (174, 199, 232), (255, 127, 14), (255, 187, 120),
                 (44, 160, 44), (152, 223, 138), (214, 39, 40), (255, 152, 150),
                 (148, 103, 189), (197, 176, 213), (140, 86, 75), (196, 156, 148),
                 (227, 119, 194), (247, 182, 210), (127, 127, 127), (199, 199, 199),
                 (188, 189, 34), (219, 219, 141), (23, 190, 207), (158, 218, 229)]
    tableau20 = [(i[0]/255., i[1]/255., i[2]/255.) for i in tableau20]
    # %matplotlib inline  (IPython magic; not valid inside a function body)
    
    # definitions for the axes
    left, width = 0.1, 0.65
    bottom, height = 0.1, 0.65
    bottom_h = left_h = left + width + 0.02
    rect_scatter = [left, bottom, width, height]
    rect_histx = [left, bottom_h, width, 0.2]
    rect_histy = [left_h, bottom, 0.2, height]

    ################################
    # Plot res-plot for training set
    x = StandardScaler().fit_transform(y_pred_train.reshape(-1, 1))
    y = StandardScaler().fit_transform(res_train.reshape(-1, 1))
    fig1 = plt.figure(figsize=(14, 10))
    fig1.suptitle('Residual plot for training set')
    
    # start with a rectangular Figure
    axScatter = plt.axes(rect_scatter)
    axHistx = plt.axes(rect_histx)
    axHisty = plt.axes(rect_histy)
    
    # the scatter plot:
    axScatter.scatter(x, y, color=tableau20[0], alpha=0.5)
    
    # now determine nice limits by hand:
    n_bins = 100

    x_limp = x.max() + x.std()
    x_limn = x.min() - x.std()
    y_limp = y.max() + y.std()
    y_limn = y.min() - y.std()

    axScatter.set_xlim((x_limn, x_limp))
    axScatter.set_ylim((y_limn, y_limp))
    axScatter.set_xlabel('Estimated output')
    axScatter.set_ylabel('Residuals')

    axHistx.hist(x, bins=n_bins, color=tableau20[1], alpha=0.75)
    axHisty.hist(y, bins=n_bins, orientation='horizontal', color=tableau20[2], alpha=0.75)

    axHistx.set_xlim(axScatter.get_xlim())
    axHisty.set_ylim(axScatter.get_ylim())
    
    
    ################################
    # Plot res-plot for testing set
    x = StandardScaler().fit_transform(y_pred_test.reshape(-1, 1))
    y = StandardScaler().fit_transform(res_test.reshape(-1, 1))
    fig2 = plt.figure(figsize=(14, 10))
    fig2.suptitle('Residual plot for testing set')
    
    # start with a rectangular Figure
    axScatter = plt.axes(rect_scatter)
    axHistx = plt.axes(rect_histx)
    axHisty = plt.axes(rect_histy)
    
    # the scatter plot:
    axScatter.scatter(x, y, color=tableau20[0], alpha=0.5)
    
    # now determine nice limits by hand:
    n_bins = 100

    x_limp = x.max() + x.std()
    x_limn = x.min() - x.std()
    y_limp = y.max() + y.std()
    y_limn = y.min() - y.std()

    axScatter.set_xlim((x_limn, x_limp))
    axScatter.set_ylim((y_limn, y_limp))
    axScatter.set_xlabel('Estimated output')
    axScatter.set_ylabel('Residuals')

    axHistx.hist(x, bins=n_bins, color=tableau20[1], alpha=0.75)
    axHisty.hist(y, bins=n_bins, orientation='horizontal', color=tableau20[2], alpha=0.75)

    axHistx.set_xlim(axScatter.get_xlim())
    axHisty.set_ylim(axScatter.get_ylim())
    plt.show()
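A hypothetical usage sketch for residual_plot; it assumes the split_data helper used inside the function is available in the same module, and the regressor and synthetic data below are made up for illustration.

import numpy as np
from sklearn.linear_model import LinearRegression

rng = np.random.RandomState(0)
X_demo = rng.rand(200, 1)
Y_demo = 3 * X_demo.ravel() + rng.normal(scale=0.1, size=200)
# Draws one residual plot for the training split and one for the test split
residual_plot(LinearRegression(), X_demo, Y_demo)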
Example #32
def test_transformers():
    # test if transformers do something sensible on training set
    # also test all shapes / shape errors
    transformers = all_estimators(type_filter='transformer')
    X, y = make_blobs(n_samples=30,
                      centers=[[0, 0, 0], [1, 1, 1]],
                      random_state=0,
                      n_features=2,
                      cluster_std=0.1)
    n_samples, n_features = X.shape
    X = StandardScaler().fit_transform(X)
    X -= X.min()

    succeeded = True

    for name, Transformer in transformers:
        if name in dont_test:
            continue
        # these don't actually fit the data:
        if name in ['AdditiveChi2Sampler', 'Binarizer', 'Normalizer']:
            continue
        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            transformer = Transformer()
        set_random_state(transformer)
        if hasattr(transformer, 'compute_importances'):
            transformer.compute_importances = True

        if name == 'SelectKBest':
            # SelectKBest has a default of k=10
            # which is more features than we have.
            transformer.k = 1
        elif name in ['GaussianRandomProjection', 'SparseRandomProjection']:
            # Due to the jl lemma and very few samples, the number
            # of components of the random matrix projection will be greater
            # than the number of features.
            # So we impose a smaller number (avoid "auto" mode)
            transformer.n_components = 1
        elif name == "MiniBatchDictionaryLearning":
            transformer.set_params(n_iter=5)  # default = 1000

        elif name == "KernelPCA":
            transformer.remove_zero_eig = False

        # fit

        if name in ('PLSCanonical', 'PLSRegression', 'CCA', 'PLSSVD'):
            y_ = np.c_[y, y]
            y_[::2, 1] *= 2
        else:
            y_ = y

        try:
            transformer.fit(X, y_)
            X_pred = transformer.fit_transform(X, y=y_)
            if isinstance(X_pred, tuple):
                for x_pred in X_pred:
                    assert_equal(x_pred.shape[0], n_samples)
            else:
                assert_equal(X_pred.shape[0], n_samples)
        except Exception as e:
            print(transformer)
            print(e)
            print()
            succeeded = False
            continue

        if hasattr(transformer, 'transform'):
            if name in ('PLSCanonical', 'PLSRegression', 'CCA', 'PLSSVD'):
                X_pred2 = transformer.transform(X, y_)
                X_pred3 = transformer.fit_transform(X, y=y_)
            else:
                X_pred2 = transformer.transform(X)
                X_pred3 = transformer.fit_transform(X, y=y_)
            if isinstance(X_pred, tuple) and isinstance(X_pred2, tuple):
                for x_pred, x_pred2, x_pred3 in zip(X_pred, X_pred2, X_pred3):
                    assert_array_almost_equal(
                        x_pred, x_pred2, 2,
                        "fit_transform not correct in %s" % Transformer)
                    assert_array_almost_equal(
                        x_pred3, x_pred2, 2,
                        "fit_transform not correct in %s" % Transformer)
            else:
                assert_array_almost_equal(
                    X_pred, X_pred2, 2,
                    "fit_transform not correct in %s" % Transformer)
                assert_array_almost_equal(
                    X_pred3, X_pred2, 2,
                    "fit_transform not correct in %s" % Transformer)

            # raises error on malformed input for transform
            assert_raises(ValueError, transformer.transform, X.T)
    assert_true(succeeded)
Example #33
def heatmap(x, row_header, column_header, row_method, column_method, row_metric, column_metric, color_gradient, html_folder):
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        
        """
        The code below is based in large part on the prototype methods:
        http://old.nabble.com/How-to-plot-heatmap-with-matplotlib--td32534593.html
        http://stackoverflow.com/questions/7664826/how-to-get-flat-clustering-corresponding-to-color-clusters-in-the-dendrogram-cre
        x is an m by n ndarray, m observations, n genes
        """
        
        ### Define the color gradient to use based on the provided name
        n = len(x[0]); m = len(x)
        if color_gradient == 'red_white_blue':
            cmap=plt.cm.bwr
        if color_gradient == 'red_black_sky':
            cmap=RedBlackSkyBlue()
        if color_gradient == 'red_black_blue':
            cmap=RedBlackBlue()
        if color_gradient == 'red_black_green':
            cmap=RedBlackGreen()
        if color_gradient == 'yellow_black_blue':
            cmap=YellowBlackBlue()
        if color_gradient == 'seismic':
            cmap=plt.cm.seismic
        if color_gradient == 'green_white_purple':
            cmap=plt.cm.PiYG_r
        if color_gradient == 'coolwarm':
            cmap=plt.cm.coolwarm

        ### Scale the max and min colors so that 0 is white/black
        x = StandardScaler().fit_transform(x)
        vmin = x.min()
        vmax = x.max()
        vmax = max([vmax,abs(vmin)])
        vmin = vmax*-1
        norm = mpl.colors.Normalize(vmin, vmax) ### adjust the max and min to scale these colors

        ### Scale the Matplotlib window size
        default_window_height = 7
        default_window_width = 11
        fig = plt.figure(figsize=(default_window_width,default_window_height)) ### could use m,n to scale here
        color_bar_w = 0.015 ### Sufficient size to show
            
        ## calculate positions for all elements
        # ax1, placement of dendrogram 1, on the left of the heatmap
        [ax1_x, ax1_y, ax1_w, ax1_h] = [0.05,0.22,0.2,0.6]   ### The second value controls the position of the matrix relative to the bottom of the view
        width_between_ax1_axr = -0.004
        height_between_ax1_axc = -0.004 ### distance between the top color bar axis and the matrix
        
        # axr, placement of row side colorbar
        [axr_x, axr_y, axr_w, axr_h] = [0.31,0.1,color_bar_w,0.6] ### second to last controls the width of the side color bar - 0.015 when showing
        axr_x = ax1_x + ax1_w + width_between_ax1_axr
        axr_y = ax1_y; axr_h = ax1_h
        width_between_axr_axm = -0.004

        # axc, placement of column side colorbar
        [axc_x, axc_y, axc_w, axc_h] = [0.4,0.63,0.5,color_bar_w] ### last one controls the height of the top color bar - 0.015 when showing
        axc_x = axr_x + axr_w + width_between_axr_axm
        axc_y = ax1_y + ax1_h + height_between_ax1_axc
        height_between_axc_ax2 = -0.004

        # axm, placement of heatmap for the data matrix
        [axm_x, axm_y, axm_w, axm_h] = [0.4,0.9,2.5,0.5]
        axm_x = axr_x + axr_w + width_between_axr_axm
        axm_y = ax1_y; axm_h = ax1_h
        axm_w = axc_w

        # ax2, placement of dendrogram 2, on the top of the heatmap
        [ax2_x, ax2_y, ax2_w, ax2_h] = [0.3,0.72,0.6,0.15] ### last one controls height of the dendrogram
        ax2_x = axr_x + axr_w + width_between_axr_axm
        ax2_y = ax1_y + ax1_h + height_between_ax1_axc + axc_h + height_between_axc_ax2
        ax2_w = axc_w

        # axcb - placement of the color legend
        [axcb_x, axcb_y, axcb_w, axcb_h] = [0.07,0.88,0.18,0.07]

        # Compute and plot top dendrogram
        if column_method != None:
            d2 = dist.pdist(x.T)
            D2 = dist.squareform(d2)
            ax2 = fig.add_axes([ax2_x, ax2_y, ax2_w, ax2_h], frame_on=False)
            Y2 = sch.linkage(D2, method=column_method, metric=column_metric) ### array-clustering metric - 'average', 'single', 'centroid', 'complete'
            Z2 = sch.dendrogram(Y2)
            ind2 = sch.fcluster(Y2,0.7*max(Y2[:,2]),'distance') ### This is the default behavior of dendrogram
            ax2.set_xticks([]) ### Hides ticks
            ax2.set_yticks([])
        else:
            ind2 = ['NA']*len(column_header) ### Used for exporting the flat cluster data
            
        # Compute and plot left dendrogram.
        if row_method != None:
            d1 = dist.pdist(x)
            D1 = dist.squareform(d1)  # full matrix
            ax1 = fig.add_axes([ax1_x+0.005, ax1_y, ax1_w, ax1_h], frame_on=False) # frame_on may be False
            Y1 = sch.linkage(D1, method=row_method, metric=row_metric) ### gene-clustering metric - 'average', 'single', 'centroid', 'complete'
            Z1 = sch.dendrogram(Y1, orientation='right')
            ind1 = sch.fcluster(Y1,0.7*max(Y1[:,2]),'distance') ### This is the default behavior of dendrogram
            ax1.set_xticks([]) ### Hides ticks
            ax1.set_yticks([])
        else:
            ind1 = ['NA']*len(row_header) ### Used for exporting the flat cluster data
            
        # Plot distance matrix.
        axm = fig.add_axes([axm_x, axm_y, axm_w, axm_h])  # axes for the data matrix
        xt = x
        if column_method != None:
            idx2 = Z2['leaves'] ### apply the clustering for the array-dendrograms to the actual matrix data
            xt = xt[:,idx2]
            # ind2 = ind2[:,idx2] ### reorder the flat cluster to match the order of the leaves the dendrogram
        if row_method != None:
            idx1 = Z1['leaves'] ### apply the clustering for the gene-dendrograms to the actual matrix data
            xt = xt[idx1,:]   # xt is transformed x
            # ind1 = ind1[idx1,:] ### reorder the flat cluster to match the order of the leaves the dendrogram
        ### taken from http://stackoverflow.com/questions/2982929/plotting-results-of-hierarchical-clustering-ontop-of-a-matrix-of-data-in-python/3011894#3011894
        im = axm.matshow(xt, aspect='auto', origin='lower', cmap=cmap, norm=norm) ### norm=norm added to scale coloring of expression with zero = white or black
        axm.set_xticks([]) ### Hides x-ticks
        axm.set_yticks([])

        # Add text
        new_row_header=[]
        new_column_header=[]
        for i in range(x.shape[0]):
            if row_method != None:
                if len(row_header)<200: ### Don't visualize gene associations when more than 200 rows
                    if len(row_header) < 20:
                        fontsize=15
                    else:
                        fontsize=200/len(row_header)
                    axm.text(x.shape[1]-0.5, i-0.1, '  '+row_header[idx1[i]],fontsize=fontsize)
                new_row_header.append(row_header[idx1[i]])
            else:
                if len(row_header)<200: ### Don't visualize gene associations when more than 200 rows
                    if len(row_header) < 20:
                        fontsize=8
                    else:
                        fontsize=200/len(row_header)
                    axm.text(x.shape[1]-0.5, i-0.1, '  '+row_header[i],fontsize=fontsize) ### When not clustering rows
                new_row_header.append(row_header[i])
        for i in range(x.shape[1]):
            if column_method != None:
                axm.text(i, -0.55, ' '+column_header[idx2[i]], rotation=315, verticalalignment="top") # rotation could also be degrees
                new_column_header.append(column_header[idx2[i]])
            else: ### When not clustering columns
                axm.text(i, -0.55, ' '+column_header[i], rotation=315, verticalalignment="top")
                new_column_header.append(column_header[i])

        # Plot colside colors
        # axc --> axes for column side colorbar
        # if column_method != None:
        #     axc = fig.add_axes([axc_x, axc_y, axc_w, axc_h])  # axes for column side colorbar
        #     cmap_c = mpl.colors.ListedColormap(['r', 'g', 'b', 'y', 'w', 'k', 'm'])
        #     dc = np.array(ind2, dtype=int)
        #     dc.shape = (1,len(ind2)) 
        #     im_c = axc.matshow(dc, aspect='auto', origin='lower', cmap=cmap_c)
        #     axc.set_xticks([]) ### Hides ticks
        #     axc.set_yticks([])
        
        # Plot rowside colors
        # axr --> axes for row side colorbar
        # if row_method != None:
        #     axr = fig.add_axes([axr_x+0.005, axr_y, axr_w-0.005, axr_h])  # axes for column side colorbar
        #     dr = np.array(ind1, dtype=int)
        #     dr.shape = (len(ind1),1)
        #     cmap_r = mpl.colors.ListedColormap(['r', 'g', 'b', 'y', 'w', 'k', 'm'])
        #     im_r = axr.matshow(dr, aspect='auto', origin='lower', cmap=cmap_r)
        #     axr.set_xticks([]) ### Hides ticks
        #     axr.set_yticks([])

        # Plot color legend
        axcb = fig.add_axes([axcb_x, axcb_y, axcb_w, axcb_h], frame_on=False)  # axes for colorbar
        cb = mpl.colorbar.ColorbarBase(axcb, cmap=cmap, norm=norm, orientation='horizontal')
        cb.set_ticks([vmin,0,vmax])
        axcb.set_title("Normalized Expression")

        #Save figures
        plt.savefig(os.path.join(html_folder,'Heatmap.png'), dpi=300)
        plt.savefig(os.path.join(html_folder,'Heatmap.svg'))

        #Create html output file
        html_file = """<!DOCTYPE html>
<html>
    <head>
        <title>Heatmap</title>
        <style>
        * {
            font-family:Arial, Helvetica, sans-serif;
            }
        </style>
    </head>
    <body style="width:100%">
        <img style="width:100%" src="Heatmap.png" alt="Heatmap">
        <a href="index.html"><b>Back</b></a>
    </body>
</html>
"""
        with open(os.path.join(html_folder,'Heatmap.html'),'w') as F:
            F.write(html_file)
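A hypothetical call sketch for heatmap, clustering both rows and columns of a small random matrix with average linkage and euclidean distances; the data and output folder are made up, and the module-level imports the function relies on (matplotlib, scipy, StandardScaler, os) are assumed to be present.

import numpy as np

expr = np.random.RandomState(0).rand(8, 5)
heatmap(expr,
        row_header=['gene%d' % i for i in range(8)],
        column_header=['sample%d' % j for j in range(5)],
        row_method='average', column_method='average',
        row_metric='euclidean', column_metric='euclidean',
        color_gradient='seismic', html_folder='.')
# Writes Heatmap.png, Heatmap.svg and Heatmap.html into the chosen folder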
    new_rows_list = []
    for row in csv_reader:
        row[1] = labels[i]
        i = i + 1
        new_rows_list.append(row)
#for data in new_rows_list:
#  data[i][1]=''.join(labels[i])
# i = i+1
print(new_rows_list)
with open('Crops_MIR.csv', 'w', newline='') as write_csv:
    csv_writer = csv.writer(write_csv)
    csv_writer.writerows(new_rows_list)
cols = ['Name', 'Labels', 'Water Require', 'Temp', 'Moisture', 'Production']
data = pd.read_csv(r'Crops_MIR.csv', names=cols)
y = data['Labels']
X_norm = (X - X.min()) / (X.max() - X.min())
lda = LDA(n_components=1)
lda_transformed = pd.DataFrame(lda.fit_transform(X_norm, y))
#print(lda_transformed)
for i in range(3):
    plt.scatter(lda_transformed[y == i],
                data[y == i]['Water Require'],
                color=colmap[i])
plt.show()
#min_required = min(data[y==2]['Water Require'])

#Get Current Data from Farm to predict list of next possible crops
blob = bucket.get_blob("Current_Data.csv")
blob.download_to_filename("currentcropdata.csv")
cols1 = ['Humidity', 'Temperature', 'Distance', 'Moisture']
current_data = pd.read_csv(r'currentcropdata.csv', names=cols1)  # read the file downloaded above
Example #35
print('Importing and processing data.')
df = pd.read_table('u.data',
                   names=col_names,
                   usecols=col_names[0:3],
                   dtype=np.int32)

# Process ratings and save to file
user_ratings = np.zeros([N, M])
for i in range(0, N):  # foreach user in dataset
    # foreach rating of a unique user, centre and normalise data
    u = df.loc[df['user id'] == (i + 1)]
    temp = np.array([k for j, k in zip(u['movie id'], u['rating'])])
    temp = temp.reshape(-1, 1)
    temp = StandardScaler(with_std=False).fit_transform(X=temp)
    temp = temp.reshape(len(temp))
    min_r = temp.min()
    max_r = temp.max()
    x = 0
    for j, k in zip(u['movie id'], u['rating']):
        # store (existing) ratings in array row, filling empty cells with 0
        user_ratings[i, (j - 1)] = np.interp(temp[x], [min_r, max_r], [0, 1])
        x += 1
np.save('user_ratings.npy', user_ratings)
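# Worked micro-example of the centre-then-rescale step above, using a made-up
# set of three ratings:
#   ratings [1, 3, 5] -> mean-centred [-2, 0, 2] -> np.interp to [0, 0.5, 1]
demo = np.array([1., 3., 5.]).reshape(-1, 1)
demo_centred = StandardScaler(with_std=False).fit_transform(demo).ravel()
demo_scaled = np.interp(demo_centred,
                        [demo_centred.min(), demo_centred.max()], [0, 1])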
df.drop(columns=col_names[0:3])

# Process user ids
# one-hot encoded set of ids equals NxN identity matrix
user_ids = np.identity(N, dtype=np.int32)
np.save('user_ids.npy', user_ids)

# Split train data into 5 folds and save results to file
Example #36
def test_transformers():
    # test if transformers do something sensible on training set
    # also test all shapes / shape errors
    transformers = all_estimators(type_filter='transformer')
    X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
                      random_state=0, n_features=2, cluster_std=0.1)
    n_samples, n_features = X.shape
    X = StandardScaler().fit_transform(X)
    X -= X.min()

    succeeded = True

    for name, Trans in transformers:
        trans = None

        if Trans in dont_test:
            continue
        # these don't actually fit the data:
        if Trans in [AdditiveChi2Sampler, Binarizer, Normalizer]:
            continue
        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            trans = Trans()
        set_random_state(trans)
        if hasattr(trans, 'compute_importances'):
            trans.compute_importances = True

        if Trans is SelectKBest:
            # SelectKBest has a default of k=10
            # which is more features than we have.
            trans.k = 1
        elif Trans in [GaussianRandomProjection,
                       SparseRandomProjection]:
            # Due to the jl lemma and very few samples, the number
            # of components of the random matrix projection will be greater
            # than the number of features.
            # So we impose a smaller number (avoid "auto" mode)
            trans.n_components = 1

        # fit

        if Trans in (_PLS, PLSCanonical, PLSRegression, CCA, PLSSVD):
            random_state = np.random.RandomState(seed=12345)
            y_ = np.vstack([y, 2 * y + random_state.randint(2, size=len(y))])
            y_ = y_.T
        else:
            y_ = y

        try:
            trans.fit(X, y_)
            X_pred = trans.fit_transform(X, y=y_)
            if isinstance(X_pred, tuple):
                for x_pred in X_pred:
                    assert_equal(x_pred.shape[0], n_samples)
            else:
                assert_equal(X_pred.shape[0], n_samples)
        except Exception as e:
            print(trans)
            print(e)
            print()
            succeeded = False
            continue

        if hasattr(trans, 'transform'):
            if Trans in (_PLS, PLSCanonical, PLSRegression, CCA, PLSSVD):
                X_pred2 = trans.transform(X, y_)
            else:
                X_pred2 = trans.transform(X)
            if isinstance(X_pred, tuple) and isinstance(X_pred2, tuple):
                for x_pred, x_pred2 in zip(X_pred, X_pred2):
                    assert_array_almost_equal(
                        x_pred, x_pred2, 2,
                        "fit_transform not correct in %s" % Trans)
            else:
                assert_array_almost_equal(
                    X_pred, X_pred2, 2,
                    "fit_transform not correct in %s" % Trans)

            # raises error on malformed input for transform
            assert_raises(ValueError, trans.transform, X.T)
    assert_true(succeeded)
Example #37
                 ])

print(df.head())
print(df.describe())

from sklearn.preprocessing import StandardScaler
features = ['sepal length', 'sepal width', 'petal length', 'petal width']
# Separating out the features
x = df.loc[:, features].values
# Separating out the target
y = df.loc[:, ['target']].values  # Access column by name
# Standardizing the features
x = StandardScaler().fit_transform(x)

print(x[:5, ])
print(x.min(axis=0))
print(x.max(axis=0))

#####
# PCA Projection to 2D
#####

from sklearn.decomposition import PCA
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(x)
principalDf = pd.DataFrame(
    data=principalComponents,
    columns=['principal component 1', 'principal component 2'])

finalDf = pd.concat([principalDf, df[['target']]], axis=1)
print(finalDf.head())
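# A quick check one might add here: how much of the original variance the two
# principal components retain.
print(pca.explained_variance_ratio_)        # variance fraction per component
print(pca.explained_variance_ratio_.sum())  # total variance kept by the 2-D projection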
def Plot_Decision_Boundaries_2D(X1,
                                X2,
                                y,
                                Estimators,
                                Test_Size=0.3,
                                Random_State=None,
                                Scale=True,
                                Colour_Map=plt.cm.coolwarm,
                                Bright_Colour_Map=plt.cm.afmhot,
                                Alpha_Train=1,
                                Alpha_Test=0.6,
                                Certainty_Threshold=None,
                                Variable_Names=("Variable1", "Variable2"),
                                Delta=0.02):
    def Return_Most_Certain_Classification_Data(X,
                                                y,
                                                Model,
                                                Certainty_Thresh=0,
                                                Fit_First=False):

        if Fit_First:
            Model = Model.fit(X, y)
        if hasattr(Model, "predict_proba"):
            probabilities = Model.predict_proba(X)
        elif hasattr(Model, "decision_function"):
            probabilities = Model.decision_function(X)
        certainty_bool = np.amax(probabilities, axis=1) > Certainty_Thresh

        certain_predictors, certain_response = X[certainty_bool], y[
            certainty_bool]
        print("Old number of samples:", len(y))
        print("New number of samples:", len(certain_response))

        return certain_predictors, certain_response

    if Certainty_Threshold is not None:
        X_Combined = np.hstack((X1.reshape(-1, 1), X2.reshape(-1, 1)))
        # Certainty filtering needs a single fitted model; if a list of
        # estimators was passed, use the first one for this filtering step.
        Filter_Estimator = (Estimators[0]
                            if isinstance(Estimators, (list, np.ndarray))
                            else Estimators)
        X, y = Return_Most_Certain_Classification_Data(
            X_Combined,
            y,
            Model=Filter_Estimator,
            Certainty_Thresh=Certainty_Threshold,
            Fit_First=True)
        X1, X2 = X[:, 0], X[:, 1]

    #Define a class bijection for class colour mapping
    unique_classes, y_bijection = np.unique(y, return_inverse=True)
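    # For example, np.unique(['b', 'a', 'b'], return_inverse=True) gives
    # (array(['a', 'b']), array([1, 0, 1])): y_bijection holds integer codes
    # that index back into unique_classes, which is what the colour maps need.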

    #Sort the data so colour labels match up with actual labels
    X1, X2 = X1.reshape((-1, 1)), X2.reshape((-1, 1))
    y_bijection = y_bijection.reshape((-1, 1))

    Full_combined = np.hstack((X1, X2, y_bijection))
    Full_combined = Full_combined[Full_combined[:, 2].argsort()]

    X1, X2 = Full_combined[:, 0].reshape((-1, 1)), Full_combined[:, 1].reshape(
        (-1, 1))
    y_bijection = Full_combined[:, 2].reshape((-1, 1))

    #Preprocess the data if needed:
    X1, X2 = StandardScaler().fit_transform(
        X1), StandardScaler().fit_transform(X2)

    delta = Delta  #Step size in the mesh

    figure = plt.figure(figsize=(12, 8))

    x1_min, x1_max = X1.min() - 0.5, X1.max() + 0.5
    x2_min, x2_max = X2.min() - 0.5, X2.max() + 0.5

    xx, yy = np.meshgrid(np.arange(x1_min, x1_max, delta),
                         np.arange(x2_min, x2_max, delta))

    #Plot the given data (colourmap)

    col_map = Colour_Map
    col_map_bright = Bright_Colour_Map

    #Ready a train test split
    Full_combined = np.hstack((X1, X2, y_bijection))

    X_train, X_test, y_train, y_test = train_test_split(
        Full_combined[:, [0, 1]],
        Full_combined[:, 2],
        test_size=Test_Size,
        random_state=Random_State)

    #Get a figure and axes based on how many estimators (1 or multiple there are)
    #Multiple estimators
    if isinstance(Estimators, (list, np.ndarray)):
        n_rows = len(Estimators)

        fig, axes = plt.subplots(nrows=n_rows,
                                 ncols=2,
                                 sharex=True,
                                 sharey=True,
                                 figsize=(12, n_rows * 4))
    #One estimator
    else:
        Estimators = np.array([Estimators])
        fig, axes = plt.subplots(1, 2, figsize=(12, 8))
        axes = np.array([axes])

    for axs, Estimator in zip(axes[:], Estimators):

        ax1, ax2 = axs[0], axs[1]

        ax1.set_title("Input Data")
        #Plot Training data
        scat = ax1.scatter(X_train[:, 0],
                           X_train[:, 1],
                           c=y_train,
                           cmap=col_map_bright,
                           edgecolors='k',
                           alpha=Alpha_Train)
        #And testing data
        ax1.scatter(X_test[:, 0],
                    X_test[:, 1],
                    c=y_test,
                    cmap=col_map_bright,
                    edgecolors='k',
                    alpha=Alpha_Test)

        ax1.set_xlim(xx.min(), xx.max())
        ax1.set_ylim(yy.min(), yy.max())

        ax1.set_xlabel(Variable_Names[0])
        ax1.set_ylabel(Variable_Names[1])

        #Now for the classifier

        model = Estimator.fit(X_train, y_train)
        score = model.score(X_test, y_test)

        #Plot the decision boundary. For that, we will assign a colour to each point
        # in the mesh [x1_min, x1_max]*[x2_min, x2_max]

        if hasattr(model, "decision_function"):
            Z = model.decision_function(np.c_[xx.ravel(), yy.ravel()])

        elif hasattr(model, "predict_proba"):
            Z = model.predict_proba(np.c_[xx.ravel(), yy.ravel()])

        else:
            # Without decision_function or predict_proba there is nothing to plot
            print("This Estimator has neither a decision_function nor a "
                  "predict_proba method; skipping it.")
            continue

        if Z.ndim == 1:
            # A binary decision_function gives one score per sample; threshold at 0
            Z = (Z > 0).astype(int)
        else:
            Z = np.argmax(Z, axis=1)
        Z_uniques = np.unique(Z)

        unique_predictions = unique_classes[Z_uniques]

        #Put the result in a colourplot

        Z = Z.reshape(xx.shape)

        contour = ax2.pcolormesh(xx,
                                 yy,
                                 Z,
                                 vmin=Z.min(),
                                 vmax=Z.max(),
                                 cmap=col_map,
                                 alpha=0.7)

        #Plot also the training data
        ax2.scatter(X_train[:, 0],
                    X_train[:, 1],
                    c=y_train,
                    cmap=col_map_bright,
                    edgecolors='k',
                    alpha=Alpha_Train)
        #And testing data
        ax2.scatter(X_test[:, 0],
                    X_test[:, 1],
                    c=y_test,
                    cmap=col_map_bright,
                    edgecolors='k',
                    alpha=Alpha_Test)

        ax2.set_xlim(xx.min(), xx.max())
        ax2.set_ylim(yy.min(), yy.max())

        ax2.set_xlabel(Variable_Names[0])
        ax2.set_ylabel(Variable_Names[1])
        ax2.set_title(str(Estimator))

        ax2.text(xx.max() - .3,
                 yy.min() + .3, ('%.2f' % score).lstrip('0'),
                 size=15,
                 horizontalalignment='right')

        cb1 = plt.colorbar(scat,
                           spacing="proportional",
                           ax=ax1,
                           ticks=np.arange(len(unique_classes)))
        cb1.ax.set_yticklabels(unique_classes)

        print("Unique Predictions: {}".format(unique_classes[Z_uniques]),
              "for: {}".format(Estimator))

        ticks = np.linspace(Z.min(), Z.max(), len(unique_predictions))

        cb2 = plt.colorbar(contour,
                           spacing="proportional",
                           ax=ax2,
                           ticks=ticks)
        cb2.ax.set_yticklabels(unique_predictions)

        #Also print the score of the model
        print("Model Score:", score, "\n")

    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    fig.suptitle("Data and Classification Boundaries", fontsize=20)

    return fig
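# A minimal usage sketch for the function above, assuming synthetic two-moons
# data and two illustrative classifiers (not the author's original setup); it
# relies on the same numpy/matplotlib/train_test_split imports as the function.
from sklearn.datasets import make_moons
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

X_demo, y_demo = make_moons(n_samples=200, noise=0.25, random_state=0)
fig_demo = Plot_Decision_Boundaries_2D(X_demo[:, 0],
                                       X_demo[:, 1],
                                       y_demo,
                                       Estimators=[LogisticRegression(),
                                                   SVC(probability=True)],
                                       Test_Size=0.3,
                                       Random_State=0,
                                       Variable_Names=("x1", "x2"))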
Example #39
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler

# For each of the two classes, compare the per-feature histogram with the
# density of a 10-component Gaussian mixture fitted to that feature
for i in range(2):
    fig, axes = plt.subplots(3, 6)
    axes = axes.ravel()
    for j in range(len(axes)):
        feature = StandardScaler().fit_transform(X_train[y_train == i,
                                                         j:j + 1])
        hist = axes[j].hist(feature,
                            bins='auto',
                            histtype='step',
                            linewidth=2,
                            density=True)
        grid = np.linspace(feature.min(), feature.max(), num=1000)
        log_density = (GaussianMixture(
            n_components=10,
            reg_covar=0.03).fit(feature).score_samples(grid[:, None]))
        gmm = axes[j].plot(grid, np.exp(log_density))
        axes[j].set_title(f'var_{j}', **title_config)
        axes[j].set_ylim([0, 1])
    fig.suptitle(f'Histogram vs Gaussian Mixture Model for Class {i}',
                 **title_config)
    fig.legend((hist[2][0], gmm[0]), ('Histogram', 'Gaussian mixture model'),
               loc='upper center',
               bbox_to_anchor=(0.5, 1),
               ncol=2,
               fontsize=14)
    fig.tight_layout()
    fig.subplots_adjust(top=0.88)
Example #40
def test_transformers():
    # test if transformers do something sensible on training set
    # also test all shapes / shape errors
    transformers = all_estimators(type_filter="transformer")
    X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]], random_state=0, n_features=2, cluster_std=0.1)
    n_samples, n_features = X.shape
    X = StandardScaler().fit_transform(X)
    X -= X.min()

    succeeded = True

    for name, Transformer in transformers:
        if name in dont_test:
            continue
        # these don't actually fit the data:
        if name in ["AdditiveChi2Sampler", "Binarizer", "Normalizer"]:
            continue
        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            transformer = Transformer()
        set_random_state(transformer)
        if hasattr(transformer, "compute_importances"):
            transformer.compute_importances = True

        if name == "SelectKBest":
            # SelectKBest has a default of k=10
            # which is more features than we have.
            transformer.k = 1
        elif name in ["GaussianRandomProjection", "SparseRandomProjection"]:
            # Due to the jl lemma and very few samples, the number
            # of components of the random matrix projection will be greater
            # than the number of features.
            # So we impose a smaller number (avoid "auto" mode)
            transformer.n_components = 1
        elif name == "MiniBatchDictionaryLearning":
            transformer.set_params(n_iter=5)  # default = 1000

        elif name == "KernelPCA":
            transformer.remove_zero_eig = False

        # fit

        if name in ("PLSCanonical", "PLSRegression", "CCA", "PLSSVD"):
            y_ = np.c_[y, y]
            y_[::2, 1] *= 2
        else:
            y_ = y

        try:
            transformer.fit(X, y_)
            X_pred = transformer.fit_transform(X, y=y_)
            if isinstance(X_pred, tuple):
                for x_pred in X_pred:
                    assert_equal(x_pred.shape[0], n_samples)
            else:
                assert_equal(X_pred.shape[0], n_samples)
        except Exception as e:
            print(transformer)
            print(e)
            print()
            succeeded = False
            continue

        if hasattr(transformer, "transform"):
            if name in ("PLSCanonical", "PLSRegression", "CCA", "PLSSVD"):
                X_pred2 = transformer.transform(X, y_)
                X_pred3 = transformer.fit_transform(X, y=y_)
            else:
                X_pred2 = transformer.transform(X)
                X_pred3 = transformer.fit_transform(X, y=y_)
            if isinstance(X_pred, tuple) and isinstance(X_pred2, tuple):
                for x_pred, x_pred2, x_pred3 in zip(X_pred, X_pred2, X_pred3):
                    assert_array_almost_equal(x_pred, x_pred2, 2, "fit_transform not correct in %s" % Transformer)
                    assert_array_almost_equal(x_pred3, x_pred2, 2, "fit_transform not correct in %s" % Transformer)
            else:
                assert_array_almost_equal(X_pred, X_pred2, 2, "fit_transform not correct in %s" % Transformer)
                assert_array_almost_equal(X_pred3, X_pred2, 2, "fit_transform not correct in %s" % Transformer)

            # raises error on malformed input for transform
            assert_raises(ValueError, transformer.transform, X.T)
    assert_true(succeeded)
    print(bad_perc, bad_perc1)

    bad_inds = np.where(totalPVs > bad_perc)[0]
    bad_inds1 = np.where(totalPVs <= 5)[0]
    bad_inds = np.union1d(bad_inds, bad_inds1)

    very_active_inds = np.setdiff1d(va_inds, bad_inds)
    print(va_inds.shape, bad_inds.shape, very_active_inds.shape)

    featMatrix = featMatrix[very_active_inds, :]
    print('Teenagers', featMatrix.sum(axis=0))

    featMatrixNormalized = Normalizer(norm='l2').fit_transform(featMatrix)
    featMatrixSTD = StandardScaler().fit_transform(featMatrix)
    featMatrixSTD = featMatrixSTD  #+np.abs(featMatrixSTD.min())+1.e-15
    print(featMatrixSTD.min())
    #featMatrix=RobustScaler(with_centering=False).fit_transform(featMatrix)

    nmfTrf = TruncatedSVD(n_components=10)
    nmfFeats = nmfTrf.fit_transform(featMatrixSTD)
    dfTest = pd.DataFrame(featMatrixSTD[:, :10])

    corr = np.dot(featMatrix, featMatrix.T)
    print(corr.shape)

    bandwidth = estimate_bandwidth(featMatrix, quantile=0.2, n_samples=500)
    ms = MeanShift(bandwidth=bandwidth * 0.7, bin_seeding=True)
    print('bandwidth', bandwidth)
    labels = ms.fit_predict(featMatrix)

    # db = DBSCAN(eps=0.2, min_samples=10,metric='precomputed')
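    # A hedged sketch of the commented-out DBSCAN idea above: with
    # metric='precomputed' DBSCAN expects a distance matrix, so the dot-product
    # similarity would first need converting, e.g. to cosine distances. The eps
    # value is carried over from the comment and is only illustrative.
    from sklearn.cluster import DBSCAN
    from sklearn.metrics.pairwise import cosine_distances

    dist = cosine_distances(featMatrix)
    db_labels = DBSCAN(eps=0.2, min_samples=10,
                       metric='precomputed').fit_predict(dist)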
Example #42
                if index == 0:
                    plt.title(name, size=18)
                    plt.ylabel(str(k) + ' clusters', color='k', size=18)

                colors = np.array(
                    list(
                        islice(
                            cycle([
                                '#377eb8', '#ff7f00', '#4daf4a', '#f781bf',
                                '#a65628', '#984ea3', '#999999', '#e41a1c',
                                '#dede00'
                            ]), int(max(y_pred) + 1))))

                # plt.scatter(X, np.zeros_like(X) + 0., s=10, color=colors[y_pred])

                minX, maxX = np.floor(X.min()), np.ceil(X.max())
                bins_plot = np.linspace(minX, maxX, axisplot)
                hist, bins = np.histogram(X, bins=bins_plot)
                minY, maxY = hist.min(), hist.max()

                width = np.diff(bins)
                center = (bins[:-1] + bins[1:]) / 2
                hist[np.abs(center).min() == np.abs(center)] = 0
                color = []
                print(name)
                for p in range(len(hist)):
                    mask = np.logical_and(X > bins[p], X < bins[p + 1])[:, 0]
                    c = y_pred[mask]
                    if np.size(c) == 0:
                        color.append(0)
                        # print('-')