import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA


def PCA_f(X, soglia):
    """Transform a dataset via principal component analysis.

    Inputs:
    X -- the dataset as a DataFrame
    soglia -- the minimum fraction of the original variance that the
    reduced dataset must explain

    Outputs:
    X_df_transformed -- DataFrame of principal components; how many
    components are kept is determined by the soglia threshold
    """
    # Standardize the features, then rescale each one to [0, 1]
    X_std = StandardScaler().fit_transform(X)
    X_std = (X_std - X_std.min(axis=0)) / (X_std.max(axis=0) -
                                           X_std.min(axis=0))
    X = pd.DataFrame(data=X_std, index=None, columns=X.columns)
    pca = PCA(n_components=len(X.columns))  # max number of components
    pca.fit(X)
    X_transformed = pca.transform(X)
    # Cumulative explained-variance ratio for 1..n components
    var_cumulata = np.array([
        pca.explained_variance_ratio_[:i].sum()
        for i in range(1, len(X.columns) + 1)
    ]).round(2)
    # The first index whose cumulative ratio reaches the threshold
    # corresponds to idx + 1 components
    n_comp = int(np.argmax(var_cumulata >= soglia)) + 1
    pca_names = ["PCA " + str(i + 1) for i in range(n_comp)]
    X_df_transformed = pd.DataFrame(data=X_transformed[:, :n_comp],
                                    index=None,
                                    columns=pca_names)
    return X_df_transformed
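# Hedged usage sketch (not from the original): a toy DataFrame of
# correlated features, keeping enough components to explain >= 90% of
# the variance.
rng = np.random.default_rng(0)
base = rng.normal(size=(100, 2))
X_demo = pd.DataFrame(base @ rng.normal(size=(2, 5)),
                      columns=['f%d' % i for i in range(5)])
X_reduced = PCA_f(X_demo, soglia=0.9)
print(X_reduced.columns.tolist())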
Example #2
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler


def analyze_regression(X, y, ax, cPoint, cSLine, metrics):
    try:
        _X = StandardScaler().fit_transform(X.values.reshape(-1, 1))
        _y = StandardScaler().fit_transform(y.values.reshape(-1, 1))

        _linear = LinearRegression().fit(_X, _y)
        _predict = _linear.predict(_X)
        ax.scatter(_X, _y, color=cPoint, marker='.', alpha=.6, label='Scatter')
        # Row indices of the extreme x values, used to draw the fitted line
        _pn, _px = int(_X.argmin()), int(_X.argmax())
        ax.plot((_X.min(), _X.max()), (_predict[_pn], _predict[_px]),
                color=cSLine, label='Regression line')

        ax.set_title('Linear Regression')
        ax.set_xticks(()); ax.set_yticks(())
        ax.legend()

        if metrics: _print_regressionMetrics(_linear, _X, _y, _predict)
    except Exception as e:
        return e
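# Hedged usage sketch with synthetic Series; the colors are illustrative
# and metrics=False avoids the external _print_regressionMetrics helper.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
x_demo = pd.Series(rng.normal(size=200))
y_demo = pd.Series(2 * x_demo + rng.normal(scale=0.5, size=200))
fig, ax = plt.subplots()
analyze_regression(x_demo, y_demo, ax, cPoint='navy', cSLine='red',
                   metrics=False)
plt.show()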
Example #3
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler


def qt_analysis(res):
    # Drop the bookkeeping columns if present, then histogram each feature
    qt = res.drop(res.columns.intersection(['N2', 'dt', 'f']), axis=1)
    qt.hist()
    plt.show()
    qt_std = StandardScaler().fit_transform(qt)

    # Per-column fraction of samples more than one standard deviation
    # from the mean (misnamed "schew" in the original)
    tail_frac = (qt_std > 1).sum(axis=0) / qt_std.shape[0] + \
                (qt_std < -1).sum(axis=0) / qt_std.shape[0]

    return tail_frac
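# Hedged usage sketch with a toy results DataFrame (column names are
# illustrative; 'dt' is dropped by the function, the rest are analyzed).
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
res_demo = pd.DataFrame({'a': rng.normal(size=500),
                         'b': rng.exponential(size=500),
                         'dt': np.arange(500)})
print(qt_analysis(res_demo))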
Example #4
from numpy import meshgrid, arange, c_
from sklearn.preprocessing import StandardScaler
from sklearn.svm import OneClassSVM


def analyze_classification(X, y, ax, cContourf, cInlier, cOutlier, gamma, metrics):
    try:
        _X = StandardScaler().fit_transform(X.values.reshape(-1, 1))
        _y = StandardScaler().fit_transform(y.values.reshape(-1, 1))

        _padding = CONTOURF_CLASSIFICATION_MINING_PADDING
        _mesh_step_size = CONTOURF_CLASSIFICATION_MINING_MESH_STEP_SIZE
        _X_min, _X_max = _X.min() - _padding, _X.max() + _padding
        _y_min, _y_max = _y.min() - _padding, _y.max() + _padding
        _mapx, _mapy = meshgrid(arange(_X_min, _X_max, _mesh_step_size),
                                arange(_y_min, _y_max, _mesh_step_size))

        if gamma == 0: gamma = 'auto'

        # random_state is not passed: it had no effect on OneClassSVM and
        # was removed from scikit-learn
        _classifier = OneClassSVM(kernel='rbf', gamma=gamma).fit(c_[_X, _y])

        _Z = _classifier.decision_function(c_[_mapx.ravel(), _mapy.ravel()])
        _predict = _classifier.predict(c_[_X, _y])

        ax.contourf(_mapx, _mapy, _Z.reshape(_mapx.shape), cmap=cContourf, alpha=.7)
        # Split points into inliers (+1) and outliers (-1)
        _sub_XIn, _sub_XOut = list(), list()
        _sub_yIn, _sub_yOut = list(), list()
        for i in range(_predict.size):
            if _predict[i] == 1:
                _sub_XIn.append(_X[i]); _sub_yIn.append(_y[i])
            else:
                _sub_XOut.append(_X[i]); _sub_yOut.append(_y[i])
        ax.scatter(_sub_XIn, _sub_yIn, c=cInlier, marker='.', alpha=.6, label='Inliers')
        ax.scatter(_sub_XOut, _sub_yOut, c=cOutlier, marker='.', alpha=.6, label='Outliers')

        ax.set_title('SVM')
        ax.set_xticks(()); ax.set_yticks(())
        ax.legend()

        if metrics: _print_classificationMetrics(_classifier, _predict)
    except Exception as e:
        return e
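# Hedged usage sketch. The two module-level constants are assumed by the
# function; the values below are illustrative, not from the original, and
# metrics=False avoids the external _print_classificationMetrics helper.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

CONTOURF_CLASSIFICATION_MINING_PADDING = 0.5
CONTOURF_CLASSIFICATION_MINING_MESH_STEP_SIZE = 0.02

rng = np.random.default_rng(0)
x_demo = pd.Series(rng.normal(size=300))
y_demo = pd.Series(x_demo + rng.normal(scale=0.3, size=300))
fig, ax = plt.subplots()
analyze_classification(x_demo, y_demo, ax, cContourf=plt.cm.Blues,
                       cInlier='green', cOutlier='red', gamma=0,
                       metrics=False)
plt.show()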
Example #5
import numpy as np
from sklearn.preprocessing import StandardScaler


def standardize(array, name):
    """Receives a pandas DataFrame or Series and returns a numpy array
    with zero mean and unit variance."""
    # Transform to a numpy column vector (as_matrix() was removed from
    # pandas; to_numpy() is the current equivalent)
    nparray = array.to_numpy().reshape(array.shape[0], 1).astype('float32')
    print('------------')
    print(name)
    print('Different values before:', np.unique(nparray).shape[0])

    # Standardize the data
    nparray = StandardScaler().fit_transform(nparray)

    # Print some information
    print('Mean:', nparray.mean())
    print('Max:', nparray.max())
    print('Min:', nparray.min())
    print('Std:', nparray.std())
    print('Different values after:', np.unique(nparray).shape[0])

    return nparray
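# Hedged usage sketch with a toy Series; the name argument is just a
# label for the printed report.
import pandas as pd

s_demo = pd.Series([3.0, 1.0, 4.0, 1.0, 5.0, 9.0])
z = standardize(s_demo, 'toy_column')
print(z[:3].ravel())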
Example #6
# Assumed context for this fragment (MovieLens 100k conventions; the
# values of col_names, N and M are not shown in the original):
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

col_names = ['user id', 'movie id', 'rating', 'timestamp']
N, M = 943, 1682  # number of users and movies in MovieLens 100k

df = pd.read_table('u.data',
                   names=col_names,
                   usecols=col_names[0:3],
                   dtype=np.int32)

# Process ratings and save to file
user_ratings = np.zeros([N, M])
for i in range(0, N):  # foreach user in dataset
    # foreach rating of a unique user, centre and normalise data
    u = df.loc[df['user id'] == (i + 1)]
    temp = u['rating'].to_numpy(dtype=float).reshape(-1, 1)
    temp = StandardScaler(with_std=False).fit_transform(X=temp)
    temp = temp.reshape(len(temp))
    min_r = temp.min()
    max_r = temp.max()
    x = 0
    for j, k in zip(u['movie id'], u['rating']):
        # store (existing) ratings in array row, filling empty cells with 0
        user_ratings[i, (j - 1)] = np.interp(temp[x], [min_r, max_r], [0, 1])
        x += 1
np.save('user_ratings.npy', user_ratings)
df = df.drop(columns=col_names[0:3])  # the result must be assigned for the drop to take effect

# Process user ids
# one-hot encoded set of ids equals NxN identity matrix
user_ids = np.identity(N, dtype=np.int32)
np.save('user_ids.npy', user_ids)

# Split train data into 5 folds and save results to file
train_ind = []
Example #7
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


def Plot_Decision_Boundaries_2D(X1,
                                X2,
                                y,
                                Estimators,
                                Test_Size=0.3,
                                Random_State=None,
                                Scale=True,
                                Colour_Map=plt.cm.coolwarm,
                                Bright_Colour_Map=plt.cm.afmhot,
                                Alpha_Train=1,
                                Alpha_Test=0.6,
                                Certainty_Threshold=None,
                                Variable_Names=("Variable1", "Variable2"),
                                Delta=0.02):
    def Return_Most_Certain_Classification_Data(X,
                                                y,
                                                Model,
                                                Certainty_Thresh=0,
                                                Fit_First=False):

        if Fit_First:
            Model = Model.fit(X, y)
        if hasattr(Model, "predict_proba"):
            probabilities = Model.predict_proba(X)
        elif hasattr(Model, "decision_function"):
            probabilities = Model.decision_function(X)
        certainty_bool = np.amax(probabilities, axis=1) > Certainty_Thresh

        certain_predictors, certain_response = X[certainty_bool], y[
            certainty_bool]
        print("Old number of samples:", len(y))
        print("New number of samples:", len(certain_response))

        return certain_predictors, certain_response

    if Certainty_Threshold is not None:
        X_Combined = np.hstack((X1.reshape(-1, 1), X2.reshape(-1, 1)))
        # When several estimators are given, screen with the first one
        _model = Estimators[0] if isinstance(
            Estimators, (list, np.ndarray)) else Estimators
        X, y = Return_Most_Certain_Classification_Data(
            X_Combined,
            y,
            Model=_model,
            Certainty_Thresh=Certainty_Threshold,
            Fit_First=True)
        X1, X2 = X[:, 0], X[:, 1]

    #Define a class bijection for class colour mapping
    unique_classes, y_bijection = np.unique(y, return_inverse=True)

    #Sort the data so colour labels match up with actual labels
    X1, X2 = X1.reshape((-1, 1)), X2.reshape((-1, 1))
    y_bijection = y_bijection.reshape((-1, 1))

    Full_combined = np.hstack((X1, X2, y_bijection))
    Full_combined = Full_combined[Full_combined[:, 2].argsort()]

    X1, X2 = Full_combined[:, 0].reshape((-1, 1)), Full_combined[:, 1].reshape(
        (-1, 1))
    y_bijection = Full_combined[:, 2].reshape((-1, 1))

    #Preprocess the data if needed:
    X1, X2 = StandardScaler().fit_transform(
        X1), StandardScaler().fit_transform(X2)

    delta = Delta  #Step size in the mesh

    figure = plt.figure(figsize=(12, 8))

    x1_min, x1_max = X1.min() - 0.5, X1.max() + 0.5
    x2_min, x2_max = X2.min() - 0.5, X2.max() + 0.5

    xx, yy = np.meshgrid(np.arange(x1_min, x1_max, delta),
                         np.arange(x2_min, x2_max, delta))

    #Plot the given data (colourmap)

    col_map = Colour_Map
    col_map_bright = Bright_Colour_Map

    #Ready a train test split
    Full_combined = np.hstack((X1, X2, y_bijection))

    X_train, X_test, y_train, y_test = train_test_split(
        Full_combined[:, [0, 1]],
        Full_combined[:, 2],
        test_size=Test_Size,
        random_state=Random_State)

    #Get a figure and axes based on how many estimators there are (one or several)
    #Multiple estimators
    if isinstance(Estimators, (list, np.ndarray)):
        n_rows = len(Estimators)

        fig, axes = plt.subplots(nrows=n_rows,
                                 ncols=2,
                                 sharex=True,
                                 sharey=True,
                                 figsize=(12, n_rows * 4))
    #One estimator
    else:
        Estimators = np.array([Estimators])
        fig, axes = plt.subplots(1, 2, figsize=(12, 8))
        axes = np.array([axes])

    for axs, Estimator in zip(axes[:], Estimators):

        ax1, ax2 = axs[0], axs[1]

        ax1.set_title("Input Data")
        #Plot Training data
        scat = ax1.scatter(X_train[:, 0],
                           X_train[:, 1],
                           c=y_train,
                           cmap=col_map_bright,
                           edgecolors='k',
                           alpha=Alpha_Train)
        #And testing data
        ax1.scatter(X_test[:, 0],
                    X_test[:, 1],
                    c=y_test,
                    cmap=col_map_bright,
                    edgecolors='k',
                    alpha=Alpha_Test)

        ax1.set_xlim(xx.min(), xx.max())
        ax1.set_ylim(yy.min(), yy.max())

        ax1.set_xlabel(Variable_Names[0])
        ax1.set_ylabel(Variable_Names[1])

        #Now for the classifier

        model = Estimator.fit(X_train, y_train)
        score = model.score(X_test, y_test)

        #Plot the decision boundary. For that, we will assign a colour to each point
        # in the mesh [x1_min, x1_max]*[x2_min, x2_max]

        if hasattr(model, "decision_function"):
            Z = model.decision_function(np.c_[xx.ravel(), yy.ravel()])

        elif hasattr(model, "predict_proba"):
            Z = model.predict_proba(np.c_[xx.ravel(), yy.ravel()])

        else:
            print(
                "This Estimator has neither a decision_function nor a "
                "predict_proba method; skipping it"
            )
            continue

        Z = np.argmax(Z, axis=1)
        Z_uniques = np.unique(Z)

        unique_predictions = unique_classes[Z_uniques]

        #Put the result in a colourplot

        Z = Z.reshape(xx.shape)

        contour = ax2.pcolormesh(xx,
                                 yy,
                                 Z,
                                 vmin=Z.min(),
                                 vmax=Z.max(),
                                 cmap=col_map,
                                 alpha=0.7)

        #Plot also the training data
        ax2.scatter(X_train[:, 0],
                    X_train[:, 1],
                    c=y_train,
                    cmap=col_map_bright,
                    edgecolors='k',
                    alpha=Alpha_Train)
        #And testing data
        ax2.scatter(X_test[:, 0],
                    X_test[:, 1],
                    c=y_test,
                    cmap=col_map_bright,
                    edgecolors='k',
                    alpha=Alpha_Test)

        ax2.set_xlim(xx.min(), xx.max())
        ax2.set_ylim(yy.min(), yy.max())

        ax2.set_xlabel(Variable_Names[0])
        ax2.set_ylabel(Variable_Names[1])
        ax2.set_title(str(Estimator))

        ax2.text(xx.max() - .3,
                 yy.min() + .3, ('%.2f' % score).lstrip('0'),
                 size=15,
                 horizontalalignment='right')

        cb1 = plt.colorbar(scat,
                           spacing="proportional",
                           ax=ax1,
                           ticks=np.arange(len(unique_classes)))
        cb1.ax.set_yticklabels(unique_classes)

        print("Unique Predictions: {}".format(unique_classes[Z_uniques]),
              "for: {}".format(Estimator))

        ticks = np.linspace(Z.min(), Z.max(), len(unique_predictions))

        cb2 = plt.colorbar(contour,
                           spacing="proportional",
                           ax=ax2,
                           ticks=ticks)
        cb2.ax.set_yticklabels(unique_predictions)

        #Also print the score of the model
        print("Model Score:", score, "\n")

    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    fig.suptitle("Data and Classification Boundaries", fontsize=20)

    return fig
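# Hedged usage sketch on a synthetic three-class problem (the colourplot
# expects multi-column decision scores, so a multi-class dataset is used);
# make_blobs and the two estimators are illustrative choices.
from sklearn.datasets import make_blobs
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

X_demo, y_demo = make_blobs(n_samples=300, centers=3, random_state=0)
fig_demo = Plot_Decision_Boundaries_2D(X_demo[:, 0], X_demo[:, 1], y_demo,
                                       Estimators=[LogisticRegression(), SVC()],
                                       Random_State=0,
                                       Variable_Names=("x1", "x2"))
plt.show()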
Example #8
import os
import warnings
import matplotlib as mpl
import matplotlib.pyplot as plt
import scipy.cluster.hierarchy as sch
import scipy.spatial.distance as dist
from sklearn.preprocessing import StandardScaler


def heatmap(x, row_header, column_header, row_method, column_method,
            row_metric, column_metric, color_gradient, html_folder):
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        
        """
        This below code is based in large part on the protype methods:
        http://old.nabble.com/How-to-plot-heatmap-with-matplotlib--td32534593.html
        http://stackoverflow.com/questions/7664826/how-to-get-flat-clustering-corresponding-to-color-clusters-in-the-dendrogram-cre
        x is an m by n ndarray, m observations, n genes
        """
        
        ### Define the color gradient to use based on the provided name
        n = len(x[0]); m = len(x)
        cmap = plt.cm.bwr  # default ('red_white_blue'), so cmap is always defined
        if color_gradient == 'red_black_sky':
            cmap = RedBlackSkyBlue()
        elif color_gradient == 'red_black_blue':
            cmap = RedBlackBlue()
        elif color_gradient == 'red_black_green':
            cmap = RedBlackGreen()
        elif color_gradient == 'yellow_black_blue':
            cmap = YellowBlackBlue()
        elif color_gradient == 'seismic':
            cmap = plt.cm.seismic
        elif color_gradient == 'green_white_purple':
            cmap = plt.cm.PiYG_r
        elif color_gradient == 'coolwarm':
            cmap = plt.cm.coolwarm

        ### Scale the max and min colors so that 0 is white/black
        x = StandardScaler().fit_transform(x)
        vmin = x.min()
        vmax = x.max()
        vmax = max([vmax,abs(vmin)])
        vmin = vmax*-1
        norm = mpl.colors.Normalize(vmin, vmax) ### adjust the max and min to scale these colors

        ### Scale the Matplotlib window size
        default_window_height = 7
        default_window_width = 11
        fig = plt.figure(figsize=(default_window_width,default_window_height)) ### could use m,n to scale here
        color_bar_w = 0.015 ### Sufficient size to show
            
        ## calculate positions for all elements
        # ax1, placement of dendrogram 1, on the left of the heatmap
        [ax1_x, ax1_y, ax1_w, ax1_h] = [0.05,0.22,0.2,0.6]   ### The second value controls the position of the matrix relative to the bottom of the view
        width_between_ax1_axr = -0.004
        height_between_ax1_axc = -0.004 ### distance between the top color bar axis and the matrix
        
        # axr, placement of row side colorbar
        [axr_x, axr_y, axr_w, axr_h] = [0.31,0.1,color_bar_w,0.6] ### second to last controls the width of the side color bar - 0.015 when showing
        axr_x = ax1_x + ax1_w + width_between_ax1_axr
        axr_y = ax1_y; axr_h = ax1_h
        width_between_axr_axm = -0.004

        # axc, placement of column side colorbar
        [axc_x, axc_y, axc_w, axc_h] = [0.4,0.63,0.5,color_bar_w] ### last one controls the height of the top color bar - 0.015 when showing
        axc_x = axr_x + axr_w + width_between_axr_axm
        axc_y = ax1_y + ax1_h + height_between_ax1_axc
        height_between_axc_ax2 = -0.004

        # axm, placement of heatmap for the data matrix
        [axm_x, axm_y, axm_w, axm_h] = [0.4,0.9,2.5,0.5]
        axm_x = axr_x + axr_w + width_between_axr_axm
        axm_y = ax1_y; axm_h = ax1_h
        axm_w = axc_w

        # ax2, placement of dendrogram 2, on the top of the heatmap
        [ax2_x, ax2_y, ax2_w, ax2_h] = [0.3,0.72,0.6,0.15] ### last one controls height of the dendrogram
        ax2_x = axr_x + axr_w + width_between_axr_axm
        ax2_y = ax1_y + ax1_h + height_between_ax1_axc + axc_h + height_between_axc_ax2
        ax2_w = axc_w

        # axcb - placement of the color legend
        [axcb_x, axcb_y, axcb_w, axcb_h] = [0.07,0.88,0.18,0.07]

        # Compute and plot top dendrogram
        if column_method is not None:
            d2 = dist.pdist(x.T)
            D2 = dist.squareform(d2)
            ax2 = fig.add_axes([ax2_x, ax2_y, ax2_w, ax2_h], frame_on=False)
            Y2 = sch.linkage(D2, method=column_method, metric=column_metric) ### array-clustering metric - 'average', 'single', 'centroid', 'complete'
            Z2 = sch.dendrogram(Y2)
            ind2 = sch.fcluster(Y2,0.7*max(Y2[:,2]),'distance') ### This is the default behavior of dendrogram
            ax2.set_xticks([]) ### Hides ticks
            ax2.set_yticks([])
        else:
            ind2 = ['NA']*len(column_header) ### Used for exporting the flat cluster data
            
        # Compute and plot left dendrogram.
        if row_method is not None:
            d1 = dist.pdist(x)
            D1 = dist.squareform(d1)  # full matrix
            ax1 = fig.add_axes([ax1_x+0.005, ax1_y, ax1_w, ax1_h], frame_on=False) # frame_on may be False
            Y1 = sch.linkage(D1, method=row_method, metric=row_metric) ### gene-clustering metric - 'average', 'single', 'centroid', 'complete'
            Z1 = sch.dendrogram(Y1, orientation='right')
            ind1 = sch.fcluster(Y1,0.7*max(Y1[:,2]),'distance') ### This is the default behavior of dendrogram
            ax1.set_xticks([]) ### Hides ticks
            ax1.set_yticks([])
        else:
            ind1 = ['NA']*len(row_header) ### Used for exporting the flat cluster data
            
        # Plot distance matrix.
        axm = fig.add_axes([axm_x, axm_y, axm_w, axm_h])  # axes for the data matrix
        xt = x
        if column_method is not None:
            idx2 = Z2['leaves'] ### apply the clustering for the array-dendrograms to the actual matrix data
            xt = xt[:,idx2]
            # ind2 = ind2[:,idx2] ### reorder the flat cluster to match the order of the leaves the dendrogram
        if row_method is not None:
            idx1 = Z1['leaves'] ### apply the clustering for the gene-dendrograms to the actual matrix data
            xt = xt[idx1,:]   # xt is transformed x
            # ind1 = ind1[idx1,:] ### reorder the flat cluster to match the order of the leaves the dendrogram
        ### taken from http://stackoverflow.com/questions/2982929/plotting-results-of-hierarchical-clustering-ontop-of-a-matrix-of-data-in-python/3011894#3011894
        im = axm.matshow(xt, aspect='auto', origin='lower', cmap=cmap, norm=norm) ### norm=norm added to scale coloring of expression with zero = white or black
        axm.set_xticks([]) ### Hides x-ticks
        axm.set_yticks([])

        # Add text
        new_row_header=[]
        new_column_header=[]
        for i in range(x.shape[0]):
            if row_method is not None:
                if len(row_header) < 200: ### Skip row labels when there are 200 or more rows
                    if len(row_header) < 20:
                        fontsize=15
                    else:
                        fontsize=200/len(row_header)
                    axm.text(x.shape[1]-0.5, i-0.1, '  '+row_header[idx1[i]],fontsize=fontsize)
                new_row_header.append(row_header[idx1[i]])
            else:
                if len(row_header) < 200: ### Skip row labels when there are 200 or more rows
                    if len(row_header) < 20:
                        fontsize=8
                    else:
                        fontsize=200/len(row_header)
                    axm.text(x.shape[1]-0.5, i-0.1, '  '+row_header[i],fontsize=fontsize) ### When not clustering rows
                new_row_header.append(row_header[i])
        for i in range(x.shape[1]):
            if column_method is not None:
                axm.text(i, -0.55, ' '+column_header[idx2[i]], rotation=315, verticalalignment="top") # rotation could also be degrees
                new_column_header.append(column_header[idx2[i]])
            else: ### When not clustering columns
                axm.text(i, -0.55, ' '+column_header[i], rotation=315, verticalalignment="top")
                new_column_header.append(column_header[i])

        # Plot colside colors
        # axc --> axes for column side colorbar
        # if column_method != None:
        #     axc = fig.add_axes([axc_x, axc_y, axc_w, axc_h])  # axes for column side colorbar
        #     cmap_c = mpl.colors.ListedColormap(['r', 'g', 'b', 'y', 'w', 'k', 'm'])
        #     dc = np.array(ind2, dtype=int)
        #     dc.shape = (1,len(ind2)) 
        #     im_c = axc.matshow(dc, aspect='auto', origin='lower', cmap=cmap_c)
        #     axc.set_xticks([]) ### Hides ticks
        #     axc.set_yticks([])
        
        # Plot rowside colors
        # axr --> axes for row side colorbar
        # if row_method != None:
        #     axr = fig.add_axes([axr_x+0.005, axr_y, axr_w-0.005, axr_h])  # axes for column side colorbar
        #     dr = np.array(ind1, dtype=int)
        #     dr.shape = (len(ind1),1)
        #     cmap_r = mpl.colors.ListedColormap(['r', 'g', 'b', 'y', 'w', 'k', 'm'])
        #     im_r = axr.matshow(dr, aspect='auto', origin='lower', cmap=cmap_r)
        #     axr.set_xticks([]) ### Hides ticks
        #     axr.set_yticks([])

        # Plot color legend
        axcb = fig.add_axes([axcb_x, axcb_y, axcb_w, axcb_h], frame_on=False)  # axes for colorbar
        cb = mpl.colorbar.ColorbarBase(axcb, cmap=cmap, norm=norm, orientation='horizontal')
        cb.set_ticks([vmin,0,vmax])
        axcb.set_title("Normalized Expression")

        #Save figures
        plt.savefig(os.path.join(html_folder,'Heatmap.png'), dpi=300)
        plt.savefig(os.path.join(html_folder,'Heatmap.svg'))

        #Create html output file
        html_file = """<!DOCTYPE html>
<html>
    <head>
        <title>Heatmap</title>
        <style>
        * {
            font-family:Arial, Helvetica, sans-serif;
            }
        </style>
    </head>
    <body style="width:100%">
        <img style="width:100%" src="Heatmap.png" alt="Heatmap">
        <a href="index.html"><b>Back</b></a>
    </body>
</html>
"""
        with open(os.path.join(html_folder,'Heatmap.html'),'w') as F:
            F.write(html_file)
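# Hedged usage sketch with random data and a built-in gradient, so the
# custom RedBlack*/YellowBlackBlue colormap helpers are not needed.
import numpy as np

x_demo = np.random.RandomState(0).normal(size=(12, 8))
rows_demo = ['gene%d' % i for i in range(12)]
cols_demo = ['sample%d' % j for j in range(8)]
heatmap(x_demo, rows_demo, cols_demo,
        row_method='average', column_method='average',
        row_metric='euclidean', column_metric='euclidean',
        color_gradient='seismic', html_folder='.')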
Example #9
                if index == 0:
                    plt.title(name, size=18)
                    plt.ylabel(str(k) + ' clusters', color='k', size=18)

                colors = np.array(
                    list(
                        islice(
                            cycle([
                                '#377eb8', '#ff7f00', '#4daf4a', '#f781bf',
                                '#a65628', '#984ea3', '#999999', '#e41a1c',
                                '#dede00'
                            ]), int(max(y_pred) + 1))))

                # plt.scatter(X, np.zeros_like(X) + 0., s=10, color=colors[y_pred])

                minX, maxX = np.floor(X.min()), np.ceil(X.max())
                bins_plot = np.linspace(minX, maxX, axisplot)
                hist, bins = np.histogram(X, bins=bins_plot)
                minY, maxY = hist.min(), hist.max()

                width = np.diff(bins)
                center = (bins[:-1] + bins[1:]) / 2
                hist[np.abs(center).min() == np.abs(center)] = 0
                color = []
                print(name)
                for p in range(len(hist)):
                    mask = np.logical_and(X > bins[p], X < bins[p + 1])[:, 0]
                    c = y_pred[mask]
                    if np.size(c) == 0:
                        color.append(0)
                        # print('-')
Example #10
    # (fragment: the loop defining `indicesToKeep` and `color` is not
    # shown; the `ax.scatter(` opening is inferred from the call below)
    ax.scatter(
        final_df.loc[indicesToKeep, "LDA 1"],
        final_df.loc[indicesToKeep, "LDA 2"],
        c=color,
        s=50,
    )
ax.legend(targets)
ax.grid()
# -

# <a name="3-3"></a>
# ## 3.3 Non-Negative Matrix Factorization (NMF)
# Source: [Scikit-learn](https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.NMF.html)
#
# Find two non-negative matrices (W, H) whose product approximates the non-negative matrix X. This factorization can be used, for example, for dimensionality reduction, source separation or topic extraction.

print(X.max())
print(X.min())

# In order to use NMF, our data cannot contain negative values. For that reason, we will use `MinMaxScaler` from sklearn, which scales the data to a given range, e.g. (0, 1).
#
# `MinMaxScaler` is equivalent to the code below:
#
# ```python
# X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
# X_scaled = X_std * (max - min) + min
# ```

# +
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range=(0, 1))
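# A hedged completion of this truncated cell: apply the scaler to the
# feature matrix X from the cells above (X is assumed from context).
X_scaled = scaler.fit_transform(X)
print(X_scaled.min(), X_scaled.max())  # should now be 0.0 and 1.0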
Example #11
# (fragment: `csv_reader`, `labels` and `i` come from earlier, unshown code)
new_rows_list = []
for row in csv_reader:
    row[1] = labels[i]
    i = i + 1
    new_rows_list.append(row)
#for data in new_rows_list:
#  data[i][1]=''.join(labels[i])
# i = i+1
print(new_rows_list)
with open('Crops_MIR.csv', 'w', newline='') as write_csv:
    csv_writer = csv.writer(write_csv)
    csv_writer.writerows(new_rows_list)
cols = ['Name', 'Labels', 'Water Require', 'Temp', 'Moisture', 'Production']
data = pd.read_csv(r'Crops_MIR.csv', names=cols)
y = data['Labels']
# X (the feature matrix) is assumed from earlier, unshown code
X_norm = (X - X.min()) / (X.max() - X.min())
lda = LDA(n_components=1)
lda_transformed = pd.DataFrame(lda.fit_transform(X_norm, y))
#print(lda_transformed)
# `colmap` (class -> colour) is assumed from earlier, unshown code
for i in range(3):
    plt.scatter(lda_transformed[y == i],
                data[y == i]['Water Require'],
                color=colmap[i])
plt.show()
#min_required = min(data[y==2]['Water Require'])

#Get Current Data from Farm to predict list of next possible crops
blob = bucket.get_blob("Current_Data.csv")
blob.download_to_filename("currentcropdata.csv")
cols1 = ['Humidity', 'Temperature', 'Distance', 'Moisture']
current_data = pd.read_csv(r'currentcropdata.csv', names=cols1)  # the file downloaded above
Example #12
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler


def residual_plot(model, X, Y):
    """Plot residual plots (scatter plus marginal histograms) for a regressor.

    X, Y : np.ndarray
    model : estimator object with 'fit' and 'predict' methods.
    """
    x_train, x_test, y_train, y_test = split_data(X, Y)
    model.fit(x_train, y_train)
    y_pred_train = model.predict(x_train)
    y_pred_test = model.predict(x_test)
    res_train = y_train - y_pred_train
    res_test = y_test - y_pred_test
    
    tableau20 = [(31, 119, 180), (174, 199, 232), (255, 127, 14), (255, 187, 120),
                 (44, 160, 44), (152, 223, 138), (214, 39, 40), (255, 152, 150),
                 (148, 103, 189), (197, 176, 213), (140, 86, 75), (196, 156, 148),
                 (227, 119, 194), (247, 182, 210), (127, 127, 127), (199, 199, 199),
                 (188, 189, 34), (219, 219, 141), (23, 190, 207), (158, 218, 229)]
    tableau20 = [(i[0]/255., i[1]/255., i[2]/255.) for i in tableau20]
    
    # definitions for the axes
    left, width = 0.1, 0.65
    bottom, height = 0.1, 0.65
    bottom_h = left_h = left + width + 0.02
    rect_scatter = [left, bottom, width, height]
    rect_histx = [left, bottom_h, width, 0.2]
    rect_histy = [left_h, bottom, 0.2, height]

    ################################
    # Plot res-plot for training set
    x = StandardScaler().fit_transform(y_pred_train.reshape(-1, 1))
    y = StandardScaler().fit_transform(res_train.reshape(-1, 1))
    fig1 = plt.figure(figsize=(14, 10))
    fig1.suptitle('Residual plot for training set')
    
    # start with a rectangular Figure
    axScatter = plt.axes(rect_scatter)
    axHistx = plt.axes(rect_histx)
    axHisty = plt.axes(rect_histy)
    
    # the scatter plot:
    axScatter.scatter(x, y, color=tableau20[0], alpha=0.5)
    
    # now determine nice limits by hand:
    n_bins = 100

    x_limp = x.max() + x.std()
    x_limn = x.min() - x.std()
    y_limp = y.max() + y.std()
    y_limn = y.min() - y.std()

    axScatter.set_xlim((x_limn, x_limp))
    axScatter.set_ylim((y_limn, y_limp))
    axScatter.set_xlabel('Estimated output')
    axScatter.set_ylabel('Residuals')

    axHistx.hist(x, bins=n_bins, color=tableau20[1], alpha=0.75)
    axHisty.hist(y, bins=n_bins, orientation='horizontal', color=tableau20[2], alpha=0.75)

    axHistx.set_xlim(axScatter.get_xlim())
    axHisty.set_ylim(axScatter.get_ylim())
    
    
    ################################
    # Plot res-plot for testing set
    x = StandardScaler().fit_transform(y_pred_test.reshape(-1, 1))
    y = StandardScaler().fit_transform(res_test.reshape(-1, 1))
    fig2 = plt.figure(figsize=(14, 10))
    fig2.suptitle('Residual plot for testing set')
    
    # start with a rectangular Figure
    axScatter = plt.axes(rect_scatter)
    axHistx = plt.axes(rect_histx)
    axHisty = plt.axes(rect_histy)
    
    # the scatter plot:
    axScatter.scatter(x, y, color=tableau20[0], alpha=0.5)
    
    # now determine nice limits by hand:
    n_bins = 100

    x_limp = x.max() + x.std()
    x_limn = x.min() - x.std()
    y_limp = y.max() + y.std()
    y_limn = y.min() - y.std()

    axScatter.set_xlim((x_limn, x_limp))
    axScatter.set_ylim((y_limn, y_limp))
    axScatter.set_xlabel('Estimated output')
    axScatter.set_ylabel('Residuals')

    axHistx.hist(x, bins=n_bins, color=tableau20[1], alpha=0.75)
    axHisty.hist(y, bins=n_bins, orientation='horizontal', color=tableau20[2], alpha=0.75)

    axHistx.set_xlim(axScatter.get_xlim())
    axHisty.set_ylim(axScatter.get_ylim())
    plt.show()
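# Hedged usage sketch; `split_data` is an external helper in the original,
# assumed here to wrap train_test_split.
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split


def split_data(X, Y):
    return train_test_split(X, Y, test_size=0.25, random_state=0)


rng = np.random.default_rng(0)
X_demo = rng.normal(size=(500, 3))
Y_demo = X_demo @ np.array([1.5, -2.0, 0.5]) + rng.normal(scale=0.3, size=500)
residual_plot(LinearRegression(), X_demo, Y_demo)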
Example #13
    QuadraticDiscriminantAnalysis()
]

X = data_pca_tsne

figure = plt.figure(figsize=(27, 9))
i = 1
# iterate over datasets

# preprocess dataset, split into training and test part

X = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=.4, random_state=42)

h = .02  # mesh step size (defined earlier in the full script)
x_min, x_max = X.min() - .5, X.max() + .5
y_min, y_max = y.min() - .5, y.max() + .5
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# just plot the dataset first
cm_bright = ListedColormap(['#FF0000', '#0000FF'])
ax = plt.subplot(len(datasets), len(classifiers) + 1, i)

ax.set_title("Input data")
# Plot the training points
ax.scatter(X_train[:, 0],
           X_train[:, 1],
           c=y_train,
           cmap=cm_bright,
           edgecolors='k')
# and testing points
Example #14
print(df.head())
print(df.describe())

from sklearn.preprocessing import StandardScaler
features = ['sepal length', 'sepal width', 'petal length', 'petal width']
# Separating out the features
x = df.loc[:, features].values
# Separating out the target
y = df.loc[:, ['target']].values  # Access column by name
# Standardizing the features
x = StandardScaler().fit_transform(x)

print(x[:5, ])
print(x.min(axis=0))
print(x.max(axis=0))

#####
# PCA Projection to 2D
#####

from sklearn.decomposition import PCA
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(x)
principalDf = pd.DataFrame(
    data=principalComponents,
    columns=['principal component 1', 'principal component 2'])

finalDf = pd.concat([principalDf, df[['target']]], axis=1)
print(finalDf.head())
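# Optional check (not in the original): fraction of variance retained by
# each of the two principal components.
print(pca.explained_variance_ratio_)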
Example #15
    # (fragment begins mid-loop: keep only samples whose label is in the set)
    if label[i] in (1, 2, 4, 6, 7, 8, 9):
        X_train1.append(a[i])
X_train1 = np.array(X_train1)
y_train1 = np.array(y_train1)

from sklearn.utils import shuffle
X_train1, y_train1 = shuffle(X_train1, y_train1, random_state=0)
from sklearn.preprocessing import StandardScaler
X_train1 = StandardScaler().fit_transform(X_train1)
from sklearn.decomposition import PCA
pca = PCA(n_components=64)
X_train1 = pca.fit_transform(X_train1)
print(X_train1.shape)

print(X_train.max())
print(X_train1.max())
X_train = X_train.astype('float32')
X_train1 = X_train1.astype('float32')
X_train = X_train / 100
X_train1 = X_train1 / 100

X_test = X_train1[20000:39332, :]
y_test = y_train1[20000:39332]
X_train1 = X_train1[0:20000, :]
y_train1 = y_train1[0:20000]

print(X_train.shape)
print(X_train1.shape)
print(X_test.shape)
Example #16
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler

for i in range(2):
    fig, axes = plt.subplots(3, 6)
    axes = axes.ravel()
    for j in range(len(axes)):
        feature = StandardScaler().fit_transform(X_train[y_train == i,
                                                         j:j + 1])
        hist = axes[j].hist(feature,
                            bins='auto',
                            histtype='step',
                            linewidth=2,
                            density=True)
        grid = np.linspace(feature.min(), feature.max(), num=1000)
        log_density = (GaussianMixture(
            n_components=10,
            reg_covar=0.03).fit(feature).score_samples(grid[:, None]))
        gmm = axes[j].plot(grid, np.exp(log_density))
        axes[j].set_title(f'var_{j}', **title_config)
        axes[j].set_ylim([0, 1])
    fig.suptitle(f'Histogram vs Gaussian Mixture Model for Class {i}',
                 **title_config)
    fig.legend((hist[2][0], gmm[0]), ('Histogram', 'Gaussian mixture model'),
               loc='upper center',
               bbox_to_anchor=(0.5, 1),
               ncol=2,
               fontsize=14)
    fig.tight_layout()
    fig.subplots_adjust(top=0.88)