Esempi in Python per scatter_matrix, esempi in Python per pandas.scatter_matrix

Esempio n. 1

0

Mostra file

File: photometricRedshifts.py Progetto: eddienko/EuclidVisibleInstrument

def loadSDSSdata(folder='/Users/sammy/Google Drive/MachineLearning/AstroSDSS/', filename="qso10000.csv",
                 plot=False):
    """
    Load SDSS QSO data and split it into training and testing samples.

    The redshift range is rather broad, from about 0.3 to 6.

    :param folder: path prefix; it is concatenated with *filename* as-is,
        so it must already end with a path separator
    :param filename: name of the CSV file to read
    :param plot: if True, save a scatter matrix of the first 2000 objects
        to 'Sample.pdf'
    :return: X_train, X_test, y_train, y_test as returned by train_test_split
    """
    filename = folder + filename
    qsos = pd.read_csv(filename,index_col=0, usecols=["objid","dered_r","spec_z","u_g_color",
                                                      "g_r_color","r_i_color","i_z_color","diff_u",
                                                      "diff_g1","diff_i","diff_z"])

    # keep rows with dered_r > -9999 and g_r_color strictly between -10 and 10
    qsos = qsos[(qsos["dered_r"] > -9999) & (qsos["g_r_color"] > -10) & (qsos["g_r_color"] < 10)]
    qso_features = copy.copy(qsos)
    qso_redshifts = qsos["spec_z"]
    del qso_features["spec_z"]

    if plot:
        n_plot = 2000  # only the first rows are drawn
        ## truncate the color at z=2.5 just to keep some contrast.
        norm = mpl.colors.Normalize(vmin=min(qso_redshifts.values), vmax=2.5)
        cmap = cm.jet_r
        m = cm.ScalarMappable(norm=norm, cmap=cmap)
        # the colour array needs exactly one entry per plotted row, so it is
        # sliced the same way as the feature frame
        pd.scatter_matrix(qso_features[0:n_plot], alpha=0.2, figsize=[15, 15],
                          color=m.to_rgba(qso_redshifts.values[0:n_plot]))
        plt.savefig('Sample.pdf')
        plt.close()

    X_train, X_test, y_train, y_test = train_test_split(qso_features.values, qso_redshifts.values,
                                                        random_state=42)

    # single-argument print calls give the same output on Python 2 and 3
    print("feature vector shape= {}".format(qso_features.values.shape))
    print("Training sample shape= {}".format(X_train.shape))
    print("Testing sample shape= {}".format(X_test.shape))

    return X_train, X_test, y_train, y_test

Esempio n. 2

0

Mostra file

File: make_full_feature.py Progetto: ajschumacher/Craigslist-Arbitrage

def feature_m(df_all):
    """
    Build the feature matrix and the target vector from the full listing frame.

    The selected columns are converted to numeric types, a scatter matrix of
    them is drawn, and the 'year' column is split off as the target.

    :param df_all: DataFrame containing at least the columns selected below
    :return: (X, y) where X is a numpy array of the remaining feature columns
        and y holds the values of the 'year' column
    """
    # explicit copy: the column assignments below must write to an
    # independent frame rather than to a slice of df_all
    df_X = df_all[['upgraded_HD',
                   'upgraded_cpu',
                   'upgraded_memory',
                   'apple_care',
                   'year',
                   'px',
                   'cpu_speed',
                   'image_url_ct',
                   'memory',
                   'HD_size']].copy()

    df_X['apple_care'] = binarize_boolean_series(df_X['apple_care'])
    df_X['upgraded_HD'] = binarize_boolean_series(df_X['upgraded_HD'])
    df_X['upgraded_memory'] = binarize_boolean_series(df_X['upgraded_memory'])
    df_X['upgraded_cpu'] = binarize_boolean_series(df_X['upgraded_cpu'])
    df_X['year'] = df_X['year'].astype(int)
    df_X['px'] = df_X['px'].astype(int)
    df_X['cpu_speed'] = df_X['cpu_speed'].astype(float)
    df_X['HD_size'] = df_X['HD_size'].astype(float)
    df_X['memory'] = df_X['memory'].astype(int)

    # plotted before 'year' is popped, so the target appears in the matrix too
    pd.scatter_matrix(df_X, figsize=(15,15))
    y = df_X.pop('year').ravel()
    X = np.array(df_X)
    return X, y

Esempio n. 3

0

Mostra file

File: test_deprecated.py Progetto: Jengel1/SunriseSunsetTimeFinder

    def test_scatter_plot_legacy(self):
        """Check that both legacy scatter_matrix entry points emit a FutureWarning."""
        df = pd.DataFrame(randn(100, 2))

        # call through the `plotting` module object
        with tm.assert_produces_warning(FutureWarning):
            plotting.scatter_matrix(df)

        # call through the top-level pandas namespace
        with tm.assert_produces_warning(FutureWarning):
            pd.scatter_matrix(df)

Esempio n. 4

0

Mostra file

File: DyStatsTableWidget.py Progetto: hack1943/DevilYuan

    def _scatterMatrixAct(self):
        """Draw a scatter matrix of the widget's numeric data and show the figure.

        Does nothing when getNumberDataFrame() returns None.
        """
        frame = self.getNumberDataFrame()
        if frame is None:
            return

        DyMatplotlib.newFig()

        pd.scatter_matrix(frame)
        current_figure = plt.gcf()
        current_figure.show()

Esempio n. 5

0

Mostra file

File: movie_analyze_graph.py Progetto: agatorano/Hollywood_Projection

def show_scatter(data, col):
    '''
    Draw a 10x10 inch scatter matrix of the data.

    When `col` is truthy only `data[col]` is plotted, otherwise all of `data`.
    '''
    selected = data[col] if col else data
    pd.scatter_matrix(selected, figsize=(10, 10))

Esempio n. 6

0

Mostra file

File: chap8.py Progetto: makora9143/python_for_data_analysis

def slide_13():
    """Plot log-differenced macro series: one scatter plot and a scatter matrix.

    Reads the CSV at MACRODATAPATH, prints the last five transformed rows,
    scatters 'm1' against 'unemp' and draws a KDE-diagonal scatter matrix.
    """
    macro = pd.read_csv(MACRODATAPATH)
    data = macro[['cpi', 'm1', 'tbilrate', 'unemp']]
    # log, then first difference; rows containing NaN are dropped
    trans_data = np.log(data).diff().dropna()
    # function-call form works on both Python 2 and Python 3
    print(trans_data[-5:])

    plt.scatter(trans_data['m1'], trans_data['unemp'])
    plt.title('Changes in log %s vs. log %s' % ('m1', 'unemp'))

    pd.scatter_matrix(trans_data, diagonal='kde', color='k', alpha=0.3)

Esempio n. 7

0

Mostra file

File: betweenness_centrality.py Progetto: jpfairbanks/asonam2013

def scatter_matrix_topp(sorted_frame, selected_axes, percentile=1):
    """
    Draw a scatter-plot matrix of log(x + 1) for the leading rows of a frame.

    Arguments:
    - `sorted_frame`: frame whose first rows are plotted (presumably already
      sorted by the caller -- this function does not sort)
    - `selected_axes`: the axes (columns) to include in the scatterplot matrix
    - `percentile`: percentage of the rows, counted from the top, to plot
    """
    # floor to an int so the value is a valid slice bound on Python 3 and
    # for non-integer percentiles as well
    n_rows = int(percentile * len(sorted_frame) // 100)
    pd.scatter_matrix(np.log(sorted_frame[selected_axes] + 1)[:n_rows])

Esempio n. 8

0

Mostra file

File: default.py Progetto: dmnfarrell/epitopemap

def scoreCorrelations(preds):
    """Draw one scatter matrix per predictor and return the figures.

    For every key in `preds`, the predictor's `data` frame is pivoted to
    peptide rows x allele columns using the predictor's `scorekey` values,
    and the pairwise scatter matrix of the alleles is drawn.

    :param preds: mapping whose values expose `.data` and `.scorekey`
    :return: list of the figures created, in iteration order
    """
    figs=[]
    for p in preds:
        pred=preds[p]
        df=pred.data
        x = df.pivot_table(index='peptide', columns='allele', values=pred.scorekey)
        # the size has to be set on the figure itself: scatter_matrix reuses
        # the figure of the axes it is given and ignores its own figsize
        f=plt.figure(figsize=(12,12))
        ax=f.add_subplot(111)
        pd.scatter_matrix(x, alpha=0.2, diagonal='hist',ax=ax)
        figs.append(f)
    return figs

Esempio n. 9

0

Mostra file

File: gmm_mixture_sampling.py Progetto: kadeng/pymc

def plot_scatter_matrix(title, tr, fig=None):
    """Draw a scatter matrix of selected trace variables and return `fig`.

    Columns are taken from `tr`: 'c', the first two columns of 'gmm', the
    first column of 'gmm_p', and 'pbeta'.

    NOTE(review): `fig` is created/returned but never handed to
    scatter_matrix -- confirm the returned figure is the one callers expect.
    """
    if fig is None:
        fig = plt.Figure()

    columns = {
        'cat': pandas.Series(tr['c']),
        'gmm_0': pandas.Series(tr['gmm'][:, 0]),
        'gmm_1': pandas.Series(tr['gmm'][:, 1]),
        'p': pandas.Series(tr['gmm_p'][:, 0]),
        'pbeta': pandas.Series(tr['pbeta']),
    }
    frame = pandas.DataFrame(columns)

    pandas.scatter_matrix(frame)
    plt.title(title)
    return fig

Esempio n. 10

0

Mostra file

File: Graphs.py Progetto: alanhdu/Dex

    def createMatrix(self, event):
        """Ask the user which data columns to use and show their scatter matrix.

        Nothing is plotted when the dialog is dismissed with anything other
        than OK. The dialog is destroyed in every case.
        """
        # TODO Fix ugly gridlines. sns.setStyle('nogrid') failed
        dlg = GraphDialog(self.parent, "Matrix Plot Input", ("Select Data",),
                size=(500, 300), groups=False)

        try:
            if dlg.ShowModal() != wx.ID_OK:
                return
            ds = [d[0] for d in dlg.GetName()]
        finally:
            # release the dialog on cancel and on errors too, not only on OK
            dlg.Destroy()

        df = self.parent.data[ds]
        pd.scatter_matrix(df, grid=False)
        plt.show()

Esempio n. 11

0

Mostra file

File: PrepareData.py Progetto: LevinJ/machine-learning

    def performScaling(self):
        """Log-transform the data and samples, cache CSVs, and plot the result.

        Sets `self.log_data` and `self.log_samples`. Only when
        'customers_log.csv' does not exist yet, the log data is written there
        and a standardized copy is stored in `self.data_log_std` and written
        to 'customers_log_std.csv'.
        """
        self.log_data = pd.DataFrame(np.log(self.data), columns=self.data.columns)
        self.log_samples = pd.DataFrame(np.log(self.samples), columns=self.samples.columns)

        log_csv = "customers_log.csv"
        if not os.path.isfile(log_csv):
            self.log_data.to_csv(log_csv)
            standardized = preprocessing.StandardScaler().fit_transform(self.log_data)
            self.data_log_std = pd.DataFrame(standardized, columns=self.log_data.columns)
            self.data_log_std.to_csv("customers_log_std.csv")

        pd.scatter_matrix(self.log_data, alpha=0.3, figsize=(14, 8), diagonal='kde')
        print(self.log_samples)

Esempio n. 12

0

Mostra file

File: visualization.py Progetto: Saynah/AthosPy

def plot_feature_scatter(df_feat, df_files, write_dst=''):
    '''Draw the scatter matrix of all features with tick marks removed.

    When `write_dst` is non-empty, a second plot with points colored by
    `df_files.Exercise` is built with seaborn, saved to `write_dst`, and
    closed. Returns the axes array of the pandas scatter matrix.'''

    # visualize features in the test set
    axes_grid = pd.scatter_matrix(df_feat, alpha=0.2, figsize=(15, 15), diagonal='kde')

    # strip the ticks from every panel
    for axes_row in axes_grid:
        for panel in axes_row:
            panel.set_yticks([])
            panel.set_xticks([])

    if not write_dst:
        return axes_grid

    # also create and save a version of this plot with points colored by exercise label
    labeled = df_feat.join(df_files.Exercise)

    grid = sns.PairGrid(labeled, hue="Exercise")
    grid.map_upper(plt.scatter, alpha=0.2)
    grid.map_diag(plt.hist)
    # a KDE on the lower triangle was tried and dropped (trouble calculating the kde)

    grid.add_legend()
    grid.savefig(write_dst)
    plt.close()  # don't create the plot here

    return axes_grid

Esempio n. 13

0

Mostra file

File: master_plotter.py Progetto: nhu2000/carl_capstone

def make_scatter_plots(features_of_interest, df):
    '''
    Draw a bivariate scatter matrix for the given features of interest,
    which are typically the individual features of the greatest importance
    in our supervised learning classification model.
    INPUTS: features_of_interest = list of strings; df = pandas
    data frame containing song feature data
    '''
    # get mask containing songs used in our model
    good_mask = np.load('good_mask.npy')
    df = df[good_mask]

    # remove outliers in the 'B- Var(c.t.)' feature to better see plots:
    # keep rows within 2.3 standard deviations of that column's mean
    contains_outliers = 'B- Var(c.t.)'
    outlier_column = df[contains_outliers]
    deviation = np.abs(outlier_column - outlier_column.mean()) / outlier_column.std()
    df = df[deviation <= 2.3]
    df_trunc = df[features_of_interest]

    # label data points by color
    color_dict = {'tec': 'b', 'hip': 'r', 'cla': 'g', 'roc': 'k', 'pop': 'c'}
    color_set = np.array([color_dict[name] for name in df['Label']])

    pd.scatter_matrix(df_trunc, color=color_set)
    plt.xlabel([])
    plt.ylabel([])

Esempio n. 14

0

Mostra file

File: linear_regression.py Progetto: askerry/thinkful

def exploratory_viz(loansData):
    """Save a FICO score histogram and a scatter matrix of the loan data.

    Files are written under '../figs/'.
    """
    plt.figure()
    fico_scores = loansData['FICO.Score']
    fico_scores.hist()
    plt.savefig('../figs/fico_score_hist.png')

    pd.scatter_matrix(loansData, alpha=0.05, figsize=(14,14))
    plt.savefig('../figs/loan_scatter_matrix.png')

Esempio n. 15

0

Mostra file

File: linear_regression.py Progetto: hughdbrown/linear_regression

def plot_data(loansData):
    """Show a 20-bin FICO score histogram, then a scatter matrix of the data."""
    plt.figure()
    fico_scores = loansData['FICO.Score']
    fico_scores.hist(bins=20)
    plt.show()

    pd.scatter_matrix(loansData, alpha=0.05, figsize=(10,10), diagonal='hist')
    plt.show()

Esempio n. 16

0

Mostra file

File: analyse_locations.py Progetto: jaron/deeper-learning

def scale_features(property_data, samples):
    """Log-scale the 'Price' column of the data and of the samples.

    The inputs are copied first, so the caller's objects are left untouched
    and calling the function twice does not apply the logarithm twice.

    :param property_data: frame with a 'Price' column
    :param samples: frame with a 'Price' column
    :return: (log_data, log_samples), copies with 'Price' replaced by its
        natural logarithm
    """
    # Scale the data using the natural logarithm
    log_data = property_data.copy()
    log_data['Price'] = np.log(property_data['Price'])

    # Scale the sample data using the natural logarithm
    log_samples = samples.copy()
    log_samples['Price'] = np.log(samples['Price'])
    # function-call form works on both Python 2 and Python 3
    print("\nSamples after scaling:")
    display(log_samples)

    # Produce a scatter matrix for each pair of newly-transformed features
    pd.scatter_matrix(log_data, alpha = 0.3, figsize = (14, 8), diagonal = 'kde')
    plt.show()
    return log_data, log_samples

Esempio n. 17

0

Mostra file

File: explore.py Progetto: bchaplin1/hazard

def visualize(data):
    """Draw a seaborn pair plot, a pandas scatter matrix and a correlation heatmap.

    :param data: DataFrame to visualize
    """
    # visualization
    import seaborn as sns
    import matplotlib.pyplot as plt

    # scatter matrix in Seaborn
    sns.pairplot(data)

    # scatter matrix in Pandas
    pd.scatter_matrix(data, figsize=(12, 10))

    # Use a **correlation matrix** to visualize the correlation between all
    # numerical variables; compute it once and reuse it.
    corr = data.corr()

    # display correlation matrix in Seaborn using a heatmap; open a fresh
    # figure first, otherwise the heatmap is drawn into the current axes,
    # i.e. the last panel of the scatter matrix above
    plt.figure()
    sns.heatmap(corr)

Esempio n. 18

0

Mostra file

File: problem4a.py Progetto: GucciTheCarpenter/CUSP

def openFile(filename):
    """Read a CSV file and show the scatter matrix of columns A, C, D and B."""
    genes = pd.read_csv(filename)
    selected = genes[['A', 'C', 'D', 'B']]

    pd.scatter_matrix(selected)
    plt.show()

Esempio n. 19

0

Mostra file

File: machine_learn.py Progetto: ktalik/ml-and-conflict-prevention-python

def colored_scatter_matrix(data, colors, title, save=None):
    """ Scatter matrix with parametrized colors (e.g. classes)

    :param data: frame to plot
    :param colors: passed to the scatter calls as `c`
    :param title: title shown above the matrix
    :param save: path to save the figure to; when falsy the figure is shown
        maximized instead
    """
    # function-call form works on both Python 2 and Python 3
    print('Plot scatter matrix...')
    fig, ax = plt.subplots(figsize=(12.0, 7.5))
    pd.scatter_matrix(
        data,
        diagonal='kde',
        figsize=(10, 10),
        ax=ax,
        c=colors,
        cmap=None
    )
    # scatter_matrix clears the figure of the axes it is given, so `ax` is no
    # longer part of `fig`; the title has to go on the figure itself
    fig.suptitle(title)
    if save:
        fig.savefig(save)
    else:
        mng = plt.get_current_fig_manager()
        mng.window.showMaximized()
        plt.show()

Esempio n. 20

0

Mostra file

File: iris_classifier.py Progetto: muyun/dev.machinelearning

def get_iris_dataset():
    """Walk through the iris data set: inspect, split, plot, fit kNN and score.

    Everything is reported through print(); the function returns nothing.
    """
    iris_dataset = load_iris()

    # 1. The format of the dataset
    print("Keys of iris_dataset: \n{}".format(iris_dataset.keys()))
    print("Target names: {}".format(iris_dataset['target_names']))
    print("Feature names: \n{}".format(iris_dataset['feature_names']))
    # data -> numpy.ndarray
    # row -> the labels
    # column -> the features
    # NOTE: the two messages below say "Type" but print the array shapes
    print("Type of data: {}".format(iris_dataset['data'].shape))     # (150,4)
    print("Type of target: {}".format(iris_dataset['target'].shape)) # (150,)

    #import pdb; pdb.set_trace()

    # 2. Split the dataset into a training set and a testing set
    # y = f(X)
    X_train, X_test, y_train, y_test = train_test_split(iris_dataset['data'], iris_dataset['target'],test_size=0.2, random_state=0)
    print("X_train shape: {}".format(X_train.shape))
    print("y_train shape: {}".format(y_train.shape))

    print("X_test shape: {}".format(X_test.shape))
    print("y_test shape: {}".format(y_test.shape))


    #import pdb; pdb.set_trace()

    # 3. Inspect the data - visualize it
    # convert the NumPy array into a pandas DataFrame
    iris_dataframe = pd.DataFrame(X_train, columns=iris_dataset.feature_names)

    # pdb; pdb.set_trace()
    # scatter matrix of the training features, colored by the training labels
    grr = pd.scatter_matrix(iris_dataframe, c=y_train, figsize=(15,15), marker='o', hist_kwds={'bins':20}, s=60, alpha=.8, cmap=mglearn.cm3)
    plt.show()


    #import pdb; pdb.set_trace()

    # The model
    from sklearn.neighbors import KNeighborsClassifier
    knn = KNeighborsClassifier(n_neighbors=1)
    # build the model on the training set
    knn.fit(X_train, y_train)

    # the prediction for a single hand-written sample
    X_new = np.array([[5, 2.9, 1, 0.2]])
    prediction = knn.predict(X_new)
    print("Prediction: {}".format(prediction))
    print("Predicted target name: {}".format(iris_dataset['target_names'][prediction]))

    # accuracy on the held-out test set: fraction of matching predictions
    y_pred = knn.predict(X_test)
    print("Test set predictions:\n {}".format(y_pred))
    print("Test set score: {:.2f}".format(np.mean(y_pred==y_test)))

Esempio n. 21

0

Mostra file

File: u3l3_education_gdp_analysis.py Progetto: awaemmanuel/thinkful_datascience

def data_analysis_and_correlation(df_education, df_gdp):
    """ Analysis and Correlation education data with gdp. """
    print "[Data Analysis and Correlation of Education to GDP data] ==> Begin"
    common_countries = list(set(df_education['Country'].tolist()) & set(df_gdp['Country'].tolist()))
    gdp = []
    total_school_time = []
    men_school_time = []
    women_school_time = []
    for cntry in common_countries:
        df1 = df_education[df_education['Country'] == cntry]
        df2 = df_gdp[df_gdp['Country'] == cntry]
        if df2['GDP_'+ df1['Year'].iloc[0]].iloc[0] != '':
            total_school_time.append(int(df1['Total_School_Time'].iloc[0]))
            men_school_time.append(int(df1['Men_School_Time'].iloc[0]))
            women_school_time.append(int(df1['Women_School_Time'].iloc[0]))
            gdp.append(math.log(df2['GDP_'+ df1['Year'].iloc[0]].iloc[0]))
    df_edu_to_gdp = pd.DataFrame({'Total': total_school_time, 'Men': men_school_time, \
                                  'Women': women_school_time, 'GDP': gdp})    
    print df_edu_to_gdp.corr(), "\n"
    
    gdp_np_array = np.array(df_edu_to_gdp.GDP.tolist())
    for col in ['Women', 'Men', 'Total']:
        r_val, p_val = sp(gdp_np_array, np.array(df_edu_to_gdp[col].tolist()))
        print "Correlation of GDP against {}:".format(col)
        print "Pearsons correlation coefficient: {}".format(r_val)
        print "2-tailed p-values: {}\n".format(p_val)
        
    # Scatter matrix plot with histogram of data plots in the diagonal
    pd.scatter_matrix(df_edu_to_gdp, alpha=0.05, figsize=(10, 10), diagonal='hist')
    plt.savefig('figures/education_to_gdp/data_education_gdp_analysis.png')
    plt.clf()
#     
#         ==> Conclusion / Summary
#                    GDP       Men     Total     Women
#        GDP    1.000000  0.495794  0.479050  0.497923
#        Men    0.495794  1.000000  0.971663  0.942572
#        Total  0.479050  0.971663  1.000000  0.977217
#        Women  0.497923  0.942572  0.977217  1.000000
#       
    print """

Esempio n. 22

0

Mostra file

File: plots.py Progetto: mohanbolisetty/trans-seq

def scattermatrix(tables):
    """Draw a scatter matrix of the 'MEDIAN' column of several tables.

    :param tables: iterable of (name, frame) items; each frame is restricted
        to the index returned by common_index(tables) and contributes its
        'MEDIAN' column under `name`
    :return: the figure created at the start of the function

    NOTE(review): scatter_matrix is not given this figure or an axes from it,
    so confirm the returned figure is the one holding the plot.
    """
    fig = plot.figure(frameon=False,facecolor='white')
    index=common_index(tables)
    data=pd.DataFrame(index=index)
    for i in tables:
        # label-based selection; `.ix` has been removed from pandas
        data[i[0]]=i[1].loc[index]['MEDIAN']
    axs=pd.scatter_matrix(data, alpha=0.2, figsize=(8,8), diagonal='none', marker='.',)

    # first column: horizontal, wrapped y labels, no ticks, no grid
    for ax in axs[:,0]:
        # a real boolean is required: the string 'off' is truthy
        ax.grid(False, axis='both')
        ax.set_ylabel(wrap(ax.get_ylabel()), rotation=0, va='center', labelpad=30)
        ax.set_yticks([])
    # last row: vertical, wrapped x labels, no ticks, no grid
    for ax in axs[-1,:]:
        ax.grid(False, axis='both')
        ax.set_xlabel(wrap(ax.get_xlabel()), rotation=90)
        ax.set_xticks([])
    return fig

Esempio n. 23

0

Mostra file

File: plotting.py Progetto: twedlee/pandastable

    def _doplot(self, data, ax, kind, subplots, kwargs):
        """Do core plotting.

        Dispatches on `kind` to the matching plotting call and returns
        whatever that call returns. `kwargs` is a dict of plot options; it is
        modified in place for single-column data and for pie plots. Returns
        None for a bar plot with more than 400 rows.
        """

        cols = data.columns
        # number of layout rows: rounded square root of the column count
        rows = int(round(np.sqrt(len(data.columns)),0))
        if len(data.columns) == 1:
            kwargs['subplots'] = 0
        if kind == 'pie':
            kwargs['subplots'] = True
        # NOTE: this tests the `subplots` argument, not kwargs['subplots']
        if subplots == 0:
            layout = None
        else:
            layout=(rows,-1)
        if kind == 'bar':
            # hide the x axis for more than 50 rows, give up above 400
            if len(data) > 50:
                ax.get_xaxis().set_visible(False)
            if len(data) > 400:
                print ('too many bars to plot')
                return
        if kind == 'scatter':
            axs = self.scatter(data, ax, **kwargs)
            if kwargs['sharey'] == 1:
                # apply the first axes' y limits to every axes in the figure
                lims = self.fig.axes[0].get_ylim()
                for a in self.fig.axes:
                    a.set_ylim(lims)
        elif kind == 'boxplot':
            axs = data.boxplot(ax=ax, rot=kwargs['rot'], grid=kwargs['grid'])
            #boxplot won't accept required kwargs?
            if kwargs['logy'] == 1:
                ax.set_yscale('log')
        elif kind == 'histogram':
            # NOTE(review): `bins` is computed but not used afterwards;
            # kwargs['bins'] itself is still forwarded via **kwargs
            bins = int(kwargs['bins'])
            axs = data.plot(kind='hist',layout=layout, ax=ax, **kwargs)
        elif kind == 'heatmap':
            axs = self.heatmap(data, ax, kwargs)
        elif kind == 'bootstrap':
            axs = plotting.bootstrap_plot(data)
        elif kind == 'scatter_matrix':
            axs = pd.scatter_matrix(data, ax=ax, **kwargs)
        elif kind == 'hexbin':
            # the first two columns are used as x and y
            x = cols[0]
            y = cols[1]
            axs = data.plot(x,y,ax=ax,kind='hexbin',gridsize=20,**kwargs)
        else:
            # any other kind: let DataFrame.plot interpret kwargs
            axs = data.plot(ax=ax, layout=layout, **kwargs)
        return axs

Esempio n. 24

0

Mostra file

def realiseData():
    """Read the CSV at `csvPath` and show the scatter matrix of its columns."""
    frame = pd.read_csv(csvPath)
    pd.scatter_matrix(frame)
    plt.show()

Esempio n. 25

0

Mostra file

from sklearn.cluster import DBSCAN
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
"""
DBSCN
    describe:
       核心对象： 某个点的密度达到阈值(minPts)则为核心点
       邻域阈值(r)

       传销算法


"""

# one color per cluster label, looked up by integer index
colors = np.array(['red', 'green', 'blue', 'yellow'])

# Load the data
beer = pd.read_csv('./data/data.txt', sep=' ')
X = beer[["calories", "sodium", "alcohol", "cost"]]

# Cluster with DBSCAN (fit returns the estimator itself)
db = DBSCAN(eps=10, min_samples=2)
db.fit(X)

# attach the labels and look at the per-cluster means
beer['cluster_db'] = db.labels_
beer.groupby('cluster_db').mean()

# scatter matrix of the features, colored by cluster label
cluster_colors = colors[beer['cluster_db']]
pd.scatter_matrix(X, c=cluster_colors, figsize=(10, 10), s=100)
plt.show()

Esempio n. 26

0

Mostra file

# per-species summary statistics
df.groupby('species').agg(np.mean)
df.groupby('species').agg([np.min, np.max])
df.groupby('species').describe()

# explore data by sorting, looking for differences between species
# NOTE(review): sort_index(by=...) is a legacy spelling; newer pandas uses
# sort_values(by=...) -- confirm the targeted pandas version
df.sort_index(by='sepal_length').values
df.sort_index(by='sepal_width').values
df.sort_index(by='petal_length').values
df.sort_index(by='petal_width').values

# explore data visually, looking for differences between species
# NOTE(review): the bare name `species` is not defined in this excerpt
df.petal_width.hist(by=species, sharex=True)
df.boxplot(column='petal_width', by='species')
df.boxplot(by='species')
df.plot(x='petal_length', y='petal_width', kind='scatter', c=iris.target)
pd.scatter_matrix(df, c=iris.target)

## PART 2: Write a function to predict the species for each observation

# create a dictionary so we can reference columns by name
# (maps each column name to its positional index in df.columns)
col_ix = {col: index for index, col in enumerate(df.columns)}


# define function that takes in a row of data and returns a predicted species
def classify_iris(data):
    """Return the predicted species name for one row of measurements.

    `data` is indexed by position; the positions of 'petal_length' and
    'petal_width' are looked up in the module-level `col_ix` mapping.
    """
    petal_length = data[col_ix['petal_length']]
    if petal_length < 3:
        return 'setosa'

    petal_width = data[col_ix['petal_width']]
    return 'versicolor' if petal_width < 1.8 else 'virginica'

Esempio n. 27

0

Mostra file

File: OKC Assessment code.py Progetto: anilbulusu/All-NBA-selections

scaled_df.describe()


# In[21]:


# Correlation matrix
scaled_df.corr()


# In[22]:


# Correlation plots
pd.scatter_matrix(scaled_df, figsize=(22,22))
plt.show()


# In[23]:


# Correlation heatmap
sns.set(rc={'figure.figsize':(80,10)})

corr = scaled_df.corr()
ax = sns.heatmap(
    corr, 
    vmin=-1, vmax=1, center=0,
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True

Esempio n. 28

0

Mostra file

#!/usr/bin/env python2
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
from sklearn import linear_model
import matplotlib.pyplot as plt

# semicolon-separated red wine quality data
wine = pd.read_csv('/Users/Shared/py/winequality-red.csv', sep=';')

clf = linear_model.LinearRegression()

# features: every column except the target
X = wine.drop(['quality'], axis=1)

# target: the quality score
Y = wine['quality']

clf.fit(X, Y)

print(clf.coef_)
print(clf.intercept_)

# coefficients per feature, sorted ascending
print(
    pd.DataFrame({
        "Name": X.columns,
        "Coefficients": clf.coef_
    }).sort_values(by='Coefficients'))
plt.matshow(wine.corr())
pd.scatter_matrix(wine)
# NOTE(review): X holds several columns while Y is a single column; confirm
# that plt.scatter accepts these shapes or plot one feature at a time
plt.scatter(X, Y)

Esempio n. 29

0

Mostra file

File: fruits.py Progetto: suman12345678/datascienceworknew

lookup_fruit_name = dict(
    zip(fruits.fruit_label.unique(), fruits.fruit_name.unique()))
lookup_fruit_name

#split the data in test and traing with the target variable fruit_label,random_state like seed in R
X = fruits[['height', 'width', 'mass', 'color_score']]
y = fruits['fruit_label']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

#visualize data as pairs scatterplot of al independent variable relation with target
from matplotlib import cm
cmap = cm.get_cmap('gnuplot')
scatter = pd.scatter_matrix(X_train,
                            c=y_train,
                            marker='o',
                            s=40,
                            hist_kwds={'bins': 15},
                            figsize=(9, 9),
                            cmap=cmap)

#visualize in 3d
# plotting a 3D scatter plot
from mpl_toolkits.mplot3d import Axes3D

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(X_train['width'],
           X_train['height'],
           X_train['color_score'],
           c=y_train,
           marker='o',

Esempio n. 30

0

Mostra file

# several equivalent ways of replacing spaces in the column names
ufo_cols = ufo.columns.tolist()
ufo_cols = [names.replace(' ', '_') for names in ufo.columns.tolist()]
ufo_cols2 = [names.replace(' ', '_') for names in ufo.columns]
ufo.columns = ufo.columns.str.replace(' ', '_')
ufo.columns = ufo_cols
# attribute assignment (ufo.Location = ...) would not create a column;
# item assignment does
ufo['Location'] = ufo.City + ', ' + ufo.State

# group-by summaries of the users table
users = pd.read_table('u.user', sep='|', index_col='user_id')
users.groupby('occupation').count()
users.occupation.value_counts()
users.groupby('occupation').age.mean()
users.groupby('occupation').age.agg(['min', 'max'])
users.groupby(['occupation', 'gender']).age.mean()
users.groupby(['occupation', 'gender']).age.agg(['mean', 'count'])

import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (10, 8)

# DataFrame.sort() was removed from pandas; sort_values() is its replacement
drinks[['beer', 'wine']].sort_values('beer').values
drinks.plot(kind='scatter', x='beer', y='wine', alpha=.3)
plt.xlabel('Beer')
plt.ylabel('Wine')
pd.scatter_matrix(drinks[['beer', 'spirit', 'wine']], figsize=(10, 8))
plt.style.use('ggplot')
drinks.continent.value_counts().plot(kind='bar')
drinks.groupby('continent').mean().plot(kind='bar', figsize=(10, 8))
drinks.groupby('continent').mean().drop('liters', axis=1).plot(kind='bar')
drinks.groupby('continent').mean().drop('liters', axis=1).plot(kind='bar',
                                                               stacked=True)

Esempio n. 31

0

Mostra file

def scatterplot(data, title=None, color=None):
    """Show a KDE-diagonal scatter matrix of `data`.

    `color` is forwarded to scatter_matrix; `title`, when not None, is used
    as the figure's super-title.
    """
    matrix_options = {'alpha': 0.3, 'diagonal': 'kde', 'color': color}
    pd.scatter_matrix(data, **matrix_options)

    wants_title = title is not None
    if wants_title:
        plt.suptitle(title)

    plt.show()

Esempio n. 32

0

Mostra file

File: 06_regression_class.py Progetto: zehndec/DAT3

# Plot the data (similar to before)
plt.plot(x_prime, y_hat, 'r', linewidth=2, alpha=0.9)
"""
COMMON PROBLEMS - Multicollinearity
"""

# Now let's run a multiple linear regression
# The temp variable is no longer significant. Why? Multicollinearity
est_m = smf.ols(formula='cnt ~ atemp + temp + workingday + windspeed',
                data=bike_dat).fit()
est_m.summary()

# Scatter plot (observe the (unsurprising) correlation between atemp and temp)
cols = ['cnt', 'atemp', 'windspeed', 'weathersit', 'temp', 'workingday', 'hum']
pd.scatter_matrix(bike_dat[cols])

# Correlation coefficient matrix
corr_matrix = np.corrcoef(bike_dat[cols].T)
sm.graphics.plot_corr(corr_matrix, xnames=cols)

# Let's say we wanted to include an interaction term
# We would do this by including the ':' between interacting variables
est_m = smf.ols(formula='cnt ~ temp + windspeed + temp:windspeed + workingday',
                data=bike_dat).fit()

est_m.summary()

# An alternate way of specifying interaction terms
# a*b is equivalent to a + b + a:b
est_m = smf.ols(formula='cnt ~ temp*windspeed + workingday',

Esempio n. 33

0

Mostra file

File: lesson6_viz_ts.py Progetto: rachidalili/MS-BGD2015

# index the wiki data by parsed date and make the change counts numeric
wiki_data = wiki_data.set_index('Date')
wiki_data.index = wiki_data.index.map(lambda x : parse(x))
wiki_data['changes'] = wiki_data['changes'].astype(int)



death_data = pd.read_csv('CausesOfDeath_France_2001-2008.csv')
# strip spaces, then turn every remaining non-digit character into '0'
# before converting to int; the pattern is compiled once, not once per row
death_data['Value'] = death_data['Value'].str.replace(' ','')
_non_digit = re.compile(r'[^0-9]')
death_data['Value'] = death_data['Value'].apply(lambda x : int(_non_digit.sub('0',x)))
death_data = death_data[['ICD10','Value','SEX','TIME']]

# the five causes with the largest total; Series.order() was removed from
# pandas, sort_values() is its replacement
causes = death_data.groupby('ICD10')['Value'].sum().sort_values(ascending=False)[0:5].index.values

filtered = death_data[death_data['ICD10'].isin(causes)]

filtered_agg = filtered.groupby(['ICD10','TIME']).sum()

# one row per TIME, one column per cause; built once and plotted four ways
_deaths_by_time = filtered_agg.reset_index().pivot(index='TIME', columns='ICD10', values='Value')
_deaths_by_time.plot()
_deaths_by_time.plot(kind="bar")
_deaths_by_time.plot(kind="barh")
_deaths_by_time.plot(kind="barh", stacked=True)

# the row labelled 'STRING' is dropped before the numeric conversions
cars = pd.read_csv('cars.csv',sep=';',index_col=0).drop('STRING')
cars['MPG'] = cars['MPG'].astype(float)
cars['Cylinders'] = cars['Cylinders'].astype(float)
cars['Weight'] = cars['Weight'].astype(float)
cars['Acceleration'] = cars['Acceleration'].astype(float)
cars['Horsepower'] = cars['Horsepower'].astype(float)
pd.scatter_matrix(cars, diagonal='kde', color='k', alpha=0.3)

Esempio n. 34

0

Mostra file

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('ml-bank').getOrCreate()
df = spark.read.csv('bank.csv', header=True, inferSchema=True)
df.printSchema()

import pandas as pd

# first five rows, transposed for readability
pd.DataFrame(df.take(5), columns=df.columns).transpose()

# names of the columns whose Spark dtype is 'int'
numeric_features = [t[0] for t in df.dtypes if t[1] == 'int']
print(df.select(numeric_features).describe().toPandas().transpose())

numeric_data = df.select(numeric_features).toPandas()
axs = pd.scatter_matrix(numeric_data, figsize=(8, 8))
n = len(numeric_data.columns)
for i in range(n):
    # first column: horizontal, right-aligned y labels without ticks
    v = axs[i, 0]
    v.yaxis.label.set_rotation(0)
    v.yaxis.label.set_ha('right')
    v.set_yticks(())
    # last row: vertical x labels without ticks
    h = axs[n - 1, i]
    h.xaxis.label.set_rotation(90)
    h.set_xticks(())

df = df.select('age', 'job', 'marital', 'education', 'default', 'balance',
               'housing', 'loan', 'contact', 'duration', 'campaign', 'pdays',
               'previous', 'poutcome', 'deposit')
cols = df.columns
# printSchema() prints by itself and returns None; wrapping it in print()
# only added a stray "None" line to the output
df.printSchema()

Esempio n. 35

0

Mostra file

dataset['quality'].unique()#3-9

dataset.head()

dataset.tail()

#To find the statistical summary
dataset.describe()

#Univariate Analysis
dataset.hist()

#Multivariate Analysis
# pandas.tools.plotting was removed; pandas.plotting is the public location
from pandas.plotting import scatter_matrix

# use the function that was just imported
scatter_matrix(dataset)

#Group the dependent variable and independent variables
# the first 11 columns are the features, the 12th column is the target
array=dataset.values
X=array[:,0:11]
Y=array[:,11]

#Splitting the dataset into training set and test set
# sklearn.cross_validation was removed; train_test_split lives in
# sklearn.model_selection
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.20,random_state=0)


# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)

Esempio n. 36

0

Mostra file

File: 05_iris_exercise.py Progetto: shanniemurd/sklearn-basics

iris.petal_width.hist(by=iris.species, sharex=True)
iris.boxplot(column='petal_width', by='species')
iris.boxplot(by='species')

# map species to a numeric value so that plots can be colored by category
iris['species_num'] = iris.species.map({
    'Iris-setosa': 0,
    'Iris-versicolor': 1,
    'Iris-virginica': 2
})
iris.plot(kind='scatter',
          x='petal_length',
          y='petal_width',
          c='species_num',
          colormap='Blues')
pd.scatter_matrix(iris, c=iris.species_num)

## TASK 4

# If petal length is less than 3, predict setosa.
# Else if petal width is less than 1.8, predict versicolor.
# Otherwise predict virginica.

## BONUS


# define function that accepts a row of data and returns a predicted species
def classify_iris(row):
    if row[2] < 3:  # petal_length
        return 0  # setosa
    elif row[3] < 1.8:  # petal_width

Esempio n. 37

0

Mostra file

# mean of every column per cluster, with the cluster label as a column
centers = beer.groupby("cluster3").mean().reset_index()
print(centers)

# Visualize the clustering result (k=3)
from pandas import scatter_matrix
import matplotlib.pyplot as plt
import numpy as np

plt.rcParams['font.size'] = 14
# one color per cluster label, looked up by integer index
colors = np.array(['red', 'green', 'blue', 'yellow'])
plt.scatter(beer["calories"], beer["alcohol"], c=colors[beer["cluster3"]])

# overlay the per-cluster means as black '+' markers
plt.scatter(centers.calories,
            centers.alcohol,
            linewidths=3,
            marker='+',
            s=300,
            c='black')

plt.xlabel("Calories")
plt.ylabel("Alcohol")
plt.show()

# pairwise view of the four features, colored by cluster label
scatter_matrix(beer[["calories", "sodium", "alcohol", "cost"]],
               s=100,
               alpha=1,
               c=colors[beer["cluster3"]],
               figsize=(10, 10))
plt.suptitle("With 3 centroids initialized")
plt.show()

Esempio n. 38

0

Mostra file

print("X_test shape: {}".format(X_test.shape))
print("y_test shape: {}".format(y_test.shape))

# create dataframe from data in X_train
# label the columns using the strings in iris_dataset.feature_names

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import mglearn
iris_dataframe = pd.DataFrame(X_train, columns=iris_dataset.feature_names)
# create a scatter matrix from the dataframe, color by y_train
grr = pd.scatter_matrix(iris_dataframe,
                        c=y_train,
                        figsize=(15, 15),
                        marker='o',
                        hist_kwds={'bins': 20},
                        s=60,
                        alpha=.8,
                        cmap=mglearn.cm3)
# mglearn is a third-party package: pip install mglearn

# Building Your First Model: k-Nearest Neighbors
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=1)
#knn = KNeighborsClassifier(n_neighbors=7)

knn.fit(X_train, y_train)

# Making Predictions: a single hand-written sample with four feature values
X_new = np.array([[5, 2.9, 1, 0.2]])
print("X_new.shape: {}".format(X_new.shape))

Esempio n. 39

0

Mostra file

File: titanic_solution.py Progetto: krother/python3_grundlagenkurs

# count of names per (Survived, Pclass, Sex) group
sv = df.groupby(['Survived', 'Pclass', 'Sex'])['Name'].count()
# innermost level (Sex) becomes the columns of the bar chart
sv.unstack().plot.bar()
plt.savefig('bars_gruppen.png')


# 5. Paarplot
def make_col(x):
    """Map a survival flag to an RGB colour tuple.

    Returns red ``(1, 0, 0)`` when ``x == 0`` and blue ``(0, 0, 1)`` for
    every other value.
    """
    red, blue = (1, 0, 0), (0, 0, 1)
    return red if x == 0 else blue


col = df['Survived'].apply(make_col)
pd.scatter_matrix(df, c=col, figsize=(15, 15))
plt.savefig('paarplot.png')

# 7. Datenaufbereitung
del df['Cabin']
del df['Name']

df = df.dropna()

X = df[['Pclass', 'Age']].values
y = df['Survived'].values

# 8. Modell erstellen
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, random_state=42)

m = KNeighborsClassifier(n_neighbors=1)

Esempio n. 40

0

Mostra file

File: Challenge_ Data Visualization-184.py Progetto: JKChang2015/Data-Analysis-Python

# Two mirrored scatter plots of profitability against audience rating:
# ax1 puts profitability on the x axis, ax2 swaps the axes.
# The titles previously read "2017-2011", a reversed (impossible) year range;
# corrected to "2007-2011" -- NOTE(review): confirm against the dataset's years.
ax1.scatter(hollywood_movies["Profitability"], hollywood_movies["Audience Rating"])
ax1.set_xlabel("Profitability")
ax1.set_ylabel("Audience Rating")
ax1.set_title("Hollywood Movies, 2007-2011")
ax2.scatter(hollywood_movies["Audience Rating"], hollywood_movies["Profitability"])
ax2.set_xlabel("Audience Rating")
ax2.set_ylabel("Profitability")
ax2.set_title("Hollywood Movies, 2007-2011")
plt.show()


## 3. Scatter matrix - profitability and critic ratings ##

normal_movies = hollywood_movies[hollywood_movies["Film"] != "Paranormal Activity"]
filtered_movies = normal_movies[["Profitability","Audience Rating"]]
pd.scatter_matrix(filtered_movies,figsize = (6,6))
plt.show()


## 4. Box plot - audience and critic ratings ##

normal_movies.boxplot(column = ["Critic Rating","Audience Rating"])

## 5. Box plot - critic vs audience ratings per year ##

normal_movies = normal_movies.sort_values("Year")
fig = plt.figure(figsize = (8,4))
ax1 = fig.add_subplot(1,2,1)
ax2 = fig.add_subplot(1,2,2)
sns.boxplot(data=normal_movies[pd.notnull(normal_movies["Genre"])], x = "Year",y = "Critic Rating", ax = ax1)
sns.boxplot(data = normal_movies[pd.notnull(normal_movies["Genre"])], x = "Year", y = "Audience Rating", ax = ax2)

Esempio n. 41

0

Mostra file

File: linear_regression.py Progetto: subuv/Data-Science-Projects

    '640', '', '', '', '660', '', '', '', '680', '', '', '', '700', '720', '',
    '', '', '740', '', '', '', '760', '', '', '', '780', '', '', '', '800', '',
    '', '', '820', '', '', '', '840'
])
q0 = p.set_xlabel('FICO Score')
q1 = p.set_ylabel('Interest Rate %')
q2 = p.set_title('Lending Rate Plot')

#Create a new data frame with selected columns for analysing data
loansmin = loansdata.filter([
    'Interest.Rate', 'FICO.Score', 'Loan.Length', 'Monthly.Income',
    'Amount.Requested'
],
                            axis=1)

a = pd.scatter_matrix(loansmin, alpha=0.05, figsize=(10, 10), diagonal='hist')
# a = pd.scatter_matrix(loansmin,alpha=0.05,figsize=(10, 10), diagonal='kde')
# a = pd.scatter_matrix(loansmin,alpha=0.05,figsize=(8, 8), diagonal='kde')
# a = pd.scatter_matrix(loansmin,alpha=0.05,figsize=(12, 12), diagonal='kde')

interest_rate = loansmin['Interest.Rate']
loan_amount = loansmin['Amount.Requested']
fico_score = loansmin['FICO.Score']

y = np.matrix(interest_rate).transpose()
x1 = np.matrix(fico_score).transpose()
x2 = np.matrix(loan_amount).transpose()

x = np.column_stack([x1, x2])

X = sm.add_constant(x)

Esempio n. 42

0

Mostra file

File: ml_proc_20160525b.py Progetto: rghiglia/ML_Process

#
#    plt.xticks(np.arange(len(frame)), values)
#    plt.legend((nonsurv_bar[0], surv_bar[0]),('Did not survive', 'Survived'), framealpha = 0.8)
#
## Common attributes for plot formatting
#plt.xlabel(key)
#plt.ylabel('Number of Passengers')
#plt.title('Passenger Survival Statistics With \'%s\' Feature'%(key))
#plt.show()



# Then look at correlations
# This will also be quite problem-specific since mixture of variables are tricky
# In principle I'd like to see some joint stats
pd.scatter_matrix(data_trn, alpha=0.3, figsize=(5,6), diagonal='kde');
# In case of mixed data this really doesn't give you a good sense of relationships
# I guess you might split into continuous and categorical, but still how about the relationship between continuous and categorical?
# Note: L-shaped pairs of variables: if you sum or take the product you get stuff that is more constant or maybe linear, maybe it tells you something
# You have all kind of 'garbage' continuous with categorical or binary and 
# all combos of those

# Maybe you can try to see a pair and the class
clr = ['r', 'b', 'y', 'm', 'c', 'k']
col_i = 'SibSp'
col_j = 'Parch'
# Adding some random noise to distinguish the dots
Z = DataFrame(np.random.rand(nTrn,2), index=data_trn.index)
dxy = 0.45
for j in range(len(set(y_trn))):
    ix = y_trn==j

Esempio n. 43

0

Mostra file

File: 02_pandas.py Progetto: rajgottipati/gopi

drinks.plot(kind='scatter', x='beer_servings', y='wine_servings', alpha=0.3)

# same scatterplot, except point color varies by 'spirit_servings'
# note: must use 'c=drinks.spirit_servings' prior to pandas 0.15.0
drinks.plot(kind='scatter',
            x='beer_servings',
            y='wine_servings',
            c='spirit_servings',
            colormap='Blues')

# same scatterplot, except all European countries are colored red
colors = np.where(drinks.continent == 'EU', 'r', 'b')
drinks.plot(x='beer_servings', y='wine_servings', kind='scatter', c=colors)

# scatterplot matrix of all numerical columns
pd.scatter_matrix(drinks)
'''
Advanced Filtering (of rows) and Selecting (of columns)
'''

# loc: filter rows by LABEL, and select columns by LABEL
users.loc[1]  # row with label 1
users.loc[1:3]  # rows with labels 1 through 3
users.loc[1:3,
          'age':'occupation']  # rows 1-3, columns 'age' through 'occupation'
users.loc[:,
          'age':'occupation']  # all rows, columns 'age' through 'occupation'
users.loc[[1, 3], ['age',
                   'gender']]  # rows 1 and 3, columns 'age' and 'gender'

# iloc: filter rows by POSITION, and select columns by POSITION

Esempio n. 44

0

Mostra file

File: pres2012.py Progetto: wandergram/datsci

# performance in Paris
pres[pres.dep=="PARIS"]

''' VISUALIZATION '''

pres.ump.plot(kind='hist', bins=20)
pres.ps.plot(kind='hist', bins=20)
pres.fn.plot(kind='hist', bins=20)

pres[['ump', 'ps']].sort('ump').values
pres.plot(kind='scatter', x='ps', y='ump') # fits hypothesis: higher UMP votes, lower PS votes
pres.plot(kind='scatter', x='ump', y='fn') # line not as evident; but votes may have been interchangeable

# demonstration of vote distribution relationships between binomes
pd.scatter_matrix(pres[['ump', 'ps', 'fn']], figsize=(10, 8))

pres[['ump', 'ps', 'fn']].plot(kind='hist', stacked=True)

# testing hypothesis of voters "so far on the left they come out on the (far) right"
pd.scatter_matrix(pres[['fn', 'ug1', 'ug2']], figsize=(10, 8))
# ^^ it works!

pd.scatter_matrix(pres[['fn', 'ug2', 'ug3']], figsize=(10, 8))


'''
Data source: http://data.gouv.fr

Data desc:

Esempio n. 45

0

Mostra file

File: process_all_datasets.py Progetto: juanerolon/health-data-manip

    print("acc_train = {}, acc_test ={}".format(acc_train, acc_test))
    print("Confusion Matrix:\n{}\n\n {} \n".format(CML, CM))
    print("f1_train = {}, f1_test ={}".format(f1_train, f1_test))
    print("fbeta_train = {}, fbeta_test ={}".format(fb_train, fb_test))
    print("ROC_AUC_train = {}, ROC_AUC_test ={}".format(
        roc_auc_train, roc_auc_test))

############################################# Initial Visual Tests #####################################################
########## ScatterMatrixPlot ##########

if False:
    #Transformed features

    pd.scatter_matrix(biochemistry_data,
                      alpha=0.3,
                      figsize=(16, 8),
                      diagonal='kde')
    plt.show()

if False:

    from sklearn.cluster import KMeans
    from sklearn.metrics import silhouette_score

    from sklearn.decomposition import PCA

    ndims = 2
    dim_labels = []
    for i in range(1, ndims + 1):
        dim_labels.append("Dimension {}".format(i))

Esempio n. 46

0

Mostra file

File: ex9.py Progetto: zechfox/isl_exercises

fileName = r'../dataSet/Auto.csv'
#if 'coerce', then invalid parsing will be set as NaN
df = pd.read_csv(fileName)
df_numeric = df.apply(pd.to_numeric, args=('coerce',))
mask = ~np.isnan(df_numeric['cylinders'].values) & ~np.isnan(df_numeric['displacement'].values)\
       & ~np.isnan(df_numeric['horsepower'].values) & ~np.isnan(df_numeric['weight'].values)\
       & ~np.isnan(df_numeric['acceleration'].values) & ~np.isnan(df_numeric['year'].values)\
       & ~np.isnan(df_numeric['origin'].values)
X_raw = df_numeric[['cylinders','displacement','horsepower','weight','acceleration','year','origin']][mask]
y = df_numeric['mpg'][mask]

X = sm.add_constant(X_raw)
est = sm.OLS(y,X).fit()
print('Exercise 9 Answer:')
print('(a) see figure 1')
pd.scatter_matrix(df, alpha=0.5)
print('(b) ')
#correlations = np.corrcoef(pd.concat([y, X_raw], axis=1), rowvar=0)
correlations = np.corrcoef(df_numeric.loc[:,'mpg':'origin'][mask], rowvar=0)
print('(c)')
print(est.summary())
print('(c) i. The null-hypersis of all the regression coefficients are zero can be reject by large F-statistic with very small P-value.')
print('(c) ii. From P-value of each predictor, all predictor has statistically significant relationship to the response except cylinders, horsepower and acceleration.')
print('(c) iii. The coefficient of year show positive relationship. And increase of 1 year gain 0.7508 increase of mpg. It\'s means cars become more fuel efficient by year.')
print('(d) see figure 2.')
plt.figure(2)
# R's plot() for an lm object can generate 6 plots: residuals against fitted values, sqrt(|residuals|) against fitted values, Normal Q-Q plot,
# Cook's distances versus row labels, residuals against leverages, and Cook's distances against leverage. By default, the first 3 and the 5th are provided.
# Here we reproduce those default plots in Python.

#residuals vs fitted values

Esempio n. 47

0

Mostra file

File: schiffe.py Progetto: krother/python3_grundlagenkurs


# Aufgabe 2
#
# Verschaffe Dir einen Überblick
# über die Werte der Spalten *Art* und *Status*.
print("\nArten von Schiffen:")
print(df['Art'].value_counts())
print("\nStatus von Schiffen:")
print(df['Status'].value_counts())


# Aufgabe 3
#
# Schaue nach möglichen Korrelationen.
pd.scatter_matrix(df)
plt.savefig('matrix.png')


# Aufgabe 4
#
# Plotte Länge gegen Höhe als Streudiagramm.
df.plot.scatter('Länge', 'Höhe')
plt.savefig('scatter.png')


# Exercise 5
#
# One of the entries contains a **data error**.
print("\nEintrag mit Datenfehler:")
# Label-based row lookup: .ix was removed in pandas 1.0, .loc is the
# equivalent indexer for selecting a row by its index label.
print(df.loc['HMS Hood'].transpose())

Esempio n. 48

0

Mostra file

import scipy
import numpy
import matplotlib
import pandas
import sklearn

# Load dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class']
dataset = pandas.read_csv(url, names=names)

# head
print(dataset.head(20))

# descriptions
print(dataset.describe())

dataset.plot(kind='box',
             subplots=True,
             layout=(2, 2),
             sharex=False,
             sharey=False)
matplotlib.pyplot.show()

dataset.hist()
matplotlib.pyplot.show()

# scatter plot matrix
pandas.scatter_matrix(dataset)
matplotlib.pyplot.show()

Esempio n. 49

0

Mostra file

    score = r2_score(pred, y_test)

    scores.append(score)

#calculate mean of all 1000 scores
score = np.mean(scores)
print "\nR^2 score for predicting Milk is: ", score

#OBSERVATION
#A low R^2 value indicates that Milk cannot be predicted with much accuracy using all the features we have. However, since the value is positive, some of the features must carry information about it, so the model does fit the data to some extent. We should therefore keep this feature for identifying customer habits.

##################################################################################################################
''' VISUALIZATION OF FEATURE DATA'''

#visualize the data, with the diagonal showing each feature's distribution
pd.scatter_matrix(data, alpha=0.3, figsize=(14, 8), diagonal='kde')
#plt.show()
'''FEATURE SCALING USING LOG'''

# Scale the data using the natural logarithm
log_data = np.log(data)

# Scale the sample data using the natural logarithm
log_samples = np.log(samples)

# Produce a scatter matrix for each pair of newly-transformed features
pd.scatter_matrix(log_data, alpha=0.3, figsize=(14, 8), diagonal='kde')
plt.show()

print "\nScaled sampled data:\n"
print log_samples

Esempio n. 50

0

Mostra file

File: 8_Plotting_and_Visulization.py Progetto: holmes1313/python_for_data_science

# Scatter plots
# Both path fragments are raw strings: the second one starts with a
# backslash, and '\m' in a normal string is an invalid escape sequence.
# Adjacent literals inside the parentheses are concatenated implicitly,
# so no line-continuation backslash is needed.
macro = pd.read_csv(r'C:\Users\z.chen7\Downloads\Python\pyhton_for_data_science'
                    r'\macrodata.txt')
macro.head()

data = macro[['cpi','m1','tbilrate','unemp']]
data.head()

data.head()
trans_data = np.log(data).diff().dropna()

plt.scatter(trans_data['m1'], trans_data['unemp'])
plt.title('Changes in log %s vs. log %s' % ('m1','unemp'))

pd.scatter_matrix(trans_data, diagonal='kde', color='k', alpha=0.3)


#  Plotting map

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Both path fragments are raw strings: the second one starts with a
# backslash, and '\c' in a normal string is an invalid escape sequence.
# Adjacent literals inside the parentheses are concatenated implicitly,
# so no line-continuation backslash is needed.
data = pd.read_csv(r'C:\Users\z.chen7\Downloads\Python\pyhton_for_data_science'
                   r'\ch08_Haiti.csv')
data.info()
data.head()
data.shape
data.columns

data[['INCIDENT DATE', 'LATITUDE','LONGITUDE']][:10]

Esempio n. 51

0

Mostra file

File: Bokito_GapFractionCorrelation.py Progetto: itohanosa/CanopyGapPrediction

colors = Bok_GmGFs['VV-VH']

plt.scatter(Bok_GmGFs['gap_fraction'], Bok_GmGFs['VH-VVnorm'], c=colors, alpha=0.3, cmap='viridis')
plt.ylabel("Normalized VH-VV Backscatter (Gamma0 dB)")
plt.xlabel("Canopy Gap Fraction")
plt.colorbar();
plt.savefig("Correlation_VH-VVNormVsGapFraction.tiff", dpi=300)
plt.savefig("Correlation_VH-VVNormVsGapFraction.pdf", dpi=300)
#plt.legend()

# Drop the 'Year' column, then the first two remaining columns, before
# plotting. `axis` is passed by keyword: the positional form is deprecated
# and rejected by newer pandas releases.
Bok_GmGFs2 = Bok_GmGFs.drop('Year', axis=1)
Bok_GmGFs2 = Bok_GmGFs2.drop(Bok_GmGFs2.columns[[0, 1]], axis=1)

Bok_GmGFs2 = pd.DataFrame(Bok_GmGFs2)
pd.scatter_matrix(Bok_GmGFs2, alpha=0.2, figsize=(10, 10), diagonal='kde')
# Save first, then show: once an interactive window has been shown and closed
# the current figure is gone and savefig() would write an empty canvas.
plt.savefig("Scatter_Gamma0_Bands_GapFraction.tiff", dpi=300)
plt.savefig("Scatter_Gamma0_Bands_GapFraction.pdf", dpi=300)
# The original line was `plt.show` without parentheses, which only referenced
# the function and never displayed anything.
plt.show()


pp = sns.pairplot(data = Bok_GmGFs,
                  y_vars =['gap_fraction'],
                  x_vars = ['VH-VVnorm','VVVHratio','VV-VH'])
plt.savefig("GapFraction_PairPlot_meanGamma0GFstd.tiff", dpi=300)
plt.savefig("GapFraction_PairPlot_meanGamma0GFstd.pdf", dpi=300)

Bok_GmGFs.describe() # get summary statistics of each variable in Bok_GmGFs

'''
PlotID', 'SARdate', 'VHgamma0', 'VVgamma0', 'VHdb', 'VVdb', 'VV-VH',

Esempio n. 52

0

Mostra file

scatter_matrix(dataset)

# Plotting Graph
plt.scatter(dataset['total_rooms'], dataset['total_bedrooms'])
plt.show()

# Plot the straight line y = 0.7x + 5 for x in [-10, 10) in steps of 0.01.
# An argument-less plt.scatter() call used to precede this; scatter() requires
# x and y, so it raised TypeError before anything was drawn. It has been removed.
x = np.arange(-10, 10, 0.01)
y = 0.7 * x + 5
plt.plot(x, y)
plt.show()

y1 = 0.7 * x**2 + x + 8
plt.plot(x, y1)
plt.show()

sig_y = 1 / (1 + np.power(np.e, -x))
plt.plot(x, sig_y)
plt.show()

a = np.random.randn(10)
b = np.random.randn(5, 5)

pd.scatter_matrix(dataset.loc[:, :])
pd.show_versions(as_json=False)

corr_mat = dataset.corr()
sns.heatmap(corr_mat, annot=True)

np.arange(23, 55, 2)
np.linspace(0, 100, 6)

Esempio n. 53

0

Mostra file

File: 05_pandas.py Progetto: RobertMarton/DAT4

# boxplot of beer servings by continent (shows five-number summary and outliers)
drinks.boxplot(column="beer_servings", by="continent")

# scatterplot of beer servings versus wine servings
drinks.plot(kind="scatter", x="beer_servings", y="wine_servings", alpha=0.3)

# same scatterplot, except point color varies by 'spirit_servings'
# note: must use 'c=drinks.spirit_servings' prior to pandas 0.15.0
drinks.plot(kind="scatter", x="beer_servings", y="wine_servings", c="spirit_servings", colormap="Blues")

# same scatterplot, except all European countries are colored red
colors = np.where(drinks.continent == "EU", "r", "b")
drinks.plot(x="beer_servings", y="wine_servings", kind="scatter", c=colors)

# scatterplot matrix of all numerical columns
pd.scatter_matrix(drinks)


"""
Advanced Filtering (of rows) and Selecting (of columns)
"""

# loc: filter rows by LABEL, and select columns by LABEL
users.loc[1]  # row with label 1
users.loc[1:3]  # rows with labels 1 through 3
users.loc[1:3, "age":"occupation"]  # rows 1-3, columns 'age' through 'occupation'
users.loc[:, "age":"occupation"]  # all rows, columns 'age' through 'occupation'
users.loc[[1, 3], ["age", "gender"]]  # rows 1 and 3, columns 'age' and 'gender'

# iloc: filter rows by POSITION, and select columns by POSITION
users.iloc[0]  # row with 0th position (first row)

Esempio n. 54

0

Mostra file

File: Data Preparation.py Progetto: GajdoM/Thesis

ForwardU = Forward1.loc[Forward1.Status=='UFA',:]
ForwardR = Forward1.loc[Forward1.Status=='RFA',:]

# Correlation
# (This section header was a bare identifier, which raises NameError when the
# script runs; it is now a comment.)

#goalies
#correlation across category
Gcor = G1617.loc[:, ['Ovrl', 'SV%', 'Supp', 'ReMin', 'HighSV%', 'PP SV%', 'FA', 'SO SV%', 'Cap Hit', 'Ginj']]
Gcor.corr()
plt.matshow(Gcor.corr())
plt.xticks(range(len(Gcor.columns)), Gcor.columns, fontsize=10, color='blue', rotation = 'vertical')
plt.yticks(range(len(Gcor.columns)), Gcor.columns, fontsize=10, color='blue')
plt.colorbar()
plt.show()

pd.scatter_matrix(Gcor, alpha= 0.4, figsize=(7, 7), s=20, marker = '.', edgecolors = 'blue')
plt.show()

#correlation from one select category
Gcor2 = G1617.loc[:, ['GP', 'W', 'L', 'SA', 'SV', 'GA', 'SV%']]
Gcor2.cov()
plt.matshow(Gcor2.corr())
plt.xticks(range(len(Gcor2.columns)), Gcor2.columns, fontsize=10, color='blue', rotation = 'vertical')
plt.yticks(range(len(Gcor2.columns)), Gcor2.columns, fontsize=10, color='blue')
plt.colorbar()
plt.show()

pd.scatter_matrix(Gcor2, alpha= 0.4, figsize=(7, 7), s=20, marker = '.', edgecolors = 'blue')
plt.show()

#players

Esempio n. 55

0

Mostra file

File: Plotting.py Progetto: AntHar/DAT4-students


'''
PLOTS
'''
'''
Creates a df with only the numerical columns for a scatter matrix

RESULT: Nearly all of the independent variables follow some sort of power law distribution
'''
Numerical_df = Master_df[['Num_Adv_Event','Num_Serious',
                'Num_Other','Num_Life_Threat','Num_Hosp',
                'Num_Congen_Anom','Num_Disable','Num_Deaths',
                'Num_Male','Num_Female','AE_Per_Year','Adj_Num_AE', 'Adj_Per_Year']]

pd.scatter_matrix(Numerical_df, diagonal='kde')

'''
Correlation matrix

RESULT: Num_Adv_Event is highly correlated (>0.60 with every other column
except for Num_Congen_Anom, Num_Disable and Num_Deaths
'''
# Pairwise correlation of the columns of Master_df, written out as CSV.
Corr_matrix = Master_df.corr()
# Raw string: in a normal Python 3 string literal '\U' starts a unicode escape,
# so 'C:\Users\...' is a SyntaxError. The raw form denotes the same path.
Corr_matrix.to_csv(r'C:\Users\jonbryan90\Desktop\Corr_Matrix')

'''
Density plots by Innovation_Cat for the promising variabes (Num_Adv_Event, Num_Congen, Num_Disabe, Num_Deaths)
'''
Master_df.groupby('Innovation_Cat').Num_Adv_Event.plot(kind='kde',
                                                      linewidth=2.5,

Esempio n. 56

0

Mostra file

File: EmailCampaign.py Progetto: SanjeevSukumaran1990/emailcampaignprediction

        'id', 'RR', 'C_S', 'U_U_C', 'A_D_R_R', 'a_d_i_r', 'a_d_a_r_r',
        'a_u_d_a_r_r', 'mb_s', 'mb_e', 'mb_sub', 'mb_esec', 'mb_inp',
        'mb_insec', 'mb_uneng', 'mb_idles'
    ]
    ax.set_xticklabels(labels, fontsize=10)
    ax.set_yticklabels(labels, fontsize=6)
    ax.matshow(corr)

    plt.xticks(range(len(corr.columns)), corr.columns)
    plt.yticks(range(len(corr.columns)), corr.columns)


plot_corr(input, 15)

from pandas import scatter_matrix
scatter_matrix(input, diagonal='kde')

san = input.corr()
corr = pd.DataFrame(san)
#plotting categorical variables

san = input.day
san.value_counts().plot(kind='bar')

#looking for unique domains:
len(set(input.from_domain_hash))

#sendex approach

# anova test for weekly data
from statsmodels.formula.api import ols

Esempio n. 57

0

Mostra file

File: linear_regression.py Progetto: peterstephens4/linear_regression

#  Clean Data:  Remove null value rows
loansData.dropna(inplace=True)

loansData['Interest.Rate'] = loansData['Interest.Rate'].map(lambda x: float(x.rstrip('%')))
loansData['Loan.Length']   = loansData['Loan.Length'].map(lambda x: int(x.rstrip('months')))
loansData['FICO.Score']    = loansData['FICO.Range'].map(lambda x: int(x.split('-')[0]))

#  Create Histogram of FICO scores 
plt.figure()
a = loansData['FICO.Score'].hist()
plt.savefig("Bar_Plot_FICO_Score.png")

#  Create Scatter Matrix of loan data
plt.figure()
a = pd.scatter_matrix(loansData, alpha=0.05, figsize=(10,10), diagonal='hist')
plt.savefig("Scatter_Matrix_Loan_Data.png")

#  Create Scatter Plot of loan data (FICO vs Interest Rate)
plt.figure()
a = loansData.plot.scatter(x = 'FICO.Score', y = 'Interest.Rate')
plt.savefig("Scatter_Plot_Loan_Data.png")

# The dependent variable
y = np.matrix(loansData['Interest.Rate']).transpose()

# The independent variables shaped as columns
x1 = np.matrix(loansData['FICO.Score']).transpose()
x2 = np.matrix(loansData['Amount.Requested']).transpose()
x = np.column_stack([x1,x2])

Esempio n. 58

0

Mostra file

File: c5_visualize_data.py Progetto: anuj3918/machine_learning

9. class = Class variable (0 or 1) 
'''

names = [
    'preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'
]
dataframe = pd.read_csv(url, names=names)
print type(dataframe)

# df_head = dataframe.head()
# print df_head

# df_shape = dataframe.shape
# print df_shape

# df_dtypes = dataframe.dtypes
# print df_dtypes

# df_describe = dataframe.describe()
# print df_describe

# df_correlation = dataframe.corr()
# print df_correlation

plt.figure()
# dataframe.plot.hist(by='age')
# dataframe['age'].plot.hist()
# dataframe.plot.box(by='age')
# dataframe.plot(kind='box')
pd.scatter_matrix(dataframe)
plt.show()

Esempio n. 59

0

Mostra file

File: knn.py Progetto: btrani/projects

#Calculate average sale price by zip code as proxy for zip code
avg_by_zip = df.groupby(['ZIP CODE'])['SALE PRICE'].median().reset_index()
avg_by_zip.columns = ['ZIP CODE', 'avg_sale_by_zip']
df = pd.merge(df, avg_by_zip, on='ZIP CODE', how='outer')

#Transform sale price using log normal function to normalize data
def log(x):
    """Return the natural logarithm of ``x`` by delegating to ``math.log``."""
    natural_log = math.log(x)
    return natural_log

df['log_sale'] = df['SALE PRICE'].apply(log)
df['log_avg_sale'] = df['avg_sale_by_zip'].apply(log)
df['gsf_log'] = df['GROSS SQUARE FEET'].apply(log)

#Investigate potential relationships via scatter matrix
a = pd.scatter_matrix(df, figsize = (10,10), diagonal='hist')

#Split into train and test data sets
labels = df['log_sale']
df_clean = df[['TOTAL UNITS', 'avg_sale_by_zip', 'GROSS SQUARE FEET']]

X_train, X_test, y_train, y_test = train_test_split(df_clean, labels, \
test_size=0.2, random_state=0)

#Prep independent and dependent variables for regression
y = np.matrix(y_train).transpose()

#Fit the OLS model
# NOTE(review): X is built from X_train via sm.add_constant, but the model on
# the next line is fitted on X_train, so within the code shown here X is never
# used. Confirm whether sm.OLS(y, X) was intended.
X = sm.add_constant(X_train)
model = sm.OLS(y, X_train)
fitted = model.fit()

Esempio n. 60

0

Mostra file

File: knn.py Progetto: kkakade6/machine_learning

from sklearn.pipeline import Pipeline  #imputing within a pipeline
from sklearn.svm import SVC  #support vector classification

plt.style.use('ggplot')

iris = datasets.load_iris()
type(iris)
print(iris.keys())
type(iris.data), type(iris.target)
iris.data.shape
iris.target_names
X = iris.data
y = iris.target
df = pd.DataFrame(X, columns=iris.feature_names)
print(df.head())
_ = pd.scatter_matrix(df, c=y, figsize=[8, 8], s=150, marker='D')

knn = KNeighborsClassifier(n_neighbors=6)
knn.fit(X, y)
y_pred = knn.predict(X)
new_prediction = knn.predict(X)
print("Prediction: {}".format(new_prediction))
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=21,
                                                    stratify=y)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
confusion_matrix(y_test, y_pred)
classification_report(y_test, y_pred)