Esempio n. 1
0
    def plot_feature_relationships(self, df: pd.DataFrame, cols=None, postfix='') -> None:
        """Save a scatterplot matrix of every non-date column of ``df``.

        Args:
            df: Frame to plot. Its ``date`` column is dropped first; all
                remaining columns are plotted.
            cols: Axis labels handed to ``scatterplotmatrix`` as ``names``.
                When omitted (or empty) the remaining column names of ``df``
                are used. Note that ``cols`` only relabels the axes -- it
                does not select a subset of columns.
            postfix: Text inserted between ``feature_relationships`` and
                ``.png`` in the output file name.
        """
        no_date_df = df.drop(columns=['date'])

        # ``None`` replaces the former mutable ``[]`` default; a missing or
        # empty value still falls back to the frame's own column names.
        if not cols:
            cols = list(no_date_df.columns)

        scatterplotmatrix(no_date_df.values, names=cols, alpha=0.7)
        plt.savefig(f'feature_relationships{postfix}.png')
Esempio n. 2
0
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
import matplotlib.pyplot as plt
from mlxtend.plotting import scatterplotmatrix
import numpy as np
from mlxtend.plotting import heatmap

# Read the CSV and give its nine columns short names: eight features + outcome
features = ['PREG', 'GLU', 'BP', 'SKIN', 'INSU', 'BMI', 'DPF', 'AGE']
diabetes = pd.read_csv('diabetes.csv', header=0)
diabetes.columns = features + ['OUT']
X = diabetes[features].values
y = diabetes['OUT'].T

# Exploratory plots over all columns: correlation heatmap, then a
# pairwise scatterplot matrix
cm = np.corrcoef(diabetes.values.T)
hm = heatmap(
    cm,
    row_names=diabetes.columns,
    column_names=diabetes.columns,
)

scatterplotmatrix(
    diabetes.values,
    names=diabetes.columns,
    figsize=(10, 8),
    alpha=0.4,
)
plt.show()
import matplotlib.pyplot as plt

# In[6]:

from mlxtend.plotting import scatterplotmatrix

# In[7]:

# Columns included in the pairwise scatterplot matrix
cols = ['LSTAT', 'INDUS', 'NOX', 'RM', 'MEDV']

# In[ ]:

# In[8]:

# Draw the matrix for the selected columns, tighten the layout, display it
scatterplotmatrix(
    df[cols].values,
    names=cols,
    figsize=(10, 8),
    alpha=0.5,
)
plt.tight_layout()
plt.show()

# In[9]:

from mlxtend.plotting import heatmap

# In[10]:

import numpy as np

# In[11]:

# Correlation coefficients between the selected columns (the transpose puts
# one column per row, which is the layout np.corrcoef reads by default)
cm = np.corrcoef(df[cols].values.transpose())

# Script entry point
if __name__ == '__main__':
    # Read the complete spreadsheet
    fileName = 'heights_weights.xlsx'
    dataFull = loadData(fileName, '.xlsx')
    print('\n')

    # Keep only the second and third columns of the data set
    columnNames = [dataFull.columns[1], dataFull.columns[2]]

    # Pairwise scatterplots of the two selected columns
    scatterplotmatrix(
        dataFull[columnNames].values,
        names=columnNames,
        figsize=(10, 9),
        alpha=0.5,
    )
    plt.tight_layout()
    plt.show()

    # Feature and target arrays
    X = dataFull[columnNames[0]].values  # height
    y = dataFull[columnNames[1]].values  # weight

    # 80% training / 20% test split with a fixed seed
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)

    # Append a trailing axis to each of the four split arrays
    X_train, X_test, y_train, y_test = (
        part[:, np.newaxis]
        for part in (X_train, X_test, y_train, y_test)
    )
# Avg. Area Number of Bedrooms 0
# Area Population 0
# Price 0
# Address 0

# Section banner
print(
    colored(
        '==================================VISUALIZATION=========================================================',
        'white'))

print(colored('SCATTERPLOT OF VARIABLES', 'red'))
# The six numeric housing columns to compare pairwise
cols = [
    'Avg. Area Income', 'Avg. Area House Age', 'Avg. Area Number of Rooms',
    'Avg. Area Number of Bedrooms', 'Area Population', 'Price'
]
scatterplotmatrix(df1[cols].values, figsize=(40, 32), names=cols, alpha=0.5)
# The caption describes the whole grid, so attach it to the figure:
# plt.title() would only title pyplot's current axes (a single subplot).
plt.suptitle('Scatterplot of 6 Variables in a Housing File')
plt.tight_layout()
plt.savefig('Scatterplot_of_6_housing_variables.png')

# The figure is saved but not displayed.
# TODO: decide whether a plot this large should also be shown on screen.
#plt.show()

print(colored('PRINTING DATA DESCRIPTION', 'red'))
# Summary statistics for the numeric columns: count, mean, standard
# deviation, minimum, 25th/50th/75th percentiles and maximum. The 50th
# percentile is the median, which is reported separately from the mean.
# describe() prints the column labels as the header row above the statistics.
# Column names: 'Avg. Area Income', 'Avg. Area House Age', 'Avg. Area Number of Rooms', 'Avg. Area Number of Bedrooms', 'Area Population', 'Price'.
print(df1.describe())

print(
    colored(
Esempio n. 6
0
 def scatter_plot(self, data, cols):
     """Show a scatterplot matrix of the ``cols`` columns of ``data``.

     Prints the markers ``1``, ``2`` and ``3`` to stdout before plotting,
     after the matrix is built, and after ``plt.show()`` returns.
     """
     print("1")
     selected_values = data[cols].values
     scatterplotmatrix(
         selected_values,
         names=cols,
         figsize=(10, 10),
         alpha=0.5,
     )
     print("2")
     plt.show()
     print('3')