Example #1
0
def iris_prep(cached=True):
    """Load the iris data and return it with species dummy columns appended.

    The ``species_id`` and ``species_id.1`` columns are removed and
    ``species_name`` is renamed to ``species`` before encoding.

    ``cached`` is accepted for call compatibility but is never read here.
    """
    raw = acquire.get_iris_data()
    trimmed = raw.drop(columns=['species_id', 'species_id.1'])
    trimmed = trimmed.rename(columns={'species_name': 'species'})
    # drop_first=True omits the indicator column of the first species level.
    indicators = pd.get_dummies(trimmed['species'], drop_first=True)
    return pd.concat([trimmed, indicators], axis=1)
def prep_iris():
    """Fetch the iris data and return it with species indicator columns.

    Drops ``species_id``, renames ``species_name`` to ``species`` and appends
    the ``drop_first=True`` dummy columns built from ``species``.
    """
    iris = get_iris_data()
    iris = iris.drop(columns='species_id')
    iris = iris.rename(columns={'species_name': 'species'})
    indicator_cols = pd.get_dummies(iris['species'], drop_first=True)
    return pd.concat([iris, indicator_cols], axis=1)
Example #3
0
def prep_iris():
    """Load the iris data and return it with encoded species columns.

    Steps:
      1. Drop the ``species_id`` column.
      2. Rename ``species_name`` to ``species``.
      3. Append the dummy columns for ``species`` (first level dropped).

    Returns:
        pandas.DataFrame: the cleaned frame with the dummy columns
        concatenated on the right.
    """
    df = acquire.get_iris_data()
    cols_to_drop = ['species_id']
    df = df.drop(columns=cols_to_drop)
    df = df.rename(columns={"species_name": "species"})
    # Passing a one-column frame (double brackets) makes get_dummies prefix
    # the new columns with "species_".
    df_dummies = pd.get_dummies(df[['species']], drop_first=True)
    df = pd.concat([df, df_dummies], axis=1)
    return df
Example #4
0
def prep_iris():
    """Return the iris data with one indicator column per species value.

    ``species_id`` is dropped and ``species_name`` becomes ``species``; every
    species level keeps its own dummy column (no level is dropped).
    """
    cleaned = (
        acquire.get_iris_data()
        .drop(columns=['species_id'])
        .rename(columns={'species_name': 'species'})
    )
    one_hot = pd.get_dummies(cleaned['species'])
    return pd.concat([cleaned, one_hot], axis=1)
Example #5
0
def prep_iris():
    """Clean the iris data, split it 70/30 and encode the species column.

    ``species_id`` and ``measurement_id`` are dropped and ``species_name`` is
    renamed to ``species`` before the split. Both parts are then passed
    through ``encode_species`` and returned as ``(train, test)``.
    """
    frame = acquire.get_iris_data()
    frame = frame.drop(columns=['species_id', 'measurement_id'])
    frame = frame.rename(columns={'species_name': 'species'})

    # Fixed random_state keeps the split reproducible between runs.
    parts = sklearn.model_selection.train_test_split(
        frame, random_state=123, train_size=.7)
    train, test = parts
    train, test = encode_species(train, test)
    return train, test
Example #6
0
def clean_iris():
    """Return the de-duplicated iris data with ``species_*`` dummy columns.

    Duplicate rows are removed, ``species_id`` and ``measurement_id`` are
    dropped, and ``species_name`` is renamed to ``species`` before encoding.
    """
    iris = acquire.get_iris_data()
    # De-duplicate in place, on the very frame that acquire handed back.
    iris.drop_duplicates(inplace=True)
    iris = iris.drop(columns=['species_id', 'measurement_id'])
    iris = iris.rename(columns={'species_name': 'species'})
    species_flags = pd.get_dummies(iris[['species']])
    return pd.concat([iris, species_flags], axis=1)
Example #7
0
def prep_iris():
    """Return the iris data with ``species`` replaced by integer labels.

    ``species_id`` and ``measurement_id`` are dropped and ``species_name`` is
    renamed to ``species`` before a ``LabelEncoder`` is fitted on it.
    """
    iris = acquire.get_iris_data()
    iris = iris.drop(columns=['species_id', 'measurement_id'])
    iris = iris.rename(columns={'species_name': 'species'})

    label_encoder = LabelEncoder()
    label_encoder.fit(iris['species'])
    iris['species'] = label_encoder.transform(iris['species'])
    return iris
def prep_iris():
    """Return the iris data with the species column label-encoded.

    The id columns are removed in place from the acquired frame, then
    ``species_name`` is renamed to ``species`` and overwritten with the
    integer codes produced by a ``LabelEncoder``.
    """
    frame = acquire.get_iris_data()
    frame.drop(columns=['species_id', 'measurement_id'], inplace=True)
    frame = frame.rename(columns={"species_name": "species"})

    codes = LabelEncoder().fit_transform(frame['species'])
    frame['species'] = codes
    return frame
def prep_iris(inverse_transform=False):
    """Load the iris data, label-encode ``species`` and return the encoder.

    Args:
        inverse_transform: when true, the encoded ``species`` column is
            decoded back to the original labels before returning.

    Returns:
        tuple: ``(df_iris, encoder)`` - the prepared frame and the fitted
        ``LabelEncoder``.
    """
    df_iris = acquire.get_iris_data()
    df_iris.drop(columns=['species_id', 'measurement_id'], inplace=True)
    df_iris.rename(columns={'species_name': 'species'}, inplace=True)

    encoder = LabelEncoder()
    encoder.fit(df_iris['species'])
    df_iris['species'] = encoder.transform(df_iris['species'])

    if inverse_transform:
        # Assign the decoded array positionally. Wrapping it in pd.Series
        # would align on a fresh 0..n-1 index and leave NaN in every row
        # whose index label falls outside that range.
        df_iris['species'] = encoder.inverse_transform(df_iris['species'])
    return df_iris, encoder
Example #10
0
def prep_iris():
    """Return the iris data with species dummies and two area features.

    Drops ``species_id``, ``measurement_id`` and ``species_id.1``, renames
    ``species_name`` to ``species``, appends one indicator column per species
    and adds ``petal_area`` / ``sepal_area`` (length times width).
    """
    frame = acquire.get_iris_data()
    unwanted = ['species_id', 'measurement_id', 'species_id.1']
    frame.drop(columns=unwanted, inplace=True)
    frame.rename(columns={"species_name": "species"}, inplace=True)

    species_flags = pd.get_dummies(frame['species'])
    frame = pd.concat([frame, species_flags], axis=1)

    for part in ('petal', 'sepal'):
        frame[part + '_area'] = frame[part + '_length'] * frame[part + '_width']
    return frame
Example #11
0
def prep_iris(cached = True):
    '''
    Acquire the iris data and return it with species dummy columns.

    cached is forwarded unchanged to get_iris_data. The original note says
    True reads a local csv and False pulls fresh data from the Codeup db and
    rewrites the csv; that behavior lives in get_iris_data and is not
    verifiable from this function.

    species_id and measurement_id are dropped, species_name is renamed to
    species, and one species_* indicator column per species is appended.
    '''
    iris = get_iris_data(cached)
    iris = iris.drop(columns=['species_id', 'measurement_id'])
    iris = iris.rename(columns={'species_name': 'species'})
    species_flags = pd.get_dummies(iris[['species']], dummy_na=False)
    return pd.concat([iris, species_flags], axis=1)
Example #12
0
def clean_iris():
    '''
    Load the iris data with get_iris_data and return one cleaned dataframe.

    A species column is created from species_name (so it lands at the end of
    the frame), then these columns are removed in place:
        species_id: species_name carries the same information
        measurement_id: redundant with the index
        species_name: superseded by species
    Finally the species_* dummy columns (first level dropped) are appended.
    '''
    iris = get_iris_data()
    iris['species'] = iris['species_name']
    iris.drop(columns=['species_id', 'measurement_id', 'species_name'],
              inplace=True)
    species_flags = pd.get_dummies(iris[['species']], drop_first=True)
    combined = pd.concat([iris, species_flags], axis=1)
    return combined
def prep_iris_data(cached=True):
    """Return the iris data with dummy columns for ``species``.

    ``cached`` is passed straight through to ``get_iris_data``. The
    ``species_id`` column is dropped and ``species_name`` is renamed to
    ``species``; the first species level gets no indicator column.
    """
    iris = get_iris_data(cached)

    # Tidy the columns before encoding.
    iris = iris.drop(columns='species_id')
    iris = iris.rename(columns={'species_name': 'species'})

    # Encode species and attach the indicator columns on the right.
    encoded = pd.get_dummies(iris['species'], drop_first=True)
    return pd.concat([iris, encoded], axis=1)
def wrangle_iris_data():
    """
    Acquire the iris data, pass it to prepare.prep_iris_data and scale the
    resulting splits with scale_iris.

    Returns a 6-tuple: train, validate, test, followed by the three values
    unpacked from scale_iris(train, validate, test).
    """
    raw = acquire.get_iris_data()
    # prep_iris_data's result is unpacked as (train, test, validate); every
    # later use follows the (train, validate, test) order instead.
    train, test, validate = prepare.prep_iris_data(raw)
    scaled = scale_iris(train, validate, test)
    train_scaled, validate_scaled, test_scaled = scaled
    return train, validate, test, train_scaled, validate_scaled, test_scaled


####### NOTE: to call wrangle_iris_data
##### train, validate, test, train_scaled, validate_scaled, test_scaled = wrangle_iris_data()
def clean_iris():
    '''
    clean_iris loads the iris data with get_iris_data and removes species_id
    and measurement_id. The species_name column is replaced by a 'species'
    column (added at the end of the frame). Finally, dummy columns are
    created for every species value and concatenated onto the frame so that
    everything comes back as one dataframe.

    return: single cleaned dataframe
    '''
    df = get_iris_data()
    df['species'] = df.species_name
    dropcols = ['species_id', 'measurement_id', 'species_name']
    df.drop(columns=dropcols, inplace=True)
    # drop_first=False keeps an indicator column for every species level.
    dummies = pd.get_dummies(df[['species']], drop_first=False)
    return pd.concat([df, dummies], axis=1)
def prep_iris(df=None):
    """
    Return a transformed copy of the iris dataset for exploratory analysis.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Untransformed iris data with ``species_id``, ``measurement_id`` and
        ``species_name`` columns. When omitted, the data is loaded with
        ``get_iris_data()`` at call time, not once at definition time, so
        repeated calls without an argument keep working.

    Returns
    -------
    pandas.DataFrame
        A new frame without the two id columns, with ``species_name`` renamed
        to ``species`` and the species dummy columns appended. The frame
        passed in is left unmodified.
    """
    if df is None:
        df = get_iris_data()

    # Drop columns of redundant data or 'index-like'/ordinal rows. A new
    # frame is built here so the caller's data is never changed.
    df = df.drop(columns=['species_id', 'measurement_id'])

    # Rename species_name to be concise.
    df = df.rename(columns={'species_name': 'species'})

    # Create dummy variables for the target. drop_first=True leaves the
    # first species level without a column: its rows are all zeros.
    encoded_species = pd.get_dummies(df.species, drop_first=True)

    # Add the encoded target names as columns to the dataframe.
    return pd.concat([df, encoded_species], axis=1)
Example #17
0
def prep_iris_data(splain=local_settings.splain, **kwargs):
    '''
    prep_iris_data(splain=local_settings.splain, **kwargs)
    RETURNS: df, encoder

    Iris Data

    1. Load the iris data with get_iris_data(type='sql', splain=splain).
    2. Drop the measurement_id column.
    3. Rename the species_name column to just species.
    4. Encode the species column with encode_col, which also hands back the
       encoder (presumably a sklearn label encoder - confirm in encode_col).

    NOTE(review): the exercise this was written for also asks for species_id
    to be dropped; this function does not drop it - confirm whether the
    acquired data contains that column.

    Extra keyword arguments are accepted but not used.
    '''
    iris = get_iris_data(type='sql', splain=splain)
    iris = iris.drop(columns='measurement_id')
    iris = iris.rename(columns={'species_name': 'species'})
    encoded = encode_col(df=iris, col='species')
    iris, encoder = encoded
    return iris, encoder
Example #18
0
def prep_iris(cached=True):
    '''
    Acquire the iris data and return it with dummy variables encoding species.

    cached is handed to get_iris_data as-is. The original note says True
    reads a local csv and False pulls fresh data from the Codeup db and
    rewrites the csv; that behavior is implemented in get_iris_data, not here.
    '''
    cleaned = (
        get_iris_data(cached)
        .drop(columns='species_id')
        .rename(columns={'species_name': 'species'})
    )

    # One indicator column per species level except the first.
    flags = pd.get_dummies(cleaned['species'], drop_first=True)

    return pd.concat([cleaned, flags], axis=1)
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
from acquire import get_iris_data
from prepare import prep_iris

# Single seed shared by the split and the model so runs are repeatable.
seed = 43

# Load the raw data and prepare it; prep_iris is unpacked here as
# (prepared frame, encoder).
iris = get_iris_data()
iris, encoder = prep_iris(iris)

# Features are the four measurements; the target is kept as a one-column
# DataFrame (double brackets), not a Series.
X = iris[['sepal_width', 'sepal_length', 'petal_width', 'petal_length']]
y = iris[['species']]

# 70/30 train/test split.
# NOTE(review): train_test_split and LogisticRegression are not imported in
# the lines visible here - confirm they are imported elsewhere.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=.30,
                                                    random_state=seed)

# Fit a logistic regression on the training split.
logit = LogisticRegression(random_state=seed, solver='saga')
logit.fit(X_train, y_train)

# Every metric below is computed on the training split only; X_test and
# y_test are not used in this snippet.
y_pred = logit.predict(X_train)
score = logit.score(X_train, y_train)
cm = confusion_matrix(y_train, y_pred)
cr = classification_report(y_train, y_pred)
Example #20
0
# Using the Iris Data:
#
# Use the function defined in acquire.py to load the iris data.
#
# Drop the species_id and measurement_id columns.
#
# Rename the species_name column to just species.
#
# Create dummy variables of the species name.
#
# Create a function named prep_iris that accepts the untransformed iris data, and returns the data with the transformations above applied.

# In[3]:

# Use the function defined in acquire.py to load the iris data.
iris = acquire.get_iris_data()
# Bare expression: only displays the first rows when run as a notebook cell.
iris.head()

# In[4]:

# Drop the species_id and measurement_id columns.
cols_to_drop = ['species_id', 'measurement_id']
iris = iris.drop(columns=cols_to_drop)

# In[5]:

# Bare expression: shows the remaining column names to confirm the drop.
iris.columns

# In[6]:

# Rename the species_name column to just species.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

from acquire import get_iris_data
from prepare import prepare_iris_data

# Acquire the raw iris data and run it through the project's prepare step.
df = prepare_iris_data(get_iris_data())

# Every column except species is a feature; the target stays a one-column
# DataFrame (double brackets), not a Series.
X = df.drop(columns=["species"])
y = df[['species']]

# 70/30 train/test split with a fixed seed.
# NOTE(review): train_test_split is not imported in the lines visible here -
# confirm it is imported elsewhere.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .30, random_state = 123)

# Bare expression: only displays the first training rows in a notebook cell.
X_train.head()

# Create the Logistic Regression Object
# class_weight={1:2} assigns weight 2 to the class labelled 1.
# NOTE(review): assumes species is integer-encoded so that a label 1
# exists - confirm against prepare_iris_data.
logit = LogisticRegression(C=1, class_weight={1:2}, random_state = 123, solver='saga')

# Fit the model to the training data
logit.fit(X_train, y_train)

# Print the coefficients and intercept of the model
Example #22
0
# %matplotlib inline

import math

from sklearn.linear_model import LinearRegression

from sklearn import metrics

import statsmodels.api as sm

from pprint import pprint

# get data
from acquire import get_iris_data
# Load the raw (unprepared) iris data; the species_name column is used as-is.
df = get_iris_data()

# Split data into train (70%) & test (30%) samples. You should end with 2 data frames: train_df and test_df
# NOTE(review): train_test_split, pd, plt and sns are not imported in the
# lines visible here - confirm they are imported elsewhere.
train_df, test_df = train_test_split(df, test_size=.30, random_state=123)

# Create a swarmplot where the x-axis is each of the independent variable names (petal_length, petal_width, etc). The y-axis is the value of the variable.
# Use color to represent species as another dimension. Hint: You will need to 'melt' the dataframe into a 'long' dataframe in order to accomplish this. What are your takeaways from this visualization?
# Takeaways (author's observations):
# - The virginica species has larger measurements overall, except for sepal width.
# - The setosa species has lower measurements overall, except for sepal width, which is the highest compared to the other species.
train_df.head()

# Reshape to long form: one row per (species_name, measurement, value). The
# measurement name goes in 'measurement in cm'; the reading lands in the
# default 'value' column, which the plot below uses as its y axis.
df_melted = pd.melt(train_df, id_vars=['species_name'], value_vars=('sepal_length','sepal_width','petal_length','petal_width'), var_name='measurement in cm')
df_melted.head()

# One swarm per measurement, coloured by species.
plt.figure(figsize=(12,12))
sns.swarmplot(x="measurement in cm", y='value', data=df_melted, hue="species_name")
Example #23
0
import pandas as pd
import numpy as np
from acquire import get_iris_data

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

# Load the raw iris frame when the module runs; the prep_iris(df_i) call at
# the bottom of the file transforms this same object in place.
df_i = get_iris_data()


def prep_iris(df_i):
    """Prepare the iris frame in place and return it.

    Args:
        df_i: untransformed iris DataFrame containing ``species_id``,
            ``measurement_id`` and ``species_name`` columns.

    Returns:
        The same DataFrame object, now without the two id columns, with
        ``species_name`` renamed to ``species`` and that column replaced by
        the integer codes of a ``LabelEncoder``.

    The frame passed in is modified: callers that ignore the return value
    still see the transformed data.
    """
    df_i.drop(columns=['species_id', 'measurement_id'], inplace=True)
    df_i.rename(columns={'species_name': 'species'}, inplace=True)
    int_encoder = LabelEncoder()
    int_encoder.fit(df_i.species)
    df_i.species = int_encoder.transform(df_i.species)
    return df_i


# Called for its in-place effect on df_i; the returned frame is discarded.
prep_iris(df_i)