def iris_prep(cached=True):
    """Load the iris data and return it with species indicator columns.

    The ``species_id`` and ``species_id.1`` columns are removed,
    ``species_name`` is renamed to ``species``, and one-hot columns for
    ``species`` (first category omitted) are appended.

    ``cached`` is accepted by the signature but is not read by this
    function.
    """
    raw = acquire.get_iris_data()
    redundant_cols = ['species_id', 'species_id.1']
    trimmed = raw.drop(columns=redundant_cols)
    trimmed = trimmed.rename(columns={'species_name': 'species'})
    # drop_first=True leaves out the indicator for the first category.
    indicators = pd.get_dummies(trimmed['species'], drop_first=True)
    return pd.concat([trimmed, indicators], axis=1)
def prep_iris():
    """Return the iris data with ``species`` renamed and one-hot encoded.

    Drops ``species_id``, renames ``species_name`` to ``species`` and
    appends indicator columns for ``species`` (first category omitted).
    """
    raw = get_iris_data()
    trimmed = raw.drop(columns='species_id')
    trimmed = trimmed.rename(columns={'species_name': 'species'})
    indicators = pd.get_dummies(trimmed['species'], drop_first=True)
    return pd.concat([trimmed, indicators], axis=1)
def prep_iris():
    """Load the iris data and prepare it for modeling.

    Steps:
      1. Load the data via ``acquire.get_iris_data()``.
      2. Drop the ``species_id`` column.
      3. Rename ``species_name`` to ``species``.
      4. Append one-hot columns built from the ``species`` column, with
         the first category omitted.

    Returns:
        The prepared DataFrame.
    """
    df = acquire.get_iris_data()
    cols_to_drop = ['species_id']
    df = df.drop(columns=cols_to_drop)
    df = df.rename(columns={"species_name": "species"})
    # A one-column DataFrame (not a Series) is encoded here, so the
    # resulting indicator columns carry the source column name as prefix.
    df_dummies = pd.get_dummies(df[['species']], drop_first=True)
    df = pd.concat([df, df_dummies], axis=1)
    return df
def prep_iris():
    """Return the iris data with a full set of species indicator columns.

    Drops ``species_id``, renames ``species_name`` to ``species`` and
    appends one indicator column per species value.
    """
    iris = (
        acquire.get_iris_data()
        .drop(columns=['species_id'])
        .rename(columns={'species_name': 'species'})
    )
    indicators = pd.get_dummies(iris['species'])
    return pd.concat([iris, indicators], axis=1)
def prep_iris():
    """Load, clean, split and encode the iris data.

    Drops ``species_id`` and ``measurement_id``, renames ``species_name``
    to ``species``, splits the rows 70/30 with a fixed seed, and passes
    both splits through ``encode_species``.

    Returns:
        A ``(train, test)`` tuple as produced by ``encode_species``.
    """
    cleaned = (
        acquire.get_iris_data()
        .drop(columns=['species_id', 'measurement_id'])
        .rename(columns={'species_name': 'species'})
    )
    # Fixed random_state keeps the split reproducible between runs.
    train_part, test_part = sklearn.model_selection.train_test_split(
        cleaned, random_state=123, train_size=.7
    )
    train_part, test_part = encode_species(train_part, test_part)
    return train_part, test_part
def clean_iris():
    """Return de-duplicated iris data with species indicator columns.

    Duplicate rows are removed in place on the loaded frame, then
    ``species_id`` and ``measurement_id`` are dropped, ``species_name`` is
    renamed to ``species`` and one indicator column per species value is
    appended.
    """
    iris = acquire.get_iris_data()
    iris.drop_duplicates(inplace=True)
    iris = iris.drop(columns=['species_id', 'measurement_id'])
    iris = iris.rename(columns={'species_name': 'species'})
    indicator_cols = pd.get_dummies(iris[['species']])
    return pd.concat([iris, indicator_cols], axis=1)
def prep_iris():
    """Return the iris data with ``species`` label-encoded as integers.

    Drops ``species_id`` and ``measurement_id`` and renames
    ``species_name`` to ``species`` before encoding.
    """
    iris = acquire.get_iris_data()
    for unwanted in ('species_id', 'measurement_id'):
        iris = iris.drop(columns=[unwanted])
    iris = iris.rename(columns={'species_name': 'species'})
    label_encoder = LabelEncoder()
    iris['species'] = label_encoder.fit_transform(iris['species'])
    return iris
def prep_iris():
    """Return the iris data with ``species`` label-encoded as integers.

    ``species_id`` and ``measurement_id`` are dropped in place on the
    loaded frame; ``species_name`` is then renamed to ``species``.
    """
    iris = acquire.get_iris_data()
    iris.drop(columns=['species_id', 'measurement_id'], inplace=True)
    iris = iris.rename(columns={"species_name": "species"})
    species_encoder = LabelEncoder()
    # fit_transform is fit followed by transform on the same column.
    iris['species'] = species_encoder.fit_transform(iris['species'])
    return iris
def prep_iris(inverse_transform=False):
    """Load the iris data and label-encode the species column.

    ``species_id`` and ``measurement_id`` are dropped and ``species_name``
    is renamed to ``species``, both in place on the loaded frame.

    Args:
        inverse_transform: When true, the encoded ``species`` column is
            decoded back to its original labels before returning (the
            fitted encoder is still returned).

    Returns:
        A ``(df_iris, encoder)`` tuple: the prepared DataFrame and the
        fitted ``LabelEncoder``.
    """
    df_iris = acquire.get_iris_data()
    df_iris.drop(columns=['species_id', 'measurement_id'], inplace=True)
    df_iris.rename(columns={'species_name': 'species'}, inplace=True)

    encoder = LabelEncoder()
    df_iris['species'] = encoder.fit_transform(df_iris['species'])

    if inverse_transform:
        # Assign the decoded array positionally. Wrapping it in pd.Series
        # would give it a fresh 0..n-1 index, and column assignment aligns
        # on index, producing NaN whenever the frame's index differs.
        df_iris['species'] = encoder.inverse_transform(df_iris['species'])

    return df_iris, encoder
def prep_iris():
    """Return the iris data with species indicators and area features.

    Drops ``species_id``, ``measurement_id`` and ``species_id.1`` and
    renames ``species_name`` to ``species`` (both in place on the loaded
    frame), appends one indicator column per species value, then adds
    ``petal_area`` and ``sepal_area`` as length * width.
    """
    frame = acquire.get_iris_data()
    unwanted = ['species_id', 'measurement_id', 'species_id.1']
    frame.drop(columns=unwanted, inplace=True)
    frame.rename(columns={"species_name": "species"}, inplace=True)

    frame = pd.concat([frame, pd.get_dummies(frame['species'])], axis=1)

    # petal first, then sepal, so the new columns keep their order.
    for part in ('petal', 'sepal'):
        frame[part + '_area'] = frame[part + '_length'] * frame[part + '_width']
    return frame
def prep_iris(cached=True):
    '''
    Load the iris data through get_iris_data, forwarding `cached`, and
    return it prepared: species_id and measurement_id removed,
    species_name renamed to species, and one indicator column per species
    value appended.
    '''
    iris = get_iris_data(cached)
    iris = iris.drop(columns=['species_id', 'measurement_id'])
    iris = iris.rename(columns={'species_name': 'species'})
    # dummy_na=False: no extra indicator column for missing values.
    indicators = pd.get_dummies(iris[['species']], dummy_na=False)
    return pd.concat([iris, indicators], axis=1)
def clean_iris():
    '''
    Load the iris data via get_iris_data and return a single cleaned
    dataframe:
      - species_name is copied to a new `species` column,
      - species_id, measurement_id and species_name are dropped in place,
      - indicator columns for species (first category omitted) are
        appended.
    '''
    iris = get_iris_data()
    # Copy first, drop afterwards: `species` ends up as the last column
    # rather than taking species_name's position.
    iris['species'] = iris['species_name']
    iris.drop(columns=['species_id', 'measurement_id', 'species_name'],
              inplace=True)
    encoded = pd.get_dummies(iris[['species']], drop_first=True)
    return pd.concat([iris, encoded], axis=1)
def prep_iris_data(cached=True):
    """Load the iris data and return it with species indicator columns.

    ``cached`` is forwarded unchanged to ``get_iris_data``. The
    ``species_id`` column is dropped, ``species_name`` becomes ``species``
    and indicator columns for ``species`` (first category omitted) are
    appended.
    """
    iris = get_iris_data(cached)
    iris = iris.drop(columns='species_id')
    iris = iris.rename(columns={'species_name': 'species'})
    indicators = pd.get_dummies(iris['species'], drop_first=True)
    return pd.concat([iris, indicators], axis=1)
def wrangle_iris_data():
    """
    Acquire the iris data, hand it to prepare.prep_iris_data, and scale
    the resulting splits with scale_iris.

    Returns six objects, in order: train, validate, test, train_scaled,
    validate_scaled, test_scaled.
    """
    raw = acquire.get_iris_data()
    # prep_iris_data's result is unpacked in (train, test, validate) order.
    train, test, validate = prepare.prep_iris_data(raw)
    scaled_splits = scale_iris(train, validate, test)
    train_scaled, validate_scaled, test_scaled = scaled_splits
    return train, validate, test, train_scaled, validate_scaled, test_scaled


# NOTE: to call wrangle_iris_data
train, validate, test, train_scaled, validate_scaled, test_scaled = wrangle_iris_data()
def clean_iris():
    '''
    clean_iris loads the iris data via get_iris_data and:
      - copies species_name into a new `species` column,
      - drops species_id, measurement_id and species_name (in place),
      - creates one indicator column per species value,
      - concatenates the indicators onto the cleaned data.

    return: single cleaned dataframe
    '''
    df = get_iris_data()
    # Copy before dropping so `species` is appended as the last column.
    df['species'] = df.species_name
    dropcols = ['species_id', 'measurement_id', 'species_name']
    df.drop(columns=dropcols, inplace=True)
    # drop_first=False keeps an indicator column for every species.
    dummies = pd.get_dummies(df[['species']], drop_first=False)
    return pd.concat([df, dummies], axis=1)
def prep_iris(df=None):
    """
    Return a transformed copy of the iris dataset for exploratory analysis.

    Args:
        df: A pandas DataFrame holding the untransformed iris data. When
            omitted (or None), the data is loaded with ``get_iris_data()``
            at call time.

    Returns:
        A new DataFrame with ``species_id`` and ``measurement_id``
        removed, ``species_name`` renamed to ``species``, and indicator
        columns for the species appended (first category omitted).

    The input frame is not modified.
    """
    # Resolve the default lazily: a default of ``get_iris_data()`` would
    # be evaluated once at definition time and shared by every call.
    if df is None:
        df = get_iris_data()

    # Non in-place operations so the caller's frame stays untouched.
    df = df.drop(columns=['species_id', 'measurement_id'])
    df = df.rename(columns={'species_name': 'species'})

    # With drop_first=True the first category has no column of its own;
    # a row with all indicators at 0 belongs to that category.
    encoded_species = pd.get_dummies(df['species'], drop_first=True)

    return pd.concat([df, encoded_species], axis=1)
def prep_iris_data(splain=local_settings.splain, **kwargs):
    '''
    prep_iris_data(splain=local_settings.splain, **kwargs)
    RETURNS: df, encoder

    Loads the iris data with get_iris_data(type='sql', splain=splain),
    drops the measurement_id column, renames species_name to species, and
    encodes the species column through encode_col.

    Extra keyword arguments are accepted but not used by this function.
    '''
    raw = get_iris_data(type='sql', splain=splain)
    trimmed = raw.drop(columns='measurement_id', axis=1)
    renamed = trimmed.rename(columns={'species_name': 'species'})
    encoded_df, encoder = encode_col(df=renamed, col='species')
    return encoded_df, encoder
def prep_iris(cached=True):
    '''
    Load the iris data through get_iris_data, forwarding `cached`, and
    return it prepared: species_id dropped, species_name renamed to
    species, and indicator columns for species (first category omitted)
    appended.
    '''
    iris = get_iris_data(cached)
    iris = iris.drop(columns='species_id')
    iris = iris.rename(columns={'species_name': 'species'})
    indicators = pd.get_dummies(iris['species'], drop_first=True)
    return pd.concat([iris, indicators], axis=1)
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence all warnings for the rest of the script.
warnings.filterwarnings("ignore")
from acquire import get_iris_data
from prepare import prep_iris

# NOTE(review): train_test_split and LogisticRegression are used below but
# are not imported in the lines visible here -- presumably imported
# earlier in the file; confirm.

# Single seed shared by the split and the model.
seed = 43

# Load the raw data, then prepare it; prep_iris is unpacked as
# (prepared frame, encoder).
iris = get_iris_data()
iris, encoder = prep_iris(iris)

# Features are the four measurements; the target is the species column
# (selected with a list, so y is a one-column DataFrame).
X = iris[['sepal_width', 'sepal_length', 'petal_width', 'petal_length']]
y = iris[['species']]

# Hold out 30% of the rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.30, random_state=seed)

# Fit a logistic regression on the training split.
logit = LogisticRegression(random_state=seed, solver='saga')
logit.fit(X_train, y_train)

# Everything below is evaluated on the training data, not the test split.
y_pred = logit.predict(X_train)
score = logit.score(X_train, y_train)
cm = confusion_matrix(y_train, y_pred)
cr = classification_report(y_train, y_pred)
# Using the Iris Data: # # Use the function defined in acquire.py to load the iris data. # # Drop the species_id and measurement_id columns. # # Rename the species_name column to just species. # # Create dummy variables of the species name. # # Create a function named prep_iris that accepts the untransformed iris data, and returns the data with the transformations above applied. # In[3]: # Use the function defined in acquire.py to load the iris data. iris = acquire.get_iris_data() iris.head() # In[4]: # Drop the species_id and measurement_id columns. cols_to_drop = ['species_id', 'measurement_id'] iris = iris.drop(columns=cols_to_drop) # In[5]: iris.columns # In[6]: # Rename the species_name column to just species.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
# NOTE(review): the next line is an IPython magic; it is only valid when
# this code runs inside IPython/Jupyter.
%matplotlib inline
import seaborn as sns

# ignore warnings
import warnings
warnings.filterwarnings("ignore")

from acquire import get_iris_data
from prepare import prepare_iris_data

# NOTE(review): train_test_split is used below but is not imported in the
# lines visible here -- presumably imported earlier; confirm.

# Load the raw data and run it through the project's prepare step.
df = prepare_iris_data(get_iris_data())

# Every column except the target is used as a feature; y is a one-column
# DataFrame holding the species.
X = df.drop(columns=["species"])
y = df[['species']]

# Hold out 30% of the rows as the test set, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .30, random_state = 123)

# Bare expression: displays output only in an interactive session.
X_train.head()

# Create the Logistic Regression Object
# class_weight={1:2} assigns weight 2 to the class labelled 1.
logit = LogisticRegression(C=1, class_weight={1:2}, random_state = 123, solver='saga')

# Fit the model to the training data
logit.fit(X_train, y_train)

# Print the coefficients and intercept of the model
# %matplotlib inline import math from sklearn.linear_model import LinearRegression from sklearn import metrics import statsmodels.api as sm from pprint import pprint # get data from acquire import get_iris_data df = get_iris_data() # Split data into train (70%) & test (30%) samples. You should end with 2 data frames: train_df and test_df train_df, test_df = train_test_split(df, test_size=.30, random_state=123) # Create a swarmplot where the x-axis is each of the independent variable names (petal_length, petal_width, etc). The y-axis is the value of the variable. # Use color to represent species as another dimension. Hint: You will to 'melt' the dataframe into a 'long' dataframe in order to accomplish this. What are your takeaways from this visualization? # THE VIRGINICA SPECIES HAS LARGER MEASUREMENTS OVERALL EXCEPT FOR SEPAL WIDTH # THE SETOSA SPECIES HAS LOWER MEASUREMENTS OVERALL EXCEPT FOR THE SEPAL WIDTH, WHICH IS THE HIGHEST COMPARED TO OTHER SPECIES. train_df.head() df_melted = pd.melt(train_df, id_vars=['species_name'], value_vars=('sepal_length','sepal_width','petal_length','petal_width'), var_name='measurement in cm') df_melted.head() plt.figure(figsize=(12,12)) sns.swarmplot(x="measurement in cm", y='value', data=df_melted, hue="species_name")
import pandas as pd
import numpy as np
from acquire import get_iris_data
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

# Load the raw iris data once at module level.
df_i = get_iris_data()


def prep_iris(df_i):
    """Prepare the iris DataFrame in place and return it.

    Drops ``species_id`` and ``measurement_id``, renames ``species_name``
    to ``species``, and replaces the ``species`` values with integer
    labels from a ``LabelEncoder``.

    Args:
        df_i: The untransformed iris DataFrame. It is modified in place.

    Returns:
        The same DataFrame object, after the transformations.
    """
    df_i.drop(columns=['species_id', 'measurement_id'], inplace=True)
    df_i.rename(columns={'species_name': 'species'}, inplace=True)
    int_encoder = LabelEncoder()
    int_encoder.fit(df_i.species)
    df_i.species = int_encoder.transform(df_i.species)
    return df_i


# Transform the module-level frame (mutated in place by prep_iris).
prep_iris(df_i)