# Forest-fires preparation script: load the CSV, print summaries, split the
# frame into raw features and the 'area' target, then transform both.
import numpy as np
import sklearn  # needed for the version banner printed below
from matplotlib import pyplot as plt

from projectFunctions import exploreData, loadData, transformData

print("-----------------------------------------------------------------------")
print('The scikit-learn version is {}.'.format(sklearn.__version__))

# get the working directory and filename
path = r'C:\Users\pmspr\Documents\HS\MS\Sem 2\EECS 738\Lab\2\Work\Code\Data'
filename = "forestfires.csv"

# load data using load class and print describe of data
data = loadData(path, filename)

# explore the data
exploreData(data)

# Success - Display the first record
# NOTE(review): `display` is not defined in this section; presumably the
# IPython/Jupyter builtin -- confirm before running as a plain script.
if data is not None:
    display(data.head(n=1))
    print(data.describe(include='all'))

# NOTE(review): the statements below dereference `data` unconditionally, so
# they assume loadData() returned a frame -- confirm loadData's contract.
# 'area' is dropped from the features because it is used as the target.
drop_col = ['X', 'Y', 'rain', 'area']
features_raw = data.drop(drop_col, axis=1)
target_raw = data['area']
if features_raw is not None:
    display(features_raw.head(n=1))

# transform data
features, target, target_reg = transformData(features_raw, target_raw)
# Shakespeare plays preparation script: load the CSV, drop unused columns,
# report missing values, remove rows without a target, and count rows per
# (Play, target) pair.
import sklearn  # needed for the version banner printed below

# load functions from projectFunctions
from projectFunctions import loadData, exploreData, missingValues, tokenString, transformData

print("-----------------------------------------------------------------------")
print('The scikit-learn version is {}.'.format(sklearn.__version__))

path = r'C:\Users\pmspr\Documents\HS\MS\Sem 3\EECS 731\Week 4\HW\Git\EECS-731-Project-2\Data'
filename = "Shakespeare_data.csv"
data = loadData(path, filename)

# Discard the columns listed in drop_col and treat the 'Player' column as the
# target by renaming it.
drop_col = ['Dataline', 'PlayerLinenumber', 'ActSceneLine']
data = data.drop(drop_col, axis=1)
data.rename(columns={'Player': 'target'}, inplace=True)
print(data.columns)

print("----------------------Shakespear Play data-----------------------------")
features, target = exploreData(data)

misVal, mis_val_table_ren_columns = missingValues(data)

# Print some summary information
print("Columns that have missing values:" + str(misVal.shape[0]))
print("-----------------------------------------------------------------------")
print(mis_val_table_ren_columns.head(20))

# Remove rows with missing target values
ind = data.index[data['target'].isnull()].tolist()
data = data.drop(index=ind)

# Compute features to add value: count the non-null entries of the remaining
# columns for each (Play, target) group, then expose the 'PlayerLine' count
# under the name 'LineCount'.
line_count = data.groupby(['Play', 'target'], as_index=False).count()
line_count.rename(columns={'PlayerLine': 'LineCount'}, inplace=True)