# Forest-fire dataset: explore, transform, split, and fit a decision tree.
exploreData(data)  # Success - Display the first record
if data is not None:
    display(data.head(n=1))
# Py3 fix: print is a function (the rest of the file already uses print()).
print(data.describe(include='all'))

# Drop coordinates plus rain/area; 'area' is the target so it must not leak
# into the features.
drop_col = ['X', 'Y', 'rain', 'area']
features_raw = data.drop(drop_col, axis=1)
target_raw = data['area']
if features_raw is not None:
    display(features_raw.head(n=1))

# Transform data
from projectFunctions import transformData
features, target, target_reg = transformData(features_raw, target_raw)

# Shuffle and split the data to create train and test datasets (70/30).
from projectFunctions import splitData
X_train, X_test, y_train, y_test = splitData(features, target, 0.3)
Xr_train, Xr_test, yr_train, yr_test = splitData(features, target_reg, 0.3)

from projectFunctions import decTree, drawTree, kneighbors, decTreeReg, kneighbhorsReg
sample_size = len(X_train)
feature_cols = features.columns

# Decision tree using entropy criterion and max depth = 4
# (original comment said "gini and depth = 3", which contradicted the call).
results, learner = decTree(sample_size, X_train, y_train, X_test, y_test, 'entropy', 4)
drawTree(learner, feature_cols, 'fire_dt.png')
# Shakespeare play-lines dataset: build a word-frequency corpus, tokenize
# each line, then train/evaluate a decision tree classifier.

# Collect all player lines into one corpus string.
t = []
data['PlayerLine'].apply(lambda x: t.append(x))
corpus = ' '.join(t)

# Tokenize the string, drop stop words and non-alphabetic tokens,
# then compute the frequency of words in the corpus.
stop_w = set(stopwords.words('english'))
tokens = word_tokenize(corpus)
sen = [w for w in tokens if w not in stop_w]
corpus = [w for w in sen if w.isalpha()]
fdist = FreqDist(corpus)

# Replace each line with its tokenized/frequency-weighted representation.
data['PlayerLine'] = data['PlayerLine'].apply(lambda x: tokenString(x, fdist, stop_w))

features, target = exploreData(data)
features_final, target_final = transformData(features, target)

# Split the data with test size = 30%
from projectFunctions import splitData, svmClassifier, decTree, naiveBayes
X_train, X_test, y_train, y_test = splitData(features_final, target_final, 0.3)

#results, learner = svmClassifier(X_train, X_test, y_train, y_test)
#print("Times for Training, Prediction: %.5f, %.5f" % (results['train_time'], results['pred_time']))
#print("Accuracy for Training, Test sets: %.5f, %.5f" % (results['acc_train'], results['acc_test']))
#print("-----------------------------------------------------------------------")

# Decision tree with gini criterion, max depth = 13.
results, learner = decTree(X_train, y_train, X_test, y_test, 'gini', 13)
# Py3 fix: print statements converted to the print() function.
# print("Times for Training, Prediction: %.5f, %.5f" % (results['train_time'], results['pred_time']))
print("Accuracy for Training, Test sets: %.5f, %.5f" % (results['acc_train'], results['acc_test']))
# Team-scores dataset: build team/points columns, inspect correlations,
# then fit a linear-regression baseline.

# NOTE(review): assumes 'tp' is a DataFrame and 'dicts' maps team -> points,
# both created earlier in the file — confirm against the full script.
tp['team'] = dicts.keys()
tp['points'] = dicts.values()

from projectFunctions import barPlot, numCount, corrPlot, splitData
#barPlot(tp['team'], tp['points'], 'Teams', 'Scores', 'Points by team')
#numCount(data, 'score1', 'score2', 'Score distribution')
#numCount(data, 'elo1', 'elo2', 'elo distribution')

# Remove categorical columns for correlation heatmap.
data_corr = data.drop(['team1', 'team2'], axis=1)
corr = data_corr.corr()
#corrPlot(corr)

features, target = exploreData(data)
features, target = transformData(features, target)
X_train, X_test, y_train, y_test = splitData(features, target, 0.3)

from projectFunctions import lineReg, sdgReg, ridgeReg, lassoReg
# Results table: one row per model with accuracy and timing columns.
res_pd = pd.DataFrame(
    [], columns=['Model', 'AccTrain', 'AccTest', 'TrainTime', 'PredTime'])

# Linear regression baseline.
results, clf_fit_train = lineReg(X_train, X_test, y_train, y_test)
# Py3 fix: print statements converted to the print() function.
print("-----------------------------------------------------------------------")
print("Times for Training, Prediction: %.5f, %.5f" % (results['train_time'], results['pred_time']))
print("Accuracy for Training, Test sets: %.5f, %.5f" % (results['acc_train'], results['acc_test']))
# NOTE(review): this chunk begins mid-expression — the first line continues a
# groupby started on an earlier (unseen) line; it takes each group's most
# frequent rating via value_counts(). Code tokens below are unchanged.
as_index=False)['rating'].apply(lambda x: x.value_counts().index[0])
#data_r.to_csv('test.csv',index=False)
# Load tags.csv, score each tag's sentiment, and count tags per movie.
path = r'C:\Users\pmspr\Documents\HS\MS\Sem 3\EECS 731\Week 5\HW\Git\EECS-731-Project-3\Data'
filename = "tags.csv"
data_t = loadData(path, filename)
data_t = data_t.drop(['userId', 'timestamp'], axis=1)
data_t['tag'] = data_t['tag'].apply(lambda x: sentimentPolarity(x))
data_t = data_t.groupby(['movieId'], as_index=False).count()
#data_t.to_csv('test.csv',index=False)
# Join ratings and tag counts, then join with the movies frame (d1) on movieId.
data2 = pd.merge(data_r, data_t, on=['movieId'], how='inner')
data = pd.merge(d1, data2, on=['movieId'], how='inner')
data.to_csv('test.csv', index=False)
# Drop the free-text title before clustering.
drop_col = ['title']
data = data.drop(drop_col, axis=1)
# NOTE(review): banner says "Shakespear Play data" but this chunk processes
# movie ratings/tags — the label looks copied from another script; verify.
print(
    "----------------------Shakespear Play data-----------------------------")
features, target = exploreData(data)
# Missing-value summary (misVal / table appear unused below — TODO confirm).
misVal, mis_val_table_ren_columns = missingValues(data)
# Transform, split 70/30, and run k-means clustering; print result and score.
from projectFunctions import splitData, kmeans, transformData
data_tran = transformData(data)
X_train, X_test = splitData(data_tran, 0.3)
result, scr = kmeans(X_train, X_test)
print(result)
print(scr)
#data.to_csv('test.csv',index=False)
# NOTE(review): this chunk begins mid-call — the first line continues a seaborn
# plot call started on an earlier (unseen) line; 'fl' and 'ax' presumably come
# from an enclosing loop over feature sets — TODO confirm against the full file.
# Code tokens below are unchanged.
y='count', hue='fake', data=df, ax=ax[1][fl - 2])
st = "Feature set " + str(fl)
ax[1][fl - 2].set_title(st, fontsize=14)
plt.sca(ax[1][fl - 2])
plt.xticks(rotation=90)
plt.suptitle("Feature distribution")
plt.show()
sns.lineplot(data=dat, x="year", y="passengers", hue="month")
###
# Transform the raw data into features/target and split 70/30.
data_raw = data
features, target = transformData(data_raw)
#des = features.describe().transpose().to_csv('test.csv')
X_train, X_test, y_train, y_test = splitData(features, target, 0.3)
from projectFunctions import multinomialnb, svmClassifier, randomForest, pca, gclus
# Alternative classifiers (Multinomial NB, SVM) kept below, commented out.
#results,learner = multinomialnb(X_train, X_test, y_train, y_test)
#
#print ("Times for Training, Prediction: %.5f, %.5f" %(results['train_time'], results['pred_time']))
#print ("Accuracy for Training, Test sets: %.5f, %.5f" %(results['acc_train'], results['acc_test']))
#print ("-----------------------------------------------------------------------")
#
#results,learner = svmClassifier(X_train, X_test, y_train, y_test)
#
#print ("Times for Training, Prediction: %.5f, %.5f" %(results['train_time'], results['pred_time']))