# Tokenize every PlayerLine string.
# NOTE(review): the original note here also read "Compute the frequency of
# words in a sentence"; tokenString, fdist and stop_w are defined outside this
# fragment, so confirm what the helper actually returns.
data['PlayerLine'] = data['PlayerLine'].apply(lambda x: tokenString(x, fdist, stop_w))

features, target = exploreData(data)
features_final, target_final = transformData(features, target)

# Split the data, passing 0.3 as the test size.
from projectFunctions import splitData, svmClassifier, decTree, naiveBayes
X_train, X_test, y_train, y_test = splitData(features_final, target_final, 0.3)

# SVM run, currently disabled (kept as the author left it).
# results, learner = svmClassifier(X_train, X_test, y_train, y_test)
# print("Times for Training, Prediction: %.5f, %.5f" % (results['train_time'], results['pred_time']))
# print("Accuracy for Training, Test sets: %.5f, %.5f" % (results['acc_train'], results['acc_test']))
# print("-----------------------------------------------------------------------")

# Decision tree run, called with 'gini' and 13.
results, learner = decTree(X_train, y_train, X_test, y_test, 'gini', 13)
# Timing output for the decision tree is disabled:
# print("Times for Training, Prediction: %.5f, %.5f" % (results['train_time'], results['pred_time']))
# Single-argument print(...) behaves the same under Python 2 and Python 3.
print("Accuracy for Training, Test sets: %.5f, %.5f" % (results['acc_train'], results['acc_test']))
print("-----------------------------------------------------------------------")

# Naive Bayes run; `results` and `learner` are rebound to this model's output.
results, learner = naiveBayes(X_train, y_train, X_test, y_test)
print("Times for Training, Prediction: %.5f, %.5f" % (results['train_time'], results['pred_time']))
print("Accuracy for Training, Test sets: %.5f, %.5f" % (results['acc_train'], results['acc_test']))
print("-----------------------------------------------------------------------")

# data.to_csv('test.csv', index=False)
# Transform the raw data.
from projectFunctions import transformData
features, target, target_reg = transformData(features_raw, target_raw)

# Shuffle and split the data to create train and test datasets; the same
# features are split once against `target` and once against `target_reg`.
from projectFunctions import splitData
X_train, X_test, y_train, y_test = splitData(features, target, 0.3)
Xr_train, Xr_test, yr_train, yr_test = splitData(features, target_reg, 0.3)

from projectFunctions import decTree, drawTree, kneighbors, decTreeReg, kneighbhorsReg
sample_size = len(X_train)
feature_cols = features.columns

# Decision tree classifier, called with 'entropy' and 4 (an older note here
# said "gini and depth = 3", which did not match the call).
results, learner = decTree(sample_size, X_train, y_train, X_test, y_test, 'entropy', 4)
drawTree(learner, feature_cols, 'fire_dt.png')
# Single-argument print(...) behaves the same under Python 2 and Python 3.
print("Accuracy for Decision tree Classifier - Training, Test sets: %.5f, %.5f" % (
    results['acc_train'], results['acc_test']))
print("-----------------------------------------------------------------------")

# Decision tree regression.
# NOTE(review): 'entropy' is passed to the regression helper as well; if
# decTreeReg forwards it as a split criterion to a regressor it may be
# rejected - confirm against projectFunctions.decTreeReg.
results_dreg, learner_dreg = decTreeReg(Xr_train, yr_train, Xr_test, yr_test, 'entropy', 4)
print("R2 score for Decision tree regression -Training, Test sets: %.5f, %.5f" % (
    results_dreg['acc_train'], results_dreg['acc_test']))
print("-----------------------------------------------------------------------")

# K-neighbors classifier.
resultsK = kneighbors(X_train, y_train, X_test, y_test)
# NOTE(review): the source is cut off in the middle of the next statement, so
# its format arguments are not visible; it stays commented out until the rest
# of the statement is restored.
# print("Accuracy for K-Neighbors Classifier-Training, Test sets: %.5f, %.5f" % (
# Success - attach the column names to the loaded data.
# NOTE(review): the line structure was reconstructed from a collapsed source;
# only the column assignment is assumed to sit inside this `if` - confirm.
if data is not None:
    data.columns = col_names
    # display(data.head(n=1))

# Explore the data.
from projectFunctions import exploreData
exploreData(data)

# Features are every column except 'skin' and 'label'; 'label' is the target.
drop_col = ['skin', 'label']
features = data.drop(drop_col, axis=1)
target = data['label']
# if features is not None:
#     display(features.head(n=1))

# Shuffle and split the data to create train and test datasets.
from projectFunctions import splitData
X_train, X_test, y_train, y_test = splitData(features, target, 0.3)

from projectFunctions import decTree, drawTree
sample_size = len(X_train)
feature_cols = features.columns

# Decision tree run, called with 'gini' and 3; the tree is drawn to diabetes.png.
results, learner = decTree(sample_size, X_train, y_train, X_test, y_test, 'gini', 3)
drawTree(learner, feature_cols, 'diabetes.png')
# Single-argument print(...) behaves the same under Python 2 and Python 3.
print("Times for Training, Prediction: %.5f, %.5f" % (results['train_time'], results['pred_time']))
print("Accuracy for Training, Test sets: %.5f, %.5f" % (results['acc_train'], results['acc_test']))
print("-----------------------------------------------------------------------")