Ejemplo n.º 1
0
# Explore the raw forest-fire dataset (summary printed by the helper).
exploreData(data)

# Success - Display the first record and a full statistical summary.
if data is not None:
    display(data.head(n=1))
    # print() as a function: keeps this consistent with the Python 3
    # print() calls used elsewhere in this file (was a Py2 print statement).
    print(data.describe(include='all'))

# Drop the spatial coordinates and the raw targets before modeling.
drop_col = ['X', 'Y', 'rain', 'area']
features_raw = data.drop(drop_col, axis=1)
target_raw = data['area']
if features_raw is not None:
    display(features_raw.head(n=1))

# Transform features/targets into model-ready form
# (classification target plus a regression target).
from projectFunctions import transformData
features, target, target_reg = transformData(features_raw, target_raw)

# Shuffle and split the data to create train and test datasets (30% test).
from projectFunctions import splitData
X_train, X_test, y_train, y_test = splitData(features, target, 0.3)
Xr_train, Xr_test, yr_train, yr_test = splitData(features, target_reg, 0.3)

from projectFunctions import decTree, drawTree, kneighbors, decTreeReg, kneighbhorsReg
sample_size = len(X_train)
feature_cols = features.columns

# Decision tree with the 'entropy' criterion and max depth 4.
# (The original comment claimed gini / depth 3, which did not match the call.)
results, learner = decTree(sample_size, X_train, y_train, X_test, y_test,
                           'entropy', 4)
drawTree(learner, feature_cols, 'fire_dt.png')
Ejemplo n.º 2
0
# Build one corpus string from all player lines, then compute word frequencies.
# .tolist() replaces the original side-effecting
# data['PlayerLine'].apply(lambda x: t.append(x)) anti-pattern.
t = data['PlayerLine'].tolist()
corpus = ' '.join(t)
stop_w = set(stopwords.words('english'))
tokens = word_tokenize(corpus)
# Keep only alphabetic tokens that are not stopwords.
sen = [w for w in tokens if w not in stop_w]
corpus = [w for w in sen if w.isalpha()]
fdist = FreqDist(corpus)

# Tokenize the string.
# Compute the frequency of words in a sentence.
data['PlayerLine'] = data['PlayerLine'].apply(lambda x: tokenString(x, fdist, stop_w))

features, target = exploreData(data)
features_final, target_final = transformData(features, target)

# Split the data with test size = 30%.
from projectFunctions import splitData, svmClassifier, decTree, naiveBayes
X_train, X_test, y_train, y_test = splitData(features_final, target_final, 0.3)

# Alternative classifier (disabled):
#results, learner = svmClassifier(X_train, X_test, y_train, y_test)
#print("Times for Training, Prediction: %.5f, %.5f" % (results['train_time'], results['pred_time']))
#print("Accuracy for Training, Test sets: %.5f, %.5f" % (results['acc_train'], results['acc_test']))
#print("-----------------------------------------------------------------------")

# Decision tree with gini criterion, max depth 13.
results, learner = decTree(X_train, y_train, X_test, y_test, 'gini', 13)

# print() calls (Python 3) replace the original Python 2 print statements.
print("Times for Training, Prediction: %.5f, %.5f" % (results['train_time'], results['pred_time']))
print("Accuracy for Training, Test sets: %.5f, %.5f" % (results['acc_train'], results['acc_test']))
Ejemplo n.º 3
0
# Tabulate per-team points from the {team: points} dict.
# list() is required under Python 3, where .keys()/.values() return dict
# views rather than lists (the original Py2-style assignment breaks).
tp['team'] = list(dicts.keys())
tp['points'] = list(dicts.values())

from projectFunctions import barPlot, numCount, corrPlot, splitData
# Optional exploratory plots (disabled):
#barPlot(tp['team'], tp['points'],'Teams','Scores','Points by team')
#numCount(data,'score1','score2','Score distribution')
#numCount(data,'elo1','elo2','elo distribution')

# Remove categorical columns for correlation heatmap.
data_corr = data.drop(['team1', 'team2'], axis=1)
corr = data_corr.corr()
#corrPlot(corr)

features, target = exploreData(data)
features, target = transformData(features, target)

# 30% held out for testing.
X_train, X_test, y_train, y_test = splitData(features, target, 0.3)

from projectFunctions import lineReg, sdgReg, ridgeReg, lassoReg

# Empty results table; rows are appended per model elsewhere.
res_pd = pd.DataFrame(
    [], columns=['Model', 'AccTrain', 'AccTest', 'TrainTime', 'PredTime'])

results, clf_fit_train = lineReg(X_train, X_test, y_train, y_test)

# print() calls (Python 3) replace the original Python 2 print statements.
print("-----------------------------------------------------------------------")
print("Times for Training, Prediction: %.5f, %.5f" % (results['train_time'],
                                                      results['pred_time']))
print("Accuracy for Training, Test sets: %.5f, %.5f" % (results['acc_train'],
                                                        results['acc_test']))
Ejemplo n.º 4
0
    as_index=False)['rating'].apply(lambda x: x.value_counts().index[0])
#data_r.to_csv('test.csv',index=False)

# Load the tags data and aggregate a per-movie tag count after scoring
# each tag's sentiment.
path = r'C:\Users\pmspr\Documents\HS\MS\Sem 3\EECS 731\Week 5\HW\Git\EECS-731-Project-3\Data'
filename = "tags.csv"
data_t = loadData(path, filename)
data_t = data_t.drop(['userId', 'timestamp'], axis=1)
# Pass the function directly instead of wrapping it in a lambda.
data_t['tag'] = data_t['tag'].apply(sentimentPolarity)
data_t = data_t.groupby(['movieId'], as_index=False).count()
#data_t.to_csv('test.csv',index=False)

# Join the ratings-derived and tag-derived frames on movieId, then fold in d1.
data2 = pd.merge(data_r, data_t, on=['movieId'], how='inner')
data = pd.merge(d1, data2, on=['movieId'], how='inner')
data.to_csv('test.csv', index=False)

drop_col = ['title']
data = data.drop(drop_col, axis=1)

# Banner corrected: this section processes the movie dataset
# (the original label said "Shakespear Play data" — a copy-paste error).
print(
    "----------------------Movie data-----------------------------")
features, target = exploreData(data)
misVal, mis_val_table_ren_columns = missingValues(data)

# Cluster the transformed data with k-means and report the result/score.
from projectFunctions import splitData, kmeans, transformData
data_tran = transformData(data)
X_train, X_test = splitData(data_tran, 0.3)
result, scr = kmeans(X_train, X_test)
print(result)
print(scr)

#data.to_csv('test.csv',index=False)
Ejemplo n.º 5
0
                        y='count',
                        hue='fake',
                        data=df,
                        ax=ax[1][fl - 2])
        st = "Feature set " + str(fl)
        ax[1][fl - 2].set_title(st, fontsize=14)
        plt.sca(ax[1][fl - 2])
        plt.xticks(rotation=90)
plt.suptitle("Feature distribution")
plt.show()

# Example seaborn line plot; `dat` is presumably a passengers-by-month
# DataFrame defined earlier in the file — TODO confirm.
sns.lineplot(data=dat, x="year", y="passengers", hue="month")
###

# Keep a reference to the untransformed frame, then build features/target.
data_raw = data
features, target = transformData(data_raw)
#des = features.describe().transpose().to_csv('test.csv')

# 30% of the data held out for testing.
X_train, X_test, y_train, y_test = splitData(features, target, 0.3)

from projectFunctions import multinomialnb, svmClassifier, randomForest, pca, gclus

# Alternative classifiers kept for reference (disabled):
#results,learner = multinomialnb(X_train, X_test, y_train, y_test)
#
#print ("Times for Training, Prediction: %.5f, %.5f" %(results['train_time'], results['pred_time']))
#print ("Accuracy for Training, Test sets: %.5f, %.5f" %(results['acc_train'], results['acc_test']))
#print ("-----------------------------------------------------------------------")
#
#results,learner = svmClassifier(X_train, X_test, y_train, y_test)
#
#print ("Times for Training, Prediction: %.5f, %.5f" %(results['train_time'], results['pred_time']))