def main(args):
    """Run the classifier pipelines that are switched on in ``args``.

    Both flags are read up front, so ``args`` must expose ``SVM`` and ``RF``
    attributes. When ``args.SVM`` is truthy the ``SVM`` module is imported and
    ``SVM.SVM(args)`` is called; when ``args.RF`` is truthy the ``RF`` module
    is imported and ``RF.RF(args)`` is called. The SVM step always runs before
    the RF step.

    Args:
        args: Object carrying the ``SVM`` and ``RF`` flags; it is forwarded
            unchanged to the selected entry points.
    """
    run_svm, run_rf = args.SVM, args.RF

    if run_svm:
        # Imported lazily so the module is only loaded when it is requested.
        import SVM as svm_module
        svm_module.SVM(args)

    if run_rf:
        import RF as rf_module
        rf_module.RF(args)
def _build_genre_dataset(newdata, genre):
    """Build a balanced one-vs-rest data set for ``genre``.

    Positive rows are the rows of ``newdata`` whose ``parentCat`` equals
    ``genre`` (down-sampled to 3000 rows for "party" and "workout").
    Negative rows are an equally sized random sample of unique tracks that
    never appear under ``genre``.

    Args:
        newdata: Frame whose first column is ``track_id``, whose columns
            1..14 are the numeric audio features and whose last column is
            ``parentCat``.
        genre: The ``parentCat`` value treated as the positive class.

    Returns:
        ``(features, labels)``: the min-max scaled feature matrix and the
        matching 0/1 integer label vector, both ordered by ``track_id``.

    Raises:
        ValueError: Propagated from ``DataFrame.sample`` when fewer rows are
            available than requested.
    """
    from sklearn import preprocessing

    genre_rows = newdata[newdata.parentCat == genre]
    # Collect the ids BEFORE down-sampling: otherwise the unsampled tracks of
    # this genre stay in the negative pool and contaminate the labels.
    genre_track_ids = genre_rows["track_id"].values
    if genre in ("party", "workout"):
        genre_rows = genre_rows.sample(3000)

    other_rows = (
        newdata[~newdata.track_id.isin(genre_track_ids)]
        .drop_duplicates("track_id")
        .sample(n=len(genre_rows))
    )
    combined = pd.concat([other_rows, genre_rows]).sort_values(by=["track_id"])

    # Derive the labels in one vectorised step. The previous chained
    # assignment (df.parentCat[mask] = ...) is a no-op under pandas
    # Copy-on-Write.
    labels = (combined["parentCat"] == genre).astype(int).to_numpy()

    # Columns 1..14 are the features; column 0 is the id and the last column
    # is the label. Scaling straight into an array avoids writing floats back
    # into integer columns of the frame.
    features = preprocessing.MinMaxScaler().fit_transform(
        combined.iloc[:, 1:15])
    return features, labels


def main():
    """Run the three Spotify analyses and write their outputs to disk.

    1. Hypothesis 1: two-sample t-test on ``energy`` between pop and non-pop
       tracks (printed).
    2. Hypothesis 2: OLS regression ``acousticness ~ energy + loudness``
       (summary printed).
    3. Hypothesis 3: for every genre in ``parentCat``, train five one-vs-rest
       classifiers on a balanced sample, record the train/test precision and
       save one ROC figure per genre.

    Reads ``mySpotify.csv``. Writes ``genres_spotify.txt``,
    ``classification.txt``, ``ROC <genre>.png`` for each genre,
    ``precision_train.csv`` and ``precision_test.csv``.

    Each classifier helper (``Basic_DecisionTree``, ``KNN``, ``NVBayes``,
    ``mysvm``, ``RF``) is expected to return
    ``(y_score, train_precision, test_precision)``.
    """
    # Prepare the data.
    rawdata = pd.read_csv("mySpotify.csv")
    newdata = rawdata[[
        'track_id', 'duration_ms', 'popularity', 'acousticness',
        'danceability', 'energy', 'instrumentalness', 'key', 'liveness',
        'loudness', 'mode', 'speechiness', 'tempo', 'time_signature',
        'valence', 'parentCat'
    ]]  # original author's note: 96153 (presumably the row count)

    # Hypothesis 1: t-test on the mean of energy between pop and non-pop.
    print("hypothesis 1")
    target_pop = newdata[newdata.parentCat == "pop"].drop_duplicates(
        'track_id')
    pop_track_ids = target_pop['track_id'].values
    non_pop = newdata[~newdata.track_id.isin(pop_track_ids)].drop_duplicates(
        'track_id')
    print(
        stats.ttest_ind(target_pop["energy"].values,
                        non_pop["energy"].values))

    # Hypothesis 2: linear regression acousticness ~ energy + loudness.
    print("hypothesis 2")
    data_unique = newdata.drop_duplicates('track_id')
    regressors = sm.add_constant(data_unique[["energy", 'loudness']])
    response = data_unique["acousticness"]
    # Note the argument order: statsmodels takes (endog, exog).
    model = sm.OLS(response, regressors).fit()
    print(model.summary())

    # Hypothesis 3: per-genre classification.
    print("hypothesis 3")
    genres = newdata["parentCat"].unique()
    with open("genres_spotify.txt", "w") as f:
        f.write(str(genres))
    print(genres)

    # (result column, ROC legend template, line colour, fit function).
    # Every fit function receives
    # (genre, X_train, Y_train, X_validate, Y_validate).
    classifiers = [
        ("DecisisionTree", "ROC curve for decisionTree(area =%0.2f)",
         'darkorange',
         lambda genre, *split: Basic_DecisionTree(*split, genre)),
        ("KNN", "ROC curve for KNN(area =%0.2f)", 'aqua',
         lambda genre, *split: KNN(*split)),
        ("Naive Bayes", "ROC curve for Naive bayes(area =%0.2f)",
         'cornflowerblue',
         lambda genre, *split: NVBayes(*split)),
        ("SVM", "ROC curve for SVM (area =%0.2f)", 'green',
         lambda genre, *split: mysvm(*split)),
        ("Random Forest", "ROC curve for RandomForest(area =%0.2f)", 'red',
         lambda genre, *split: RF(*split)),
    ]
    result_columns = [column for column, _, _, _ in classifiers]

    # Float-initialised so storing precision scores does not trigger pandas'
    # incompatible-dtype upcasting.
    precision_train = pd.DataFrame(0.0, index=genres, columns=result_columns)
    precision_test = pd.DataFrame(0.0, index=genres, columns=result_columns)

    with open("classification.txt", "w") as f:
        f.write("classification result \n")

    test_size = 0.30
    for genre in genres:
        with open("classification.txt", "a") as f:
            f.write("\n classification result for " + genre + "\n")

        features, labels = _build_genre_dataset(newdata, genre)

        # The seed is drawn from {1, 2, 3}, so the split varies between runs.
        seed = np.random.randint(1, 4)
        X_train, X_validate, Y_train, Y_validate = train_test_split(
            features, labels, test_size=test_size, random_state=seed)
        print("classification results for " + genre)

        # Fit every classifier first and plot afterwards, so the helpers run
        # before this function opens its own figure.
        roc_curves = []
        for column, legend, colour, fit in classifiers:
            y_score, train_precision, test_precision = fit(
                genre, X_train, Y_train, X_validate, Y_validate)
            fpr, tpr, _ = roc_curve(Y_validate, y_score)
            roc_curves.append((fpr, tpr, legend % auc(fpr, tpr), colour))
            precision_train.loc[genre, column] = train_precision
            precision_test.loc[genre, column] = test_precision

        # One figure per genre with all five ROC curves.
        fig = plt.figure()
        for fpr, tpr, label, colour in roc_curves:
            plt.plot(fpr, tpr, label=label, color=colour)
        plt.plot([0, 1], [0, 1], "k--")
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.0])
        plt.xlabel("False positive ")
        plt.ylabel("true positive")
        plt.title("ROC of classifiers for" + genre)
        plt.legend(loc="lower right")
        fig.savefig('ROC ' + genre + '.png')
        plt.close(fig)

    precision_train.to_csv("precision_train.csv")
    precision_test.to_csv("precision_test.csv")
import naiv
import pandas as pd
import json
from flask import Flask,jsonify,request,render_template
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# Module-level setup: everything below runs once, when this module is imported.
# NOTE(review): prep, LOGR, KNN, RF, SVM and Dtree are used below but are not
# imported in the visible part of the file; presumably they are imported
# earlier. Confirm.
df=prep.cleaning(pd.read_csv("dataset.csv")) # load and clean the dataset; the result is indexed with "data" below
# LDA configured with 8 components.
# NOTE(review): scikit-learn requires n_components <= min(n_features, n_classes - 1);
# if "Churn" is a binary target this value must be 1. Confirm.
lda = LDA(n_components = 8)
X=df["data"].drop("Churn",axis=1) # every column except the target
Y=df["data"]["Churn"] # target column
lda.fit(X,Y) # fit the LDA on the full cleaned data set

# One object per model module. Each receives its own copy of the cleaned
# data, the target column name and the fitted LDA.
clf_log=LOGR.LR(df["data"].copy(),"Churn",lda) # LOGR.LR instance (presumably logistic regression)
clf_KNN=KNN.KNN(df["data"].copy(),"Churn",lda) # KNN.KNN instance
clf_RF=RF.RF(df["data"].copy(),"Churn",lda) # RF.RF instance (presumably random forest)
clf_SVM=SVM.SV(df["data"].copy(),"Churn",lda) # SVM.SV instance
clf_Dt=Dtree.DT(df["data"].copy(),"Churn",lda) # Dtree.DT instance (presumably decision tree)
# NOTE(review): the naiv module is accessed through an attribute named RF;
# this looks like a copy-paste leftover. Confirm the intended class name.
clf_naiv=naiv.RF(df["data"].copy(),"Churn",lda)

app = Flask(__name__)

@app.route("/",methods=["GET","POST"])
def hello():
    """Return the first 500 rows of dataset.csv as a JSON string under the key "data".

    The CSV file is re-read from disk on every request.
    """
    data={"data":pd.read_csv("dataset.csv").head(500).to_json()}
    return jsonify(data)

############################################################### PreProcessing ###################################################
# NOTE(review): only the first statement of this handler is visible in the
# reviewed chunk; it is reproduced unchanged.
@app.route("/prep",methods=["GET","POST"])
def prp():
    df=prep.cleaning(pd.read_csv("dataset.csv"))["data"]