# Load the per-genre MSD training frames t2..t5 (input1 / msd_train_t1.pkl is
# loaded earlier in this file).  `with` closes each file handle promptly; the
# original `pickle.load(open(...))` form leaked five open file objects.
# NOTE(review): pickle.load is unsafe on untrusted data — these pickles are
# presumably generated locally by this project; confirm before deploying.
with open("msd_train_t2.pkl", "rb") as fh:
    input2 = pickle.load(fh)
with open("msd_train_t3.pkl", "rb") as fh:
    input3 = pickle.load(fh)
with open("msd_train_t4.pkl", "rb") as fh:
    input4 = pickle.load(fh)
with open("msd_train_t5.pkl", "rb") as fh:
    input5 = pickle.load(fh)


def _crop_and_clean(df, maxval):
    """Cap the frame's per-class row count via crop_rock, then drop rows
    whose Genre is uncategorised ('UNCAT') or that contain missing values.
    Returns a new DataFrame; *df* is not modified."""
    cropped = crop_rock.drop_excess_rows(df, maxval)
    cropped = cropped[cropped['Genre'] != 'UNCAT']
    return cropped.dropna()


# The individual maxvalN / filteredN names are kept because later code in
# this script refers to them.
maxval1 = crop_rock.find_second_max_value(input1)
maxval2 = crop_rock.find_second_max_value(input2)
maxval3 = crop_rock.find_second_max_value(input3)
maxval4 = crop_rock.find_second_max_value(input4)
maxval5 = crop_rock.find_second_max_value(input5)

filtered1 = _crop_and_clean(input1, maxval1)
filtered2 = _crop_and_clean(input2, maxval2)
filtered3 = _crop_and_clean(input3, maxval3)
filtered4 = _crop_and_clean(input4, maxval4)
filtered5 = _crop_and_clean(input5, maxval5)
# Remove rows with missing values (NaN) and uncategorised ('UNCAT') genres,
# then separate the label column from the features.  'Track ID' and 'Year'
# are identifiers rather than audio features, so they are excluded from X.
df_full = df_full[df_full["Genre"] != "UNCAT"]
df_full = df_full.dropna()
y_full = df_full["Genre"]
X_full = df_full.drop(["Genre", "Track ID", "Year"], axis=1)

# Split the data into 70% training / 30% validation.
# sklearn.cross_validation was removed in scikit-learn 0.20; prefer the
# modern module and fall back only for very old installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18
    from sklearn.cross_validation import train_test_split

X_train, X_validation, y_train, y_validation = \
    train_test_split(X_full, y_full, train_size=0.7, random_state=42)
print("DEBUG: Data split")

# Re-attach the labels to the training features so the cropping helpers see
# the 'Genre' column; join='inner' keeps only rows present in both pieces.
df_train_toCrop = pd.concat([y_train, X_train], axis=1, join='inner')

# Crop the training set so no single genre dominates, then split back into
# features and labels for the classifiers below.
maxval = crop_rock.find_second_max_value(df_train_toCrop)
df_cropped = crop_rock.drop_excess_rows(df_train_toCrop, maxval)
y_cropped = df_cropped["Genre"]
X_cropped = df_cropped.drop(["Genre"], axis=1)