def predict_music(user):
    """
    Predicts music based on the user's choices.

    :param user: (dict) contains the user's responses for the songs
    :return selected_music: id of the recommended music, selected with an
        RBF-kernel SVM trained on the user's liked/disliked songs
    """
    connection = MongoClient(uri)
    db = connection.get_default_database()

    # short-term features of the songs the user liked
    pos_feature = []
    for p_index in user['pos_music']:
        p_music = db.music_data.find_one({"_id": ObjectId(p_index)})
        pos_feature.append(np.asarray(p_music['stFeatures']))

    # short-term features of the songs the user disliked
    neg_feature = []
    for n_index in user['neg_music']:
        n_music = db.music_data.find_one({"_id": ObjectId(n_index)})
        neg_feature.append(np.asarray(n_music['stFeatures']))

    # two classes (liked = 0, disliked = 1), one feature vector per frame
    features = [(np.hstack(pos_feature)).T, (np.hstack(neg_feature)).T]
    model = att.trainSVM_RBF(features, 0.1)

    # score every available song and keep the one with the highest
    # fraction of frames classified as "liked"
    selected_music = user["av_music"][0]
    selected_value = 0
    for i in user["av_music"]:
        s_music = db.music_data.find_one({"_id": ObjectId(i)})
        feature = np.asarray(s_music['stFeatures'])
        predict = [0, 0]
        for j in range(feature.shape[1]):
            predict[int(model.predict(feature[:, j].reshape(1, -1)))] += 1
        if (predict[0] / (predict[0] + predict[1])) > selected_value:
            selected_value = predict[0] / (predict[0] + predict[1])
            selected_music = i

    connection.close()
    return selected_music
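# Minimal usage sketch for predict_music(). The module-level names it relies
# on are assumed here: `uri` (a MongoDB connection string), `MongoClient`
# (pymongo), `ObjectId` (bson), `np` (numpy) and `att` (presumably
# pyAudioAnalysis' audioTrainTest). The id strings below are placeholders.
#
#     user = {
#         "pos_music": ["5d0c7a2e9b1e8a0012f3a001"],   # songs the user liked
#         "neg_music": ["5d0c7a2e9b1e8a0012f3a002"],   # songs the user disliked
#         "av_music":  ["5d0c7a2e9b1e8a0012f3a003",
#                       "5d0c7a2e9b1e8a0012f3a004"],   # candidate songs
#     }
#     recommended_id = predict_music(user)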
def Classify(clf, featuresAll, bestParam):
    # Train the classifier selected by `clf` on the per-class feature matrices
    # in `featuresAll`, using the (previously tuned) parameter `bestParam`.
    if clf == 'svm':
        model = aT.trainSVM(featuresAll, bestParam)
    elif clf == 'svm_rbf':
        model = aT.trainSVM_RBF(featuresAll, bestParam)
    elif clf == 'extratrees':
        model = aT.trainExtraTrees(featuresAll, bestParam)
    elif clf == 'randomforest':
        model = aT.trainRandomForest(featuresAll, bestParam)
    elif clf == 'knn':
        model = aT.trainKNN(featuresAll, bestParam)
    elif clf == 'gradientboosting':
        model = aT.trainGradientBoosting(featuresAll, bestParam)
    else:
        # avoid an UnboundLocalError for an unknown classifier name
        model = None
    return model
def getTrainClassifier(f_train, classifier_name, param):
    if classifier_name == AudioClassifierManager.__svmModelName:
        classifier = aT.trainSVM(f_train, param)
    elif classifier_name == AudioClassifierManager.__svmRbfModelName:
        classifier = aT.trainSVM_RBF(f_train, param)
    elif classifier_name == AudioClassifierManager.__knnModelName:
        classifier = aT.trainKNN(f_train, param)
    elif classifier_name == AudioClassifierManager.__randomforestModelName:
        classifier = aT.trainRandomForest(f_train, param)
    elif classifier_name == AudioClassifierManager.__gradientboostingModelName:
        classifier = aT.trainGradientBoosting(f_train, param)
    elif classifier_name == AudioClassifierManager.__extratreesModelName:
        classifier = aT.trainExtraTrees(f_train, param)
    else:
        classifier = None
    return classifier
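# Both wrappers above (Classify and getTrainClassifier) map a classifier name
# to the corresponding pyAudioAnalysis training call. A minimal sketch of how
# Classify() would be invoked, assuming `aT` is pyAudioAnalysis' audioTrainTest
# and each class is represented by an [n_samples x n_dims] feature matrix; the
# random matrices below are placeholders.
import numpy as np

def _classify_demo():
    features_all = [np.random.rand(40, 21),   # class 0 samples
                    np.random.rand(40, 21)]   # class 1 samples
    svm_model = Classify('svm', features_all, bestParam=1.0)
    return svm_model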
def evaluateclassifier(features, class_names, n_exp, classifier_name, Params,
                       parameterMode, perTrain=0.90):
    '''
    ARGUMENTS:
        features:        a list ([numOfClasses x 1]) whose elements contain
                         numpy matrices of features. Each matrix features[i]
                         of class i is [n_samples x numOfDimensions]
        class_names:     list of class names (strings)
        n_exp:           number of cross-validation experiments
        classifier_name: svm or knn or randomforest
        Params:          list of classifier parameters (for parameter tuning
                         during cross-validation)
        parameterMode:   0: choose parameters that lead to maximum overall
                            classification ACCURACY
                         1: choose parameters that lead to maximum overall
                            f1 MEASURE
    RETURNS:
        bestParam: the value of the input parameter that optimizes the
                   selected performance measure
    '''
    # feature normalization:
    (features_norm, MEAN, STD) = normalizeFeatures(features)
    # features_norm = features
    n_classes = len(features)
    ac_all = []
    f1_all = []
    precision_classes_all = []
    recall_classes_all = []
    f1_classes_all = []
    cms_all = []

    # compute total number of samples:
    n_samples_total = 0
    for f in features:
        n_samples_total += f.shape[0]
    if n_samples_total > 1000 and n_exp > 50:
        n_exp = 50
        print("Number of training experiments changed to 50 "
              "due to high number of samples")
    if n_samples_total > 2000 and n_exp > 10:
        n_exp = 10
        print("Number of training experiments changed to 10 "
              "due to high number of samples")

    for Ci, C in enumerate(Params):
        # for each param value
        cm = numpy.zeros((n_classes, n_classes))
        for e in range(n_exp):
            # for each cross-validation iteration:
            print("Param = {0:.5f} - classifier Evaluation "
                  "Experiment {1:d} of {2:d}".format(C, e + 1, n_exp))
            # split features:
            f_train, f_test = randSplitFeatures(features_norm, perTrain)
            # train multi-class svms:
            if classifier_name == "svm":
                classifier = trainSVM(f_train, C)
            elif classifier_name == "svm_rbf":
                classifier = trainSVM_RBF(f_train, C)
            elif classifier_name == "knn":
                classifier = trainKNN(f_train, C)
            elif classifier_name == "randomforest":
                classifier = trainRandomForest(f_train, C)
            elif classifier_name == "gradientboosting":
                classifier = trainGradientBoosting(f_train, C)
            elif classifier_name == "extratrees":
                classifier = trainExtraTrees(f_train, C)
            elif classifier_name == "logisticregression":
                classifier = trainLogisticRegression(f_train, C)

            # confusion matrix of this experiment:
            cmt = numpy.zeros((n_classes, n_classes))
            for c1 in range(n_classes):
                n_test_samples = len(f_test[c1])
                res = numpy.zeros((n_test_samples, 1))
                for ss in range(n_test_samples):
                    [res[ss], _] = classifierWrapperHead(classifier,
                                                         classifier_name,
                                                         f_test[c1][ss])
                for c2 in range(n_classes):
                    cmt[c1][c2] = float(len(numpy.nonzero(res == c2)[0]))
            cm = cm + cmt
        cm = cm + 0.0000000010
        rec = numpy.zeros((cm.shape[0], ))
        pre = numpy.zeros((cm.shape[0], ))

        for ci in range(cm.shape[0]):
            rec[ci] = cm[ci, ci] / numpy.sum(cm[ci, :])
            pre[ci] = cm[ci, ci] / numpy.sum(cm[:, ci])
        precision_classes_all.append(pre)
        recall_classes_all.append(rec)
        f1 = 2 * rec * pre / (rec + pre)
        f1_classes_all.append(f1)
        ac_all.append(numpy.sum(numpy.diagonal(cm)) / numpy.sum(cm))
        cms_all.append(cm)
        f1_all.append(numpy.mean(f1))

    print("\t\t", end="")
    for i, c in enumerate(class_names):
        if i == len(class_names) - 1:
            print("{0:s}\t\t".format(c), end="")
        else:
            print("{0:s}\t\t\t".format(c), end="")
    print("OVERALL")
    print("\tC", end="")
    for c in class_names:
        print("\tPRE\tREC\tf1", end="")
    print("\t{0:s}\t{1:s}".format("ACC", "f1"))
    best_ac_ind = numpy.argmax(ac_all)
    best_f1_ind = numpy.argmax(f1_all)
    for i in range(len(precision_classes_all)):
        print("\t{0:.3f}".format(Params[i]), end="")
        for c in range(len(precision_classes_all[i])):
            print("\t{0:.1f}\t{1:.1f}\t{2:.1f}".format(
                100.0 * precision_classes_all[i][c],
                100.0 * recall_classes_all[i][c],
                100.0 * f1_classes_all[i][c]), end="")
        print("\t{0:.1f}\t{1:.1f}".format(100.0 * ac_all[i],
                                          100.0 * f1_all[i]), end="")
        if i == best_f1_ind:
            print("\t best f1", end="")
        if i == best_ac_ind:
            print("\t best Acc", end="")
        print("")

    if parameterMode == 0:
        # keep parameters that maximize overall classification accuracy:
        print("Confusion Matrix:")
        printConfusionMatrix(cms_all[best_ac_ind], class_names)
        return Params[best_ac_ind]
    elif parameterMode == 1:
        # keep parameters that maximize overall f1 measure:
        print("Confusion Matrix:")
        printConfusionMatrix(cms_all[best_f1_ind], class_names)
        return Params[best_f1_ind]
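# A small sketch of how evaluateclassifier() is typically driven: one feature
# matrix per class and a grid of candidate parameter values. It assumes the
# helpers it calls (normalizeFeatures, randSplitFeatures, trainSVM,
# classifierWrapperHead, printConfusionMatrix) live in the same module, as in
# pyAudioAnalysis' audioTrainTest; the random feature matrices are placeholders.
import numpy

def _param_tuning_demo():
    features = [numpy.random.rand(120, 34),    # class "music"
                numpy.random.rand(120, 34)]    # class "speech"
    best_c = evaluateclassifier(features, ["music", "speech"],
                                n_exp=10, classifier_name="svm",
                                Params=[0.01, 0.1, 1.0, 5.0],
                                parameterMode=0, perTrain=0.90)
    return best_c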
def featureAndTrain(list_of_dirs, mt_win, mt_step, st_win, st_step,
                    classifier_type, model_name,
                    compute_beat=False, perTrain=0.90,
                    feats=["gfcc", "mfcc"]):
    '''
    This function is used as a wrapper to segment-based audio feature
    extraction and classifier training.
    ARGUMENTS:
        list_of_dirs:    list of paths of directories. Each directory
                         contains a single audio class whose samples are
                         stored in separate WAV files.
        mt_win, mt_step: mid-term window length and step
        st_win, st_step: short-term window and step
        classifier_type: "svm" or "knn" or "randomforest" or
                         "gradientboosting" or "extratrees"
        model_name:      name of the model to be saved
    RETURNS:
        None. Resulting classifier along with the respective model
        parameters are saved on files.
    '''

    # STEP A: Feature Extraction:
    [features, classNames, _] = aF.dirsWavFeatureExtraction(
        list_of_dirs, mt_win, mt_step, st_win, st_step,
        compute_beat=compute_beat, feats=feats)

    if len(features) == 0:
        print("trainSVM_feature ERROR: No data found in any input folder!")
        return

    n_feats = features[0].shape[1]
    feature_names = ["features" + str(d + 1) for d in range(n_feats)]

    writeTrainDataToARFF(model_name, features, classNames, feature_names)

    for i, f in enumerate(features):
        if len(f) == 0:
            print("trainSVM_feature ERROR: " + list_of_dirs[i] +
                  " folder is empty or non-existing!")
            return

    # STEP B: classifier Evaluation and Parameter Selection:
    if classifier_type == "svm" or classifier_type == "svm_rbf":
        classifier_par = numpy.array([0.001, 0.01, 0.5, 1.0, 5.0, 10.0, 20.0])
    elif classifier_type == "randomforest":
        classifier_par = numpy.array([10, 25, 50, 100, 200, 500])
    elif classifier_type == "knn":
        classifier_par = numpy.array([1, 3, 5, 7, 9, 11, 13, 15])
    elif classifier_type == "gradientboosting":
        classifier_par = numpy.array([10, 25, 50, 100, 200, 500])
    elif classifier_type == "extratrees":
        classifier_par = numpy.array([10, 25, 50, 100, 200, 500])
    elif classifier_type == "logisticregression":
        classifier_par = numpy.array([0.01, 0.1, 1, 5])

    # get optimal classifier parameter (drop feature vectors with NaN/Inf):
    features2 = []
    for f in features:
        fTemp = []
        for i in range(f.shape[0]):
            temp = f[i, :]
            if (not numpy.isnan(temp).any()) and (not numpy.isinf(temp).any()):
                fTemp.append(temp.tolist())
            else:
                print("NaN Found! Feature vector not used for training")
        features2.append(numpy.array(fTemp))
    features = features2

    bestParam = evaluateclassifier(features, classNames, 300, classifier_type,
                                   classifier_par, 0, perTrain)

    print("Selected params: {0:.5f}".format(bestParam))

    C = len(classNames)
    [features_norm, MEAN, STD] = normalizeFeatures(features)  # normalize features
    MEAN = MEAN.tolist()
    STD = STD.tolist()
    featuresNew = features_norm

    # STEP C: Save the classifier to file
    if classifier_type == "svm":
        classifier = trainSVM(featuresNew, bestParam)
    elif classifier_type == "svm_rbf":
        classifier = trainSVM_RBF(featuresNew, bestParam)
    elif classifier_type == "randomforest":
        classifier = trainRandomForest(featuresNew, bestParam)
    elif classifier_type == "gradientboosting":
        classifier = trainGradientBoosting(featuresNew, bestParam)
    elif classifier_type == "extratrees":
        classifier = trainExtraTrees(featuresNew, bestParam)
    elif classifier_type == "logisticregression":
        classifier = trainLogisticRegression(featuresNew, bestParam)

    if classifier_type == "knn":
        [X, Y] = listOfFeatures2Matrix(featuresNew)
        X = X.tolist()
        Y = Y.tolist()
        fo = open(model_name, "wb")
        cPickle.dump(X, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(Y, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(MEAN, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(STD, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(classNames, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(bestParam, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(mt_win, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(mt_step, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(st_win, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(st_step, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(compute_beat, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        fo.close()
    elif classifier_type == "svm" or classifier_type == "svm_rbf" or \
            classifier_type == "randomforest" or \
            classifier_type == "gradientboosting" or \
            classifier_type == "extratrees" or \
            classifier_type == "logisticregression":
        with open(model_name, 'wb') as fid:
            cPickle.dump(classifier, fid)
        fo = open(model_name + "MEANS", "wb")
        cPickle.dump(MEAN, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(STD, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(classNames, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(mt_win, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(mt_step, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(st_win, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(st_step, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(compute_beat, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        fo.close()
def main(rootName, modelType, classifierParam, signal_type):
    CMall = numpy.zeros((2, 2))
    if modelType != "svm" and modelType != "svm_rbf":
        C = [int(classifierParam)]
    else:
        C = [(classifierParam)]
    F1s = []
    Accs = []
    for ifold in range(0, 10):  # for each fold
        dirName = rootName + os.sep + "fold_{0:d}".format(
            ifold)  # get fold path name
        classNamesTrain, featuresTrain = dirFeatureExtraction([
            os.path.join(dirName, "train", "fail"),
            os.path.join(dirName, "train", "success")
        ], signal_type)  # TRAINING data feature extraction
        bestParam = aT.evaluateClassifier(
            featuresTrain, classNamesTrain, 2, modelType, C, 0,
            0.90)  # internal cross-validation (for param selection)
        classNamesTest, featuresTest = dirFeatureExtraction([
            os.path.join(dirName, "test", "fail"),
            os.path.join(dirName, "test", "success")
        ], signal_type)  # TESTING data feature extraction
        [featuresTrainNew, MEAN, STD] = aT.normalizeFeatures(
            featuresTrain)  # training features NORMALIZATION
        if modelType == "svm":  # classifier training
            Classifier = aT.trainSVM(featuresTrainNew, bestParam)
        elif modelType == "svm_rbf":
            Classifier = aT.trainSVM_RBF(featuresTrainNew, bestParam)
        elif modelType == "randomforest":
            Classifier = aT.trainRandomForest(featuresTrainNew, bestParam)
        elif modelType == "gradientboosting":
            Classifier = aT.trainGradientBoosting(featuresTrainNew, bestParam)
        elif modelType == "extratrees":
            Classifier = aT.trainExtraTrees(featuresTrainNew, bestParam)

        CM = numpy.zeros((2, 2))  # evaluation on testing data
        for iC, f in enumerate(featuresTest):  # for each class
            for i in range(f.shape[0]):  # for each testing sample (feature vector)
                curF = f[i, :]  # get feature vector
                curF = (curF - MEAN) / STD  # normalize test feature vector
                winnerClass = classNamesTrain[int(
                    aT.classifierWrapper(
                        Classifier, modelType,
                        curF)[0])]  # classify and get winner class
                trueClass = classNamesTest[iC]  # get groundtruth class
                CM[classNamesTrain.index(trueClass)][classNamesTrain.index(
                    winnerClass)] += 1  # update confusion matrix
        CMall += CM  # update overall confusion matrix
        Recall, Precision, F1 = computePreRec(
            CM, classNamesTrain)  # get recall, precision and F1 (per class)
        Acc = numpy.diagonal(CM).sum() / CM.sum()  # get overall accuracy
        F1s.append(numpy.mean(F1))  # append average F1
        Accs.append(Acc)  # append classification accuracy
    print()
    print("FINAL RESULTS")
    print()
    print("----------------------------------")
    print("fold\tacc\tf1")
    print("----------------------------------")
    for i in range(len(F1s)):
        print("{0:d}\t{1:.1f}\t{2:.1f}".format(i, 100 * Accs[i], 100 * F1s[i]))
    Acc = numpy.diagonal(CMall).sum() / CMall.sum()
    Recall, Precision, F1 = computePreRec(CMall, classNamesTrain)
    print("----------------------------------")
    print("{0:s}\t{1:.1f}\t{2:.1f}".format("Avg", 100 * numpy.mean(Accs),
                                           100 * numpy.mean(F1s)))
    print("{0:s}\t{1:.1f}\t{2:.1f}".format("Av CM", 100 * Acc,
                                           100 * numpy.mean(F1)))
    print("----------------------------------")
    print()
    print("Overall Confusion matrix:")
    aT.printConfusionMatrix(CMall, classNamesTrain)
    print()
    print("FAIL Recall = {0:.1f}".format(
        100 * Recall[classNamesTrain.index("fail")]))
    print("FAIL Precision = {0:.1f}".format(
        100 * Precision[classNamesTrain.index("fail")]))
    print("SUCCESS Recall = {0:.1f}".format(
        100 * Recall[classNamesTrain.index("success")]))
    print("SUCCESS Precision = {0:.1f}".format(
        100 * Precision[classNamesTrain.index("success")]))
    return CMall, Acc, Recall, Precision, F1
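# Sketch of invoking main(): it expects `rootName` to contain ten fold
# directories (fold_0 ... fold_9), each with train/ and test/ subfolders that
# in turn hold "fail" and "success" WAV directories. The path and the
# `signal_type` value below are placeholders; valid signal types depend on
# what dirFeatureExtraction() supports.
#
#     CMall, Acc, Recall, Precision, F1 = main("dataset_folds", "svm_rbf",
#                                              1.0, "audio")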