def do_one_cv_classify_predeffolds_valid(theinput):
    (c, gamma, nf, output, input, output_valid, input_valid,
     useprob, fold_start, fold_start_valid, perfmetric) = theinput
    param = svm.svm_parameter('-c %g -g %g -b %d' % (c, gamma, int(useprob)))
    prob = svm.svm_problem(output, input)
    fold_start_p = (c_int * len(fold_start))()
    for i in xrange(len(fold_start)):
        fold_start_p[i] = fold_start[i]
    prob_valid = svm.svm_problem(output_valid, input_valid)
    fold_start_p_valid = (c_int * len(fold_start_valid))()
    for i in xrange(len(fold_start_valid)):
        fold_start_p_valid[i] = fold_start_valid[i]
    target = (c_double * prob_valid.l)()
    posclass = output[0]
    libsvm.svm_cross_validation_sepsets(prob, prob_valid, fold_start_p,
                                        fold_start_p_valid, param, nf, target)
    ys = prob.y[:prob_valid.l]
    db = array([[ys[i], target[i]] for i in range(prob_valid.l)])
    del target
    del fold_start_p
    del fold_start_p_valid
    neg = len([x for x in ys if x != posclass])
    pos = prob_valid.l - neg
    [topacc, topphi, minfpfnratio, topf1, auc, optbias] = optimize_results(db, neg, pos, posclass, perfmetric)
    return topacc, topphi, minfpfnratio, topf1, auc, optbias
def train(self, labels, data):
    '''
    Train the classifier.

    @param labels: A list of class labels.
    @param data: A 2D array or list of feature vectors. One feature vector per row.
    '''
    # Check the types and convert to np arrays
    if isinstance(data, (list, tuple)):
        data = np.array(data, dtype=np.double)
    labels = np.array(labels, dtype=np.double)

    # Preprocess the data
    labels, data = self._preprocessor.train(labels, data)
    labels, data = self._label_scale.train(labels, data)

    # Create the svm parameter data and problem description
    param = svm.svm_parameter(svm_type=svm.EPSILON_SVR, kernel_type=svm.RBF,
                              p=self._epsilon, gamma=self._gamma)
    prob = svm.svm_problem(labels.tolist(), data.tolist())

    # Train the svm
    self._model = svm.svm_model(prob, param)
def do_one_cv_classify_predeffolds_multi(theinput):
    c, gamma, nf, output, input, useprob, fold_start = theinput
    param = svm.svm_parameter('-c %g -g %g -b %d' % (c, gamma, int(useprob)))
    prob = svm.svm_problem(output, input)
    target = (c_double * prob.l)()
    posclass = output[0]
    fold_start_p = (c_int * len(fold_start))()
    for i in xrange(len(fold_start)):
        fold_start_p[i] = fold_start[i]
    libsvm.svm_cross_validation_labeltargets(prob, fold_start_p, param, nf, target)
    acc = len([i for i in xrange(len(output)) if output[i] == target[i]]) * 1.0 / prob.l
    del target
    del fold_start_p
    return acc
def build_problem(img_kind, subdir="data/"):
    # Images of the target kind are positives (+1); all other face images
    # are negatives (-1). (The original re-assigned subdir = "data/" here,
    # silently overriding the parameter; that has been removed.)
    classes = []
    data = []
    the_ones = glob.glob(subdir + "f_" + img_kind + "*.jpg")
    all_of_them = glob.glob(subdir + "f_*_*.jpg")
    the_others = [x for x in all_of_them if x not in the_ones]
    for x in the_ones:
        classes.append(1)
        data.append(get_image_features(cv.LoadImageM(x), True, img_kind))
    for x in the_others:
        classes.append(-1)
        data.append(get_image_features(cv.LoadImageM(x), True, img_kind))
    prob = svm.svm_problem(classes, data)
    return prob
def train(self, c, g, probability=True, compensation=True,
          path=None, filename=None, save=True):
    if filename is None:
        filename = os.path.splitext(self.getOption('strArffFileName'))[0]
        filename += '.model'
    if path is None:
        path = self.dctEnvPaths['data']
    param = svm.svm_parameter(kernel_type=svm.RBF, C=c, gamma=g,
                              probability=1 if probability else 0)

    labels, samples = self.getData(normalize=True)

    # because we train the SVM with dict we need to redefine the zero-insert
    self.hasZeroInsert = False
    if self.oClassifier is not None:
        self.oClassifier.setOption('hasZeroInsert', True)

    if compensation:
        weight, weight_label = self._calculateCompensation(labels)
        param.weight = weight
        param.weight_label = weight_label
        param.nr_weight = len(weight)

    problem = svm.svm_problem(labels, samples)
    model = svm.svm_model(problem, param)
    if save:
        model.save(os.path.join(path, filename))
    return problem, model
def generate_model(self, variant_name, models_folder):
    training_file = variant_name + ".t"
    if self.feature_scaling:
        self.scale_features(variant_name, models_folder)
        training_file += ".scale"
    (y, x) = svm_read_problem(training_file)
    self.m_prob = svm.svm_problem(y, x, self.m_params.kernel_type == PRECOMPUTED)

    # run libsvm's grid.py to pick C and gamma
    libsvm_path = os.environ['LIBSVM_PATH']
    scaled_filename = os.path.abspath(training_file)
    cp = "python grid.py " + scaled_filename
    curdir = os.getcwd()
    os.chdir(libsvm_path + "/tools/")
    result = call_process(cp)
    os.chdir(curdir)

    # grid.py prints the best (C, gamma, rate) on its second-to-last line
    C, g, rate = [float(l) for l in result.split("\n")[-2].split(" ")]
    print "C: %.8f, gamma: %.8f\n" % (C, g)
    self.m_params.C = C
    self.m_params.gamma = g

    print "\n-----------------------------"
    model = svm.svm_train(self.m_prob, self.m_params)
    print "-----------------------------\n"
    svm_save_model(models_folder + variant_name + ".model", model)
def do_one_cv_classify(theinput):
    c, gamma, nf, output, input, useprob, perfmetric = theinput
    param = svm.svm_parameter('-c %g -g %g -b %d' % (c, gamma, int(useprob)))
    prob = svm.svm_problem(output, input)
    target = (c_double * prob.l)()
    posclass = output[0]
    fold_start = (c_int * 1)()
    fold_start[0] = -1
    libsvm.svm_cross_validation(prob, fold_start, param, nf, target)
    ys = prob.y[:prob.l]
    db = array([[ys[i], target[i]] for i in range(prob.l)])
    del target
    neg = len([x for x in ys if x != posclass])
    pos = prob.l - neg
    # 'posclass' (the original passed the undefined name 'posval' here)
    [topacc, topphi, minfpfnratio, topf1, auc, optbias] = optimize_results(db, neg, pos, posclass, perfmetric)
    return topacc, topphi, minfpfnratio, topf1, auc, optbias
def __init__(self, data_dictionary, model_target, kernel=LINEAR, cv_segments=10, **args):
    # Create an SVM model object

    # Check to see if a threshold has been specified in the function's arguments
    try:
        self.threshold = args['threshold']
    except KeyError:
        self.threshold = 2.3711  # if there is no 'threshold' key, then use the default (2.3711)

    # Store some object data
    model_dict = deepcopy(data_dictionary)
    self.model_target = model_target
    self.folds = cv_segments

    # Label the exceedances in the training set.
    model_dict[model_target] = self.Assign_Labels(model_dict[model_target])

    # Extract the training labels and training set
    self.training_labels = model_dict.pop(model_target)
    self.training_set = np.transpose(model_dict.values())
    self.headers = model_dict.keys()

    # Scale the covariates to [-1,1]
    self.Scale_Covariates()

    # Generate an SVM model.
    self.svm_problem = svm.svm_problem(self.training_labels, self.training_set)
    self.svm_params = {'kernel_type': kernel, 'weight_label': [0, 1], 'weight': [10, 1]}
    self.model = svm.svm_model(self.svm_problem, svm.svm_parameter(**self.svm_params))

    # Use cross-validation to find the best number of components in the model.
    self.Select_Linear_Model(-5, 10)

    # Rebuild the model, calculating the probabilities of class membership
    self.svm_params['probability'] = 1
    self.model = svm.svm_model(self.svm_problem, svm.svm_parameter(**self.svm_params))
def svm(y, K, **param_kw):
    """
    Solve the SVM problem. Return ``(alpha, b)``.

    `y`
        labels
    `K`
        precomputed kernel matrix

    Additional keyword arguments are passed on as svm parameters to
    the model.

    The wrapper is needed to precondition the precomputed matrix for
    use with libsvm, and to extract the model parameters and convert
    them into the canonical weight vector plus scalar offset. Normally
    libsvm hides these model parameters, preferring instead to provide
    a high-level model object that can be queried for results.
    """
    # libsvm's precomputed-kernel format requires a leading column of
    # 1-based sample indices.
    i = arange(1, len(K) + 1).reshape((-1, 1))
    X = hstack((i, K))
    y = asarray(y, dtype=double)
    X = asarray(X, dtype=double)
    prob = svm_problem(y, X)
    param = svm_parameter(kernel_type=PRECOMPUTED, **param_kw)
    model = svm_model(prob, param)
    return get_alpha_b(model)
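# A minimal usage sketch for the wrapper above, with hypothetical data.
# libsvm's PRECOMPUTED kernel mode expects each row to begin with a 1-based
# sample index, which is exactly what the ``hstack`` above prepends, so the
# caller only supplies the raw n-by-n Gram matrix.
from numpy import array, dot

X_raw = array([[0., 1.], [1., 0.], [1., 1.], [0., 0.]])
y_raw = [1, -1, 1, -1]
K = dot(X_raw, X_raw.T)        # linear kernel: n-by-n Gram matrix
alpha, b = svm(y_raw, K, C=1)  # canonical weight form of the trained model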
def iqr_model_train(matrix_kernel_train, labels_train, idx2clipid,
                    svm_para='-w1 50 -t 4 -b 1 -c 1'):
    """
    Light-weight SVM learning module for online IQR

    @param matrix_kernel_train: n-by-n square numpy array with kernel values
        between training data
    @param labels_train: row-wise labels of training data (1 or True indicates
        positive, 0 or False otherwise)
    @param idx2clipid: idx2clipid(row_idx) returns the clipid for the 0-base
        row in matrix
    @param svm_para: (optional) SVM learning parameter
    @rtype: dictionary with 'clipids_SVs': list of clipids for support vectors
    @return: output as a dictionary with 'clipids_SVs'
    """
    log = logging.getLogger('iqr_model_train')

    # Set training inputs: prepend the 1-based sample index required by
    # libsvm's precomputed-kernel format.
    matrix_kernel_train = np.vstack((np.arange(1, len(matrix_kernel_train) + 1),
                                     matrix_kernel_train)).T
    log.debug("Done matrix_kernel_train")

    problem = svm.svm_problem(labels_train.tolist(),
                              matrix_kernel_train.tolist(),
                              isKernel=True)
    log.debug("Done problem")
    svm_param = svm.svm_parameter(svm_para)
    log.debug("Done svm_param")

    # train model
    model = svmutil.svm_train(problem, svm_param)
    log.debug("Done train model")

    # release memory
    del problem
    del svm_param
    log.debug("Done release memory")

    # check learning failure
    if model.l == 0:
        raise Exception('svm model learning failure')
    log.debug("Done checking learning failure (no failure)")

    n_SVs = model.l
    clipids_SVs = []
    idxs_train_SVs = svmtools.get_SV_idxs_nonlinear_svm(model)
    for i in range(n_SVs):
        _idx_1base = idxs_train_SVs[i]
        _idx_0base = _idx_1base - 1
        clipids_SVs.append(idx2clipid[_idx_0base])
        model.SV[i][0].value = i + 1  # within SVM model, index needs to be 1-base
    log.debug("Done collecting support vector IDs")

    output = dict()
    output['model'] = model
    output['clipids_SVs'] = clipids_SVs
    return output
def _test_evaluation(self, allow_slow):
    """
    Test that the same predictions are made
    """
    from svm import svm_parameter, svm_problem
    from svmutil import svm_train, svm_predict

    # Generate some smallish (poly kernels take too long on anything else)
    # random data
    x, y = [], []
    for _ in range(50):
        cur_x1, cur_x2 = random.gauss(2, 3), random.gauss(-1, 2)
        x.append([cur_x1, cur_x2])
        y.append(1 + 2 * cur_x1 + 3 * cur_x2)

    input_names = ["x1", "x2"]
    df = pd.DataFrame(x, columns=input_names)
    prob = svm_problem(y, x)

    # Parameters
    base_param = "-s 3"  # model type is epsilon SVR
    non_kernel_parameters = ["", "-c 1.5 -p 0.5 -h 1", "-c 0.5 -p 0.5 -h 0"]
    kernel_parameters = [
        "",
        "-t 2 -g 1.2",  # rbf kernel
        "-t 0",  # linear kernel
        "-t 1", "-t 1 -d 2", "-t 1 -g 0.75", "-t 1 -d 0 -g 0.9 -r 2",  # poly kernel
        "-t 3", "-t 3 -g 1.3", "-t 3 -r 0.8", "-t 3 -r 0.8 -g 0.5",  # sigmoid kernel
    ]

    for param1 in non_kernel_parameters:
        for param2 in kernel_parameters:
            param_str = " ".join([base_param, param1, param2])
            print(param_str)
            param = svm_parameter(param_str)
            model = svm_train(prob, param)
            (df["prediction"], _, _) = svm_predict(y, x, model)

            spec = libsvm.convert(model, input_names=input_names, target_name="target")

            if _is_macos() and _macos_version() >= (10, 13):
                metrics = evaluate_regressor(spec, df)
                self.assertAlmostEquals(metrics["max_error"], 0)

            if not allow_slow:
                break

        if not allow_slow:
            break
def train(self, session, doc):
    # doc here is [[class,...], [{vector},...]]
    (labels, vectors) = doc.get_raw(session)
    problem = svm.svm_problem(labels, vectors)
    self.model = svm.svm_model(problem, self.param)
    modelPath = self.get_path(session, 'modelPath')
    self.model.save(str(modelPath))
    self.predicting = 1
def trainSVM(kernel, labels):
    # need to add an id number as the first column of the list
    svmKernel = column_stack((arange(1, len(kernel.tolist()) + 1), kernel))
    prob = svm_problem(labels.tolist(), svmKernel.tolist(), isKernel=True)
    param = svm_parameter('-t 4')  # -t 4: precomputed kernel
    model = svm_train(prob, param)
    return model
def train(self, dataset):
    """
    Trains the svm classifier. Converts words to real numbers for
    training, since SVM expects only numbers.
    """
    super(SvmLearner, self).train(dataset)
    prob = svm.svm_problem(self.results, self.observations)
    param = svm.svm_parameter(kernel_type=svm.LINEAR, C=10, probability=1)
    self.model = svm.svm_model(prob, param)
def train(self, trainset):
    """
    Trains the SVM.
    """
    self.n_classes = len(trainset.metadata['targets'])

    # Set LIBSVM parameters
    kernel_types = {'linear': libsvm.LINEAR, 'polynomial': libsvm.POLY,
                    'rbf': libsvm.RBF, 'sigmoid': libsvm.SIGMOID}
    if self.kernel not in kernel_types:
        raise ValueError('Invalid kernel: ' + self.kernel + '. Should be either '
                         '\'linear\', \'polynomial\', \'rbf\' or \'sigmoid\'')

    if self.label_weights is not None:
        class_to_id = trainset.metadata['class_to_id']
        nr_weight = self.n_classes
        weight_label = range(self.n_classes)
        weight = [1] * self.n_classes
        for k, v in self.label_weights.iteritems():
            weight[class_to_id[k]] = v
    else:
        nr_weight = 0
        weight_label = []
        weight = []

    libsvm_params = libsvm.svm_parameter(svm_type=libsvm.C_SVC,
                                         kernel_type=kernel_types[self.kernel],
                                         degree=self.degree,
                                         gamma=self.gamma,
                                         coef0=self.coef0,
                                         C=self.C,
                                         probability=int(self.output_probabilities),
                                         cache_size=self.cache_size,
                                         eps=self.tolerance,
                                         shrinking=int(self.shrinking),
                                         nr_weight=nr_weight,
                                         weight_label=weight_label,
                                         weight=weight)

    # Put training set in the appropriate format:
    #  if is sparse (i.e. a pair), inputs are converted to dictionaries
    #  if not, inputs are assumed to be sequences and are kept intact
    # (see the small illustration after this function)
    libsvm_inputs = []
    libsvm_targets = []
    for input, target in trainset:
        if type(input) == tuple:
            libsvm_inputs += [dict(zip(input[1], input[0]))]
        else:
            libsvm_inputs += [input]
        libsvm_targets += [float(target)]  # LIBSVM requires double-valued targets

    libsvm_problem = libsvm.svm_problem(libsvm_targets, libsvm_inputs)

    # Train SVM
    self.svm = libsvm.svm_model(libsvm_problem, libsvm_params)
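# A small illustration of the sparse-input conversion used in the loop above
# (hypothetical values): a sparse example arrives as a (values, indices) pair
# and becomes the {index: value} dictionary that the LIBSVM python wrapper
# accepts directly.
values, indices = [0.5, 1.25], [3, 17]
as_dict = dict(zip(indices, values))  # -> {3: 0.5, 17: 1.25}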
def leave_one_out(y, x, param, n='DUMMY'):
    results = []
    for i, test in enumerate(zip(y, x)):
        # hold out sample i; train on everything else
        training_y = y[:i] + y[i+1:]
        training_x = x[:i] + x[i+1:]
        problem = svm.svm_problem(training_y, training_x)
        model = svmutil.svm_train(problem, param, '-q')
        result = svmutil.svm_predict(y[i:i+1], x[i:i+1], model, '-b 1')
        results.append(result + (test[0], make_d.decode(x[i], make_d.decode_dic)))
    return results
def iterGridSearchSVM(self, c_info=None, g_info=None, fold=5,
                      probability=False, compensation=True):
    swap = lambda a, b: (b, a)
    if c_info is not None and len(c_info) >= 3:
        c_begin, c_end, c_step = c_info[:3]
    else:
        c_begin, c_end, c_step = -5, 15, 2
    if c_end < c_begin:
        c_begin, c_end = swap(c_begin, c_end)
        c_step = abs(c_step)

    if g_info is not None and len(g_info) >= 3:
        g_begin, g_end, g_step = g_info[:3]
    else:
        g_begin, g_end, g_step = -15, 3, 2
    if g_end < g_begin:
        g_begin, g_end = swap(g_begin, g_end)
        g_step = abs(g_step)

    labels, samples = self.getData(normalize=True)
    problem = svm.svm_problem(labels, samples)

    if compensation:
        weight, weight_label = self._calculateCompensation(labels)

    # total number of (C, gamma) grid points, for progress reporting
    n = (c_end - c_begin) / c_step + 1
    n *= (g_end - g_begin) / g_step + 1

    l2c = c_begin
    while l2c <= c_end:
        l2g = g_begin
        while l2g <= g_end:
            param = svm.svm_parameter(kernel_type=svm.RBF,
                                      C=2.**l2c, gamma=2.**l2g,
                                      probability=1 if probability else 0)
            if compensation:
                param.weight = weight
                param.weight_label = weight_label
                param.nr_weight = len(weight)
            predictions = svm.cross_validation(problem, param, fold)
            predictions = map(int, predictions)
            conf = ConfusionMatrix.from_lists(labels, predictions,
                                              self.class_names.keys())
            yield n, l2c, l2g, conf
            l2g += g_step
        l2c += c_step
def do_one_cv_classify_valid(theinput):
    (c, gamma, nf, output, input, output_valid, input_valid,
     useprob, perfmetric) = theinput
    param = svm.svm_parameter('-c %g -g %g -b %d' % (c, gamma, int(useprob)))
    prob = svm.svm_problem(output, input)
    prob_valid = svm.svm_problem(output_valid, input_valid)
    target = (c_double * prob_valid.l)()
    posclass = output[0]
    fold_start = (c_int * 1)()
    fold_start[0] = -1
    fold_start_valid = (c_int * 1)()
    fold_start_valid[0] = -1
    libsvm.svm_cross_validation_sepsets(prob, prob_valid, fold_start,
                                        fold_start_valid, param, nf, target)
    ys = prob.y[:prob_valid.l]
    db = array([[ys[i], target[i]] for i in range(prob_valid.l)])
    del target
    neg = len([x for x in ys if x != posclass])
    pos = prob_valid.l - neg
    # 'posclass' (the original passed the undefined name 'posval' here)
    [topacc, topphi, minfpfnratio, topf1, auc, optbias] = optimize_results(db, neg, pos, posclass, perfmetric)
    return topacc, topphi, minfpfnratio, topf1, auc, optbias
def trainmodel(self, train, cv, test, modelsavepath):
    y, x = svmutil.svm_read_problem(train)  # load the training data
    # ycv, xcv = svm_read_problem(cv)       # load the validation set
    # ytest, xtest = svm_read_problem(test) # load the test set
    prob = svm.svm_problem(y, x)
    param = svm.svm_parameter('-t 2 -c 0.5 -g 0.125 -b 1')
    model = svmutil.svm_train(prob, param)
    # re-read the training data to report training-set accuracy
    yt, xt = svmutil.svm_read_problem(train)
    p_labs, p_acc, p_vals = svmutil.svm_predict(yt, xt, model, '-b 1')
    svmutil.svm_save_model(modelsavepath, model)  # save the model
    # model = svmutil.svm_load_model('model_file')  # load a saved model
def learnModel(self, train_y, train_X):
    # scale train data to [-1, 1]
    svmScaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))
    train_X_scaledArr = svmScaler.fit_transform(train_X)

    # learn and save svm model
    X = train_X_scaledArr.tolist()
    problem = svm_problem(train_y, X)
    paramStr = '-c ' + str(self._param_c) + ' -g ' + str(self._param_g) + ' -q'
    param = svm_parameter(paramStr)
    self._model = svm_train(problem, param)
    self._scaler = svmScaler
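# A minimal companion sketch for the method above (the method name is an
# assumption, not from the source; assumes svm_predict is imported alongside
# svm_train): at prediction time the scaler fitted during training must be
# reused, since the libsvm model itself knows nothing about feature scaling.
def predictModel(self, test_y, test_X):
    test_X_scaled = self._scaler.transform(test_X).tolist()
    p_labels, p_acc, p_vals = svm_predict(test_y, test_X_scaled, self._model)
    return p_labels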
def lib_svm(self, train_file, test_file, digit0, digit1):
    features, labels = self.get_data(train_file, digit0, digit1)
    training_data = svm_problem(labels, features)
    # both branches use the RBF kernel (-t 2); only gamma differs
    if self.kernel == 'gaussian':
        params = svm_parameter('-s 0 -t 2 -c 1 -g 0.05')
    else:
        params = svm_parameter('-s 0 -t 2 -c 1 -g 0.001275')
    model = svm_train(training_data, params)
    test_features, test_labels = self.get_data(test_file, digit0, digit1)
    p_labels, p_acc, p_vals = svm_predict(test_labels, test_features, model)
def train_test_svm(self):
    logging.debug("TRAINING Samples: " + str(len(self._running_X_train)))
    logging.debug("TESTING Samples: " + str(len(self._running_X_test)))
    svm_problem = svm.svm_problem(self._running_y_train, self._running_X_train)
    self.svm_model = svmutil.svm_train(svm_problem, self.svm_param)
    predicted_labels, predicted_mse, predicted_probs = \
        svmutil.svm_predict(self._running_y_test, self._running_X_test,
                            self.svm_model, "-b 1")
    res = sklearn.metrics.accuracy_score(self._running_y_test, predicted_labels)
    self.refinement_results.append(res)
    print("RESULT: " + str(res * 100))
def lib_svm(train_file, test_file, kernel):
    print("inside libsvm")
    features, labels = get_data_from_csv(train_file)
    print(features)
    training_data = svm_problem(labels, features)
    # both branches use the RBF kernel (-t 2); only gamma differs
    if kernel == 'gaussian':
        params = svm_parameter('-s 0 -t 2 -c 1 -g 0.05')
    else:
        params = svm_parameter('-s 0 -t 2 -c 1 -g 0.001275')
    model = svm_train(training_data, params)
    test_features, test_labels = get_data_from_csv(test_file)
    p_labels, p_acc, p_vals = svm_predict(test_labels, test_features, model)
    return p_labels, p_acc, p_vals
def test(word, documents):
    import svm, random
    docs = [d.copy() for d in documents if d[reverse_map[word]]]
    nondocs = [d.copy() for d in documents if not d[reverse_map[word]]]
    # cap the negatives at 5x the positives
    nondocs = random.sample(nondocs, min(5 * len(docs), len(nondocs)))
    print float(len(nondocs)) / (len(docs) + len(nondocs))
    cats = [1 for i in docs] + [0 for i in nondocs]
    obs = docs + nondocs
    # zero out the target word's own feature so it cannot leak the label
    for i in xrange(len(obs)):
        obs[i][reverse_map[word]] = 0.
    zobs = zip(obs, cats)
    random.shuffle(zobs)
    obs, cats = zip(*zobs)
    params = svm.svm_parameter(C=1, kernel_type=svm.LINEAR)
    problem = svm.svm_problem(cats, obs)
    target = svm.cross_validation(problem, params, 20)
    # cross-validation accuracy (the original iterated 'for i in cats',
    # which only ever compared indices 0 and 1)
    return sum(target[i] == cats[i] for i in xrange(len(cats))) / float(len(cats))
def do_one_cv(theinput):
    nu, c, gamma, nf, output, input, bins = theinput
    param = svm.svm_parameter('-s %d -t %d -n %g -c %g -g %g' %
                              (svm.NU_SVR, svm.RBF, nu, c, gamma))
    prob = svm.svm_problem(output, input)
    target = (c_double * prob.l)()
    fold_start = (c_int * 1)()
    fold_start[0] = -1
    libsvm.svm_cross_validation_labeltargets(prob, fold_start, param, nf, target)
    MSE, SCC = evaluations(prob.y[:prob.l], target[:prob.l], bins)
    del target
    return MSE, SCC
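# For reference, the format string above expands to concrete libsvm flags;
# with libsvm's constants NU_SVR == 4 and RBF == 2, e.g. nu=0.5, c=1,
# gamma=0.125 yields the parameter string:
#
#   '-s 4 -t 2 -n 0.5 -c 1 -g 0.125'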
def do_one_cv_classify_predeffolds(theinput):
    c, gamma, nf, output, input, useprob, fold_start, perfmetric = theinput
    param = svm.svm_parameter('-c %g -g %g -b %d' % (c, gamma, int(useprob)))
    prob = svm.svm_problem(output, input)
    fold_start_p = (c_int * len(fold_start))()
    for i in xrange(len(fold_start)):
        fold_start_p[i] = fold_start[i]
    target = (c_double * prob.l)()
    posclass = output[0]
    libsvm.svm_cross_validation(prob, fold_start_p, param, nf, target)
    ys = prob.y[:prob.l]
    db = array([[ys[i], target[i]] for i in range(prob.l)])
    del target
    del fold_start_p
    neg = len([x for x in ys if x != posclass])
    pos = prob.l - neg
    [topacc, topphi, minfpfnratio, topf1, auc, optbias] = optimize_results(db, neg, pos, posclass, perfmetric)
    return topacc, topphi, minfpfnratio, topf1, auc, optbias
def train(self, search=False, **kwargs):
    """
    Train the SVM on the dataset. For RBF kernels (the default), an
    optional meta-parameter search can be performed.

    :key search: optional name of grid search class to use for RBF kernels:
        'GridSearch' or 'GridSearchDOE'
    :key log2g: base 2 log of the RBF width parameter
    :key log2C: base 2 log of the slack parameter
    :key searchlog: filename into which to dump the search log
    :key others: ...are passed through to the grid search and/or libsvm
    """
    self.setParams(**kwargs)
    problem = svm_problem(self.ds['target'].flatten(),
                          self.ds['input'].tolist())
    if search:
        # this is a bit of a hack...
        model = eval(search + "(problem, self.svmtarget, cmin=[0,-7], cmax=[25,1], cstep=[0.5,0.2], plotflag=self.plot, searchlog=self.searchlog, **self.params)")
    else:
        param = svm_parameter(**self.params)
        model = svm_model(problem, param)
        logging.info("Training completed with parameters:")
        logging.info(repr(param))
    self.svm.setModel(model)
def do_one_cv_classify_multi(theinput):
    c, gamma, nf, output, input, useprob = theinput
    param = svm.svm_parameter('-c %g -g %g -b %d' % (c, gamma, int(useprob)))
    prob = svm.svm_problem(output, input)
    target = (c_double * prob.l)()
    posclass = output[0]
    fold_start = (c_int * 1)()
    fold_start[0] = -1
    libsvm.svm_cross_validation_labeltargets(prob, fold_start, param, nf, target)
    acc = len([i for i in xrange(len(output)) if output[i] == target[i]]) * 1.0 / prob.l
    return acc
def bench_svm(X, Y, T):
    """
    bench with swig-generated wrappers that come with libsvm
    """
    import svm

    X1 = X.tolist()
    Y1 = Y.tolist()
    T1 = T.tolist()

    gc.collect()

    # start time
    tstart = datetime.now()
    problem = svm.svm_problem(Y1, X1)
    param = svm.svm_parameter(svm_type=0, kernel_type=0)
    model = svm.svm_model(problem, param)
    for i in T1:
        model.predict(i)
    delta = (datetime.now() - tstart)
    # stop time
    svm_results.append(delta.seconds + delta.microseconds / mu_second)
def train(self, examples, parameters=None):
    self.isBinary = self.isBinaryProblem(examples)
    examples = self.filterTrainingSet(examples)
    ExampleUtils.writeExamples(examples, self.tempDir + "/train.dat")

    # prepare parameters:
    if parameters.has_key("c"):
        assert(not parameters.has_key("C"))
        parameters["C"] = parameters["c"]
        del parameters["c"]
    totalExamples = float(sum(self.classes.values()))
    weight_label = self.classes.keys()
    weight_label.sort()
    weight = []
    for k in weight_label:
        weight.append(1.0 - self.classes[k] / totalExamples)
    libSVMparam = svm.svm_parameter(nr_weight=len(self.classes),
                                    weight_label=weight_label,
                                    weight=weight, **parameters)
    labels = []
    samples = []
    for example in examples:
        labels.append(example[1])
        samples.append(example[2])
    problem = svm.svm_problem(labels, samples)
    self.model = svm.svm_model(problem, libSVMparam)
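# A worked example of the class weighting above (hypothetical counts): with
# self.classes = {1: 30, -1: 70} and totalExamples = 100.0, the loop assigns
# weight 1.0 - 30/100 = 0.7 to class 1 and 1.0 - 70/100 = 0.3 to class -1,
# i.e. the minority class receives the larger multiplier on C.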
def do_training(classifier_name, train_x, train_y, test_x, test_y):
    model_save_file = str('./models/') + classifier_name + str('.model')

    if classifier_name == 'LIBSVM':
        prob = svm_problem(np.array(train_y).tolist(), np.array(train_x).tolist())
        param = svm_parameter('-s 1 -t 1 -q -d 3')
        # param = svm_parameter('-t 2 -q')
        model = svm_train(prob, param)
        svm_save_model('./models/{}.model'.format(classifier_name), model)
        svm_predict(np.array(test_y).tolist(), np.array(test_x).tolist(), model)
        return model

    model_save = {}
    classifiers = {
        'NB': naive_bayes_classifier,
        'KNN': knn_classifier,
        'LR': logistic_regression_classifier,
        'RF': random_forest_classifier,
        'DT': decision_tree_classifier,
        'SVM': svm_classifier,
        'SVMCV': svm_cross_validation,
        'GBDT': gradient_boosting_classifier,
        'ADA': ada_boosting_classifier,
        'MLP': mlp_classifier,
        'XGBOOST': xgboost_classifier,
    }
    model = classifiers[classifier_name](train_x, train_y)
    model_save[classifier_name] = model
    predict = model.predict(test_x)
    accuracy = metrics.accuracy_score(test_y, predict)
    print('accuracy: %.2f%%' % (100 * accuracy))
    jl.dump(model_save, model_save_file)
    return model
def train(request):
    points = models.Point2d.objects.all()

    # Storing the information to be presented to SVM
    labels = []
    inputs = []

    # For each point, store the information into arrays
    for p in points:
        labels.append(p.label)
        inputs.append([p.x, p.y])

    prob = svm.svm_problem(labels, inputs)
    param = svm.svm_parameter('-t 2 -c 100')
    model = svmutil.svm_train(prob, param)

    try:
        svmutil.svm_save_model('libsvm.model', model)
    except Exception as e:
        print "error: ", e, "\n"

    data = {"status": "trained"}
    return json(data)
def train(self, c, g, probability=True, compensation=True,
          path=None, filename=None, save=True):
    if filename is None:
        filename = splitext(self.arff_file)[0]
        filename += '.model'
    if path is None:
        path = self.data_dir
    param = svm.svm_parameter(kernel_type=svm.RBF, C=c, gamma=g,
                              probability=1 if probability else 0)

    labels, samples = self.getData(normalize=True)

    # because we train the SVM with dict we need to redefine the zero-insert
    self.has_zero_insert = False
    if self.classifier is not None:
        self.classifier.setOption('hasZeroInsert', True)

    if compensation:
        weight, weight_label = self._calculateCompensation(labels)
        param.weight = weight
        param.weight_label = weight_label
        param.nr_weight = len(weight)

    problem = svm.svm_problem(labels, samples)
    model = svm.svm_model(problem, param)
    if save:
        model.save(os.path.join(path, filename))
    return problem, model
def rank(self, pos, neg):
    """
    Rank the currently indexed elements given ``pos`` positive and ``neg``
    negative exemplar descriptor elements.

    :param pos: Iterable of positive exemplar DescriptorElement instances.
        This may be optional for some implementations.
    :type pos: collections.Iterable[smqtk.representation.DescriptorElement]

    :param neg: Iterable of negative exemplar DescriptorElement instances.
        This may be optional for some implementations.
    :type neg: collections.Iterable[smqtk.representation.DescriptorElement]

    :return: Map of indexed descriptor elements to a rank value between
        [0, 1] (inclusive) range, where a 1.0 means most relevant and 0.0
        meaning least relevant.
    :rtype: dict[smqtk.representation.DescriptorElement, float]
    """
    # Notes:
    # - Pos and neg exemplars may be in our index.

    #
    # SVM model training
    #
    # Copy pos descriptors into a set for repeated iteration
    #: :type: set[smqtk.representation.DescriptorElement]
    pos = set(pos)

    # Creating training matrix and labels
    train_labels = []
    train_vectors = []
    num_pos = 0
    for d in pos:
        train_labels.append(+1)
        train_vectors.append(d.vector().tolist())
        num_pos += 1
    self._log.debug("Positives given: %d", num_pos)

    # When no negative examples are given, naively pick most distant example
    # in our dataset, using HI metric, for each positive example
    neg_autoselect = set()
    if not neg:
        self._log.info("Auto-selecting negative examples. (%d per positive)",
                       self._autoneg_select_ratio)
        # ``train_vectors`` only composed of positive examples at this point
        for p in pos:
            # where d is the distance vector to descriptor elements in cache
            d = histogram_intersection_distance(p.vector(), self._descr_matrix)
            # Scan vector for max distance index
            # - Allow variable number of maximally distance descriptors to
            #   be picked per positive.
            m_set = {}  # track most distant neighbors
            m_val = -float('inf')  # track smallest distance of most distant neighbors
            for i in xrange(d.size):
                if d[i] > m_val:
                    m_set[d[i]] = i
                    if len(m_set) > self._autoneg_select_ratio:
                        if m_val in m_set:
                            del m_set[m_val]
                        m_val = min(m_set)
            for i in m_set.itervalues():
                neg_autoselect.add(self._descr_cache[i])
        # Remove any positive examples from auto-selected results
        neg_autoselect.difference_update(pos)
        self._log.debug("Auto-selected negative descriptors [%d]: %s",
                        len(neg_autoselect), neg_autoselect)

    num_neg = 0
    for d in neg:
        train_labels.append(-1)
        train_vectors.append(d.vector().tolist())
        num_neg += 1
    for d in neg_autoselect:
        train_labels.append(-1)
        train_vectors.append(d.vector().tolist())
        num_neg += 1

    if not num_pos:
        raise ValueError("No positive examples provided.")
    elif not num_neg:
        raise ValueError("No negative examples provided.")

    # Training SVM model
    self._log.debug("online model training")
    svm_problem = svm.svm_problem(train_labels, train_vectors)
    svm_model = svmutil.svm_train(svm_problem,
                                  self._gen_svm_parameter_string(num_pos, num_neg))
    if svm_model.l == 0:
        raise RuntimeError("SVM Model learning failed")

    #
    # Platt Scaling for probability rankings
    #
    self._log.debug("making test distance matrix")
    # Number of support vectors
    # Q: is this always the same as ``svm_model.l``?
    num_SVs = sum(svm_model.nSV[:svm_model.nr_class])
    # Support vector dimensionality
    dim_SVs = len(train_vectors[0])
    # initialize matrix they're going into
    svm_SVs = numpy.ndarray((num_SVs, dim_SVs), dtype=float)
    for i, nlist in enumerate(svm_model.SV[:svm_SVs.shape[0]]):
        svm_SVs[i, :] = [n.value for n in nlist[:len(train_vectors[0])]]
    # compute matrix of distances from support vectors to index elements
    # TODO: Optimize this step by caching SV distance vectors
    #   - It is known that SVs are vectors from the training data, so if the
    #     same descriptors are given to this function repeatedly (which is
    #     the case for IQR), this can be faster because we're only computing
    #     at most a few more distance vectors against our indexed descriptor
    #     matrix, and the rest have already been computed before.
    #   - At worst, we're effectively doing this call because each SV needs
    #     to have its distance vector computed.
    svm_test_k = compute_distance_matrix(svm_SVs, self._descr_matrix,
                                         histogram_intersection_distance,
                                         row_wise=True)

    self._log.debug("Platt scaling")
    # the actual platt scaling stuff
    weights = numpy.array(svm_model.get_sv_coef()).flatten()
    margins = numpy.dot(weights, svm_test_k)
    rho = svm_model.rho[0]
    probA = svm_model.probA[0]
    probB = svm_model.probB[0]
    #: :type: numpy.core.multiarray.ndarray
    probs = 1.0 / (1.0 + numpy.exp((margins - rho) * probA + probB))

    # Detect whether we need to flip probabilities
    # - Probability of input positive examples should have a high probability
    #   score among the generated probabilities of our index.
    # - If the positive example probabilities show to be in the lower 50%,
    #   flip the generated probabilities, since it's experimentally known
    #   that the SVM will change which index it uses to represent a
    #   particular class label occasionally, which influences the Platt
    #   scaling apparently.
    pos_vectors = numpy.array(train_vectors[:num_pos])
    pos_test_k = compute_distance_matrix(svm_SVs, pos_vectors,
                                         histogram_intersection_distance,
                                         row_wise=True)
    pos_margins = numpy.dot(weights, pos_test_k)
    #: :type: numpy.core.multiarray.ndarray
    pos_probs = 1.0 / (1.0 + numpy.exp((pos_margins - rho) * probA + probB))
    # Check if average positive probability is less than the average index
    # probability. If so, the platt scaling probably needs to be flipped.
    if (pos_probs.sum() / pos_probs.size) < (probs.sum() / probs.size):
        self._log.debug("inverting probabilities")
        probs = 1. - probs

    rank_pool = dict(zip(self._descr_cache, probs))
    return rank_pool
def rank(
    self,
    pos: Iterable[DescriptorElement],
    neg: Iterable[DescriptorElement]
) -> Dict[DescriptorElement, float]:
    """
    Rank the currently indexed elements given ``pos`` positive and ``neg``
    negative exemplar descriptor elements.

    :param pos: Iterable of positive exemplar DescriptorElement instances.
        This may be optional for some implementations.
    :type pos: collections.abc.Iterable[smqtk.representation.DescriptorElement]

    :param neg: Iterable of negative exemplar DescriptorElement instances.
        This may be optional for some implementations.
    :type neg: collections.abc.Iterable[smqtk.representation.DescriptorElement]

    :return: Map of indexed descriptor elements to a rank value between
        [0, 1] (inclusive) range, where a 1.0 means most relevant and 0.0
        meaning least relevant.
    :rtype: dict[smqtk.representation.DescriptorElement, float]
    """
    # Notes:
    # - Pos and neg exemplars may be in our index.

    #
    # SVM model training
    #
    # Copy pos descriptors into a set for repeated iteration
    pos_set: Set[DescriptorElement] = set(pos)

    # Creating training matrix and labels
    train_labels = []
    train_vectors: List = []
    num_pos = 0
    for desc_element in pos_set:
        train_labels.append(+1)
        if desc_element.vector() is not None:
            train_vectors.append(desc_element.vector().tolist())  # type: ignore
        else:
            raise AttributeError
        num_pos += 1
    LOG.debug(f"Positives given: {num_pos}")

    # When no negative examples are given, naively pick most distant
    # example in our dataset, using HI metric, for each positive example
    neg_autoselect = set()
    # Copy neg descriptors into a set for testing size.
    if not isinstance(neg, collections.abc.Sized):
        #: :type: set[smqtk.representation.DescriptorElement]
        neg = set(neg)
    if not neg:
        LOG.info(f"Auto-selecting negative examples. "
                 f"({self.autoneg_select_ratio} per positive)")
        # ``train_vectors`` only composed of positive examples at this
        # point.
        for p in pos_set:
            # Where d is the distance vector to descriptor elements in
            # cache.
            d = histogram_intersection_distance(p.vector(), self._descr_matrix)
            # Scan vector for max distance index
            # - Allow variable number of maximally distance descriptors to
            #   be picked per positive.
            m_set = {}  # track most distant neighbors
            m_val = -float('inf')  # track smallest distance of most distant neighbors
            for i in range(d.size):
                if d[i] > m_val:
                    m_set[d[i]] = i
                    if len(m_set) > self.autoneg_select_ratio:
                        if m_val in m_set:
                            del m_set[m_val]
                        m_val = min(m_set)
            for i in six.itervalues(m_set):
                neg_autoselect.add(self._descr_cache[i])
        # Remove any positive examples from auto-selected results
        neg_autoselect.difference_update(pos_set)
        LOG.debug(f"Auto-selected negative descriptors "
                  f"[{len(neg_autoselect)}]: {neg_autoselect}")

    num_neg = 0
    for n_iterable in (neg, neg_autoselect):
        for d in n_iterable:
            train_labels.append(-1)
            # noinspection PyTypeChecker
            train_vectors.append(d.vector().tolist())
            num_neg += 1

    if not num_pos:
        raise ValueError("No positive examples provided.")
    elif not num_neg:
        raise ValueError("No negative examples provided.")

    # Training SVM model
    LOG.debug("online model training")
    svm_problem = svm.svm_problem(train_labels, train_vectors)
    param_str = self._gen_svm_parameter_string(num_pos, num_neg)
    svm_param = svm.svm_parameter(param_str)
    svm_model = svmutil.svm_train(svm_problem, svm_param)
    if hasattr(svm_model, "param"):
        LOG.debug(f"SVM input parameters: {param_str}")
        LOG.debug(f"SVM model parsed parameters: {svm_model.param}")
        param = svm_model.param
        wgt_pairs = [(param.weight_label[i], param.weight[i])
                     for i in range(param.nr_weight)]
        wgt_str = " ".join(["%s: %s" % wgt for wgt in wgt_pairs])
        LOG.debug(f"SVM model parsed weight parameters: {wgt_str}")
    if svm_model.l == 0:  # noqa: E741
        raise RuntimeError("SVM Model learning failed")

    #
    # Platt Scaling for probability rankings
    #
    LOG.debug("making test distance matrix")
    # Number of support vectors
    # Q: is this always the same as ``svm_model.l``?
    num_SVs = sum(svm_model.nSV[:svm_model.nr_class])
    # Support vector dimensionality
    dim_SVs = len(train_vectors[0])
    # initialize matrix they're going into
    svm_SVs = numpy.ndarray((num_SVs, dim_SVs), dtype=float)
    for i, nlist in enumerate(svm_model.SV[:svm_SVs.shape[0]]):
        svm_SVs[i, :] = [n.value for n in nlist[:len(train_vectors[0])]]
    # compute matrix of distances from support vectors to index elements
    # TODO: Optimize this step by caching SV distance vectors
    #   - It is known that SVs are vectors from the training data, so if the
    #     same descriptors are given to this function repeatedly (which is
    #     the case for IQR), this can be faster because we're only computing
    #     at most a few more distance vectors against our indexed descriptor
    #     matrix, and the rest have already been computed before.
    #   - At worst, we're effectively doing this call because each SV needs
    #     to have its distance vector computed.
    svm_test_k = compute_distance_matrix(svm_SVs, self._descr_matrix,
                                         histogram_intersection_distance,
                                         row_wise=True)

    # TODO(john.moeller): None of the Platt scaling should be necessary.
    #   svmutil.svm_predict will apply the Platt scaling directly. See
    #   https://github.com/cjlin1/libsvm/tree/master/python
    LOG.debug("Platt scaling")
    # the actual platt scaling stuff
    weights = numpy.array(svm_model.get_sv_coef()).flatten()
    margins = numpy.dot(weights, svm_test_k)
    rho = svm_model.rho[0]
    probA = svm_model.probA[0]
    probB = svm_model.probB[0]
    #: :type: numpy.core.multiarray.ndarray
    probs = 1.0 / (1.0 + numpy.exp((margins - rho) * probA + probB))

    # Detect whether we need to flip probabilities
    # - Probability of input positive examples should have a high probability
    #   score among the generated probabilities of our index.
    # - If the positive example probabilities show to be in the lower 50%,
    #   flip the generated probabilities, since it's experimentally known
    #   that the SVM will change which index it uses to represent a
    #   particular class label occasionally, which influences the Platt
    #   scaling apparently.
    pos_vectors = numpy.array(train_vectors[:num_pos])
    pos_test_k = compute_distance_matrix(svm_SVs, pos_vectors,
                                         histogram_intersection_distance,
                                         row_wise=True)
    pos_margins = numpy.dot(weights, pos_test_k)
    #: :type: numpy.core.multiarray.ndarray
    pos_probs = 1.0 / (1.0 + numpy.exp((pos_margins - rho) * probA + probB))
    # Check if average positive probability is less than the average index
    # probability. If so, the platt scaling probably needs to be flipped.
    if (pos_probs.sum() / pos_probs.size) < (probs.sum() / probs.size):
        LOG.debug("inverting probabilities")
        probs = 1. - probs

    rank_pool = dict(zip(self._descr_cache, probs))
    return rank_pool
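# Following the TODO above, a hedged sketch of the simpler route that lets
# libsvm apply Platt scaling itself (the helper name and dummy labels are
# illustrative, not from the source). It requires the model to have been
# trained with probability estimates enabled (e.g. '-b 1' in the parameter
# string).
def platt_probs_via_libsvm(svm_model, vectors):
    import svmutil
    # svm_predict only uses the labels to report accuracy, so dummies suffice.
    dummy_y = [0] * len(vectors)
    p_labels, p_acc, p_vals = svmutil.svm_predict(dummy_y, vectors,
                                                  svm_model, '-b 1')
    # Each row of p_vals holds per-class probabilities, ordered consistently
    # with svm_model.get_labels().
    return p_labels, p_vals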
def train_SVR_Linear(self, labels, vectors, verbose, C_range, callback=None):
    '''Private use only'''
    # combine the labels and vectors into one set.
    data = []
    for i in range(len(labels)):
        data.append([labels[i], vectors[i]])

    # shuffle the data
    rng = random.Random()
    if self.random_seed is not None:
        rng.seed(self.random_seed)
    rng.shuffle(data)

    # partition into validation and training
    if type(self.validation_size) == float and 0.0 < self.validation_size < 1.0:
        training_cutoff = int(len(data) * (1.0 - self.validation_size))
    elif type(self.validation_size) == int and self.validation_size < len(labels):
        training_cutoff = len(labels) - self.validation_size
    else:
        raise NotImplementedError("Cannot determine validation set from %s" % self.validation_size)

    if verbose: print "Training Cutoff:", len(labels), training_cutoff
    training_data = data[:training_cutoff]
    validation_data = data[training_cutoff:]

    tmp_labels = []
    tmp_vectors = []
    for each in training_data:
        tmp_labels.append(each[0])
        tmp_vectors.append(each[1])

    prob = svm.svm_problem(tmp_labels, tmp_vectors)

    training_info = []
    training_svm = []
    training_table = Table()
    self.training_table = training_table
    i = 0
    for C in C_range:
        param = svm.svm_parameter(svm_type=self.svm_type, kernel_type=svm.LINEAR,
                                  C=C, p=self.epsilon, nu=self.nu)
        test_svm = svm.svm_model(prob, param)

        # score this C on the held-out validation set by mean squared error
        mse = 0.0
        total = len(validation_data)
        for label, vector in validation_data:
            pred = test_svm.predict(vector)
            error = label - pred
            mse += error * error
        mse = mse / total

        training_svm.append(test_svm)
        training_info.append([C, mse])
        training_table.setElement(i, 'C', C)
        training_table.setElement(i, 'mse', mse)
        i += 1
        if callback is not None:
            callback(int(100 * float(i) / len(C_range)))

    if verbose: print
    if verbose: print "------------------------------"
    if verbose: print " Tuning Information:"
    if verbose: print "      C        error"
    if verbose: print "------------------------------"
    best = training_info[0]
    best_svm = training_svm[0]
    for i in range(len(training_info)):
        each = training_info[i]
        if verbose: print "  %8.3e  %0.8f" % (each[0], each[1])
        if best[-1] > each[-1]:
            best = each
            best_svm = training_svm[i]
    if verbose: print "------------------------------"
    if verbose: print
    if verbose: print "------------------------------"
    if verbose: print " Best Tuning:"
    if verbose: print "      C        error"
    if verbose: print "------------------------------"
    if verbose: print "  %8.3e  %0.8f" % (best[0], best[1])
    if verbose: print "------------------------------"
    if verbose: print

    self.training_info = training_info
    self.C = best[0]
    self.error = best[1]
    self.svm = best_svm
def main(args): paramsfn = args[0] exec(open(paramsfn,'r').read()) if len(args) > 1: gammarange = [float(args[1])] crange = [float(args[2])] output,input,fieldnames,fold_inds = load_data(datafilename,use_specific_fold_inds) sep_validation = False if separate_validation_set != '': output_valid,input_valid,fieldnames,fold_inds_valid = load_data(separate_validation_set,use_specific_fold_inds) sep_validation = True fold_start = [-1] if sep_validation: fold_start_valid = [-1] if use_specific_fold_inds: unique_fold_ids = unique(fold_inds) row_inds = [] outputcopy = [] inputcopy = zeros([size(input,0),size(input,1)],dtype='float64') fold_start = [0] curind = 0 for ind in unique_fold_ids: row_inds = [i for i in xrange(len(fold_inds)) if fold_inds[i] == ind] inputcopy[curind:curind+len(row_inds),:] = input[row_inds,:] outputcopy.extend([output[i] for i in row_inds]) curind += len(row_inds) fold_start.append(fold_start[-1]+len(row_inds)) input = inputcopy output = outputcopy nf = len(fold_start)-1 if sep_validation: unique_fold_ids_valid = unique(fold_inds_valid) row_inds = [] outputcopy = [] inputcopy = zeros([size(input_valid,0),size(input_valid,1)],dtype='float64') fold_start_valid = [0] curind = 0 for ind in unique_fold_ids_valid: row_inds = [i for i in xrange(len(fold_inds_valid)) if fold_inds_valid[i] == ind] inputcopy[curind:curind+len(row_inds),:] = input_valid[row_inds,:] outputcopy.extend([output_valid[i] for i in row_inds]) curind += len(row_inds) fold_start_valid.append(fold_start_valid[-1]+len(row_inds)) input_valid = inputcopy output_valid = outputcopy nf = len(fold_start_valid)-1 if binarizeoutput: output,boundary = binarize_output(output,binary_threshold,binary_boundary_type) if testdatafilename != '': output_test,input_test,fieldnames,fold_inds_test = load_data(testdatafilename,False) if binarizeoutput: output_test = [1 if x > boundary else -1 for x in output_test] if doscale: maxinput = input.max(0); mininput = input.min(0); input = (input-mininput)/(maxinput-mininput) if testdatafilename != '': input_test = (input_test-mininput)/(maxinput-mininput) if savemodel: save_scale_data(datafilename+'_scales.dat',maxinput,mininput) if sep_validation: input_valid = (input_valid-mininput)/(maxinput-mininput) if donormalize: means = input.mean(0) stds = sqrt(input.var(0)) input = (input-means)/stds if testdatafilename != '': input_test = (input_test-means)/stds if savemodel: save_zscore_data(datafilename+'_meansstdevs.dat',means,stds) if sep_validation: input_valid = (input_valid-means)/stds if numcpus == 'auto': p = Pool() else: p = Pool(numcpus) if choose_specific_features: if choose_specific_features_increasing: specific_selected_features = [specific_selected_features[:i] for i in xrange(2,len(specific_selected_features),2)] for specific_selected_choice in specific_selected_features: inputfiltered = input[:,specific_selected_choice] if sep_validation: inputfiltered_valid = input_valid[:,specific_selected_choice] if dopca: coeff,temp,latent = princomp(inputfiltered) if savemodel: save_pca_coeffs(datafilename+'_pcacoeffs.dat',coeff,mean(inputfiltered.T,axis=1)) inputfiltered = temp if sep_validation: return with Timer(): if sep_validation: if use_specific_fold_inds: results = mygrid.grid_classify_sepvalid (crange,gammarange,output,[list(x) for x in inputfiltered],output_valid,[list(x) for x in inputfiltered_valid],nf,useprob,timeout,p,fold_start,fold_start_valid) else: results = mygrid.grid_classify_sepvalid (crange,gammarange,output,[list(x) for x in inputfiltered],output_valid,[list(x) for 
                x in inputfiltered_valid], nf, useprob, timeout, p)
            else:
                if use_specific_fold_inds:
                    results = mygrid.grid_classify(crange, gammarange, output,
                                                   [list(x) for x in inputfiltered],
                                                   nf, useprob, timeout, p, fold_start)
                else:
                    results = mygrid.grid_classify(crange, gammarange, output,
                                                   [list(x) for x in inputfiltered],
                                                   nf, useprob, timeout, p)
            param = svm.svm_parameter('-c %g -g %g -b %d' % (results[-2], results[-1], int(useprob)))
            prob = svm.svm_problem(output, [list(x) for x in inputfiltered])
            fold_start_p = (c_int * len(fold_start))()
            for i in xrange(len(fold_start)):
                fold_start_p[i] = fold_start[i]
            if posclass == 'auto':
                posclass = output[0]
            if sep_validation:
                prob_valid = svm.svm_problem(output_valid, [list(x) for x in inputfiltered_valid])
                testlength = prob_valid.l
                fold_start_p_valid = (c_int * len(fold_start_valid))()
                for i in xrange(len(fold_start_valid)):
                    fold_start_p_valid[i] = fold_start_valid[i]
            else:
                testlength = prob.l
            target = (c_double * testlength)()
            # [maxauc, maxoptacc, maxphi, minfpfnratio, maxf1, optbias, optc, optgamma]
            if sep_validation:
                libsvm.svm_cross_validation_sepsets(prob, prob_valid, fold_start_p,
                                                    fold_start_p_valid, param, nf, target)
            else:
                libsvm.svm_cross_validation(prob, fold_start_p, param, nf, target)
            if sep_validation:
                ys = prob_valid.y[:testlength]
            else:
                ys = prob.y[:prob.l]
            db = array([[ys[i], target[i]] for i in range(testlength)])
            neg = len([x for x in ys if x != posclass])
            pos = testlength - neg
            if len(specific_selected_features) == 1 or True:  # 'or True' left in the source: branch is always taken
                pdfpages = PdfPages('%s_train.pdf' % (outputlog))
                # auc, topacc, optaccbias, topphi, optphibias, top_tps_bias, top_fps = \
                #     mygrid.calc_AUC(db, neg, pos, posclass, useprob, [], True, pdfpages,
                #                     'Optimal Cross-Validation ROC curve')
                topacc, topphi, minfpfnratio, topf1, auc, optbias = \
                    mygrid.optimize_results(db, neg, pos, posclass, 'F1')
                print [topacc, results[1]]
                print [topphi, results[2]]
                print [topf1, results[4]]
                print [auc, results[0]]
                pdfpages.close()
                # print target
                if sep_validation:
                    ACC, PHI, confusionmatrix = mygrid.evaluations_classify(output_valid, target,
                                                                            posclass, results[-3])
                else:
                    ACC, PHI, confusionmatrix = mygrid.evaluations_classify(output, target,
                                                                            posclass, results[-3])
                if posclass == 1:
                    negclass = 0
                else:
                    negclass = 1
                numpred_pos = confusionmatrix[0, 0] + confusionmatrix[1, 0]
                numpred_neg = confusionmatrix[0, 1] + confusionmatrix[1, 1]
                N = pos + neg
                probchance = (numpred_pos * pos + numpred_neg * neg) * 1.0 / (N * N)
                kappa = (topacc - probchance) * 1.0 / (1 - probchance)
                print 'Train optimized accuracy = %g' % (topacc)
                print 'Train optimized Phi statistic = %g' % (topphi)
                print 'Train optimized kappa = %g' % (kappa)
                print 'Train optimized F1 score = %f' % (topf1)
                print 'Train optimized TP/RECALL = %g, FP = %g, PRECISION = %g' % (
                    confusionmatrix[0, 0] / pos, confusionmatrix[1, 0] / neg,
                    confusionmatrix[0, 0] / (confusionmatrix[0, 0] + confusionmatrix[1, 0]))
                print '================================'
                print '||   ||%6d |%6d |       ||' % (posclass, negclass)
                print '================================'
                print '||%3d||%6g |%6g |%6g ||' % (posclass, confusionmatrix[0, 0], confusionmatrix[0, 1], pos)
                print '||%3d||%6g |%6g |%6g ||' % (negclass, confusionmatrix[1, 0], confusionmatrix[1, 1], neg)
                print '||----------------------------||'
                print '||   ||%6g |%6g |%6g ||' % (confusionmatrix[0, 0] + confusionmatrix[1, 0],
                                                   confusionmatrix[0, 1] + confusionmatrix[1, 1], pos + neg)
                print '================================'
            else:
                auc, topacc, optaccbias, topphi, optphibias, top_tps_bias, top_fps = \
                    mygrid.calc_AUC(db, neg, pos, posclass, useprob, [], False, 0,
                                    'Optimal Cross-Validation ROC curve')
            print 'Optimal gamma = %g\nOptimal c = %g\nOptimal Bias = %g' % (results[-1], results[-2], results[-3])
            print 'Top CV results: AUC = %g, OPTIMIZED ACC = %g, OPTIMIZED PHI = %g' % (auc, topacc, topphi)
            if outputlog != '':
                fout = open(outputlog, 'a')
                print >> fout, '========================='
                print >> fout, datafilename
                print >> fout, doscale, donormalize, dopca, '(scale/norm/pca)'
                print >> fout, crange[0], crange[-1], gammarange[0], gammarange[-1], '(cs,gammas)'
                print >> fout, use_specific_fold_inds, nf, '(use specific folds, numfold)'
                print >> fout, 'SPECIFIC FIELDS:'
                print >> fout, specific_selected_choice
                if fieldnames != []:
                    for i in specific_selected_choice:
                        print >> fout, fieldnames[i],
                    print >> fout
                print >> fout, 'train: '
                print >> fout, '    AUC=%g,ACC=%g,kappa=%g,phi=%g,f1=%g (g=%g,c=%g,bias=%g)' % (
                    auc, topacc, kappa, topphi, topf1, results[-1], results[-2], results[-3])
                print >> fout, '    ||%3d||%6g |%6g |%6g ||' % (posclass, confusionmatrix[0, 0],
                                                                confusionmatrix[0, 1], pos)
                print >> fout, '    ||%3d||%6g |%6g |%6g ||' % (negclass, confusionmatrix[1, 0],
                                                                confusionmatrix[1, 1], neg)
                fout.close()
            if outputpredictions:
                fout = open(predictionslog, 'w')
                if sep_validation:
                    for ind in xrange(len(output_valid)):
                        label = output_valid[ind]
                        value = target[ind]
                        oneinputrow = input_valid[ind, :]
                        print >> fout, value, label,
                        for j in xrange(len(oneinputrow)):
                            print >> fout, '%d:%f' % (j + 1, oneinputrow[j]),
                        print >> fout
                else:
                    for ind in xrange(len(output)):
                        label = output[ind]
                        value = target[ind]
                        oneinputrow = input[ind, :]
                        print >> fout, value, label,
                        for j in xrange(len(oneinputrow)):
                            print >> fout, '%d:%f' % (j + 1, oneinputrow[j]),
                        print >> fout
                fout.close()
            del target
            if savemodel:
                param = ('-c %g -g %g -b %d' % (results[-2], results[-1], int(useprob)))
                m = svm_train(output, [list(x) for x in inputfiltered], param)
                svm_save_model(datafilename + '.model', m)
            if testdatafilename != '':
                inputfiltered_test = input_test[:, specific_selected_choice]
                if dopca:
                    M = (inputfiltered_test - mean(inputfiltered_test.T, axis=1)).T  # subtract the mean (along columns)
                    inputfiltered_test = dot(coeff.T, M).T  # projection of the data in the new space
                param = ('-c %g -g %g -b %d' % (results[-2], results[-1], int(useprob)))
                m = svm_train(output, [list(x) for x in inputfiltered], param)
                pred_labels, (ACC, MSE, SCC), pred_values = svm_predict(
                    output_test, [list(x) for x in inputfiltered_test], m, '-b %d' % (int(useprob)))
                ACC, PHI, confusionmatrix = mygrid.evaluations_classify(
                    output_test, [x[0] for x in pred_values], posclass, results[-3])
                db = array([[output_test[i], pred_values[i][0]] for i in range(len(output_test))])
                neg = len([x for x in output_test if x != posclass])
                pos = len(output_test) - neg
                auctest = 0
                if neg != 0 and pos != 0:
                    auctest, topacctest, optaccbias, topphitest, optphibias, top_tps_bias, top_fps = \
                        mygrid.calc_AUC(db, neg, pos, posclass, useprob, [], False, pdfpages,
                                        'Test ROC curve', results[-3])
                numpred_pos = confusionmatrix[0, 0] + confusionmatrix[1, 0]
                numpred_neg = confusionmatrix[0, 1] + confusionmatrix[1, 1]
                N = pos + neg
                probchance = (numpred_pos * pos + numpred_neg * neg) * 1.0 / (N * N)
                testkappa = (ACC / 100.0 - probchance) * 1.0 / (1 - probchance)
                print 'Test optimized accuracy = %g' % (ACC)
                print 'Test optimized Phi statistic = %g' % (PHI)
                print 'Test optimized kappa = %g' % (testkappa)
                print '================================'
                print '||   ||%6d |%6d |       ||' % (m.get_labels()[0], m.get_labels()[1])
                print '================================'
                print '||%3d||%6g |%6g |%6g ||' % (m.get_labels()[0], confusionmatrix[0, 0],
                                                   confusionmatrix[0, 1], pos)
                print '||%3d||%6g |%6g |%6g ||' % (m.get_labels()[1], confusionmatrix[1, 0],
                                                   confusionmatrix[1, 1], neg)
                print '||----------------------------||'
                print '||   ||%6g |%6g |%6g ||' % (confusionmatrix[0, 0] + confusionmatrix[1, 0],
                                                   confusionmatrix[0, 1] + confusionmatrix[1, 1], pos + neg)
                print '================================'
                if outputlog != '':
                    fout = open(outputlog, 'a')
                    print >> fout, 'test: '
                    print >> fout, '    ACC=%g,AUC=%g,kappa=%g,phi=%g' % (ACC, auctest, testkappa, PHI)
                    print >> fout, '    ||%3d||%6g |%6g |%6g ||' % (m.get_labels()[0], confusionmatrix[0, 0],
                                                                    confusionmatrix[0, 1], pos)
                    print >> fout, '    ||%3d||%6g |%6g |%6g ||' % (m.get_labels()[1], confusionmatrix[1, 0],
                                                                    confusionmatrix[1, 1], neg)
                    fout.close()
    else:
        with Timer():
            if use_specific_fold_inds:
                results = mygrid.grid_classify(crange, gammarange, output,
                                               [list(x) for x in input],
                                               nf, useprob, timeout, p, fold_start)
            else:
                results = mygrid.grid_classify(crange, gammarange, output,
                                               [list(x) for x in input],
                                               nf, useprob, timeout, p)
        param = svm.svm_parameter('-c %g -g %g -b %d' % (results[-2], results[-1], int(useprob)))
        prob = svm.svm_problem(output, [list(x) for x in input])
        target = (c_double * prob.l)()
        fold_start_p = (c_int * len(fold_start))()
        for i in xrange(len(fold_start)):
            fold_start_p[i] = fold_start[i]
        if posclass == 'auto':
            posclass = output[0]
        libsvm.svm_cross_validation(prob, fold_start_p, param, nf, target)
        ys = prob.y[:prob.l]
        db = [[ys[i], target[i]] for i in range(prob.l)]
        db = array(db)
        neg = len([x for x in ys if x != posclass])
        pos = prob.l - neg
        pdfpages = PdfPages('%s_train.pdf' % (outputlog))
        auc, topacc, optaccbias, topphi, optphibias, top_tps_bias, top_fps = \
            mygrid.calc_AUC(db, neg, pos, posclass, useprob, [], True, pdfpages,
                            'Optimal Cross-Validation ROC curve')
        pdfpages.close()
        ACC, PHI, confusionmatrix = mygrid.evaluations_classify(output, target, posclass, results[-3])
        if posclass == 1:
            negclass = 0
        else:
            negclass = 1
        print 'Train optimized accuracy = %g' % (topacc)
        print 'Train optimized phi statistic = %g' % (topphi)
        print 'TP/RECALL = %g, FP = %g, PRECISION = %g' % (
            confusionmatrix[0, 0] / pos, confusionmatrix[1, 0] / neg,
            confusionmatrix[0, 0] / (confusionmatrix[0, 0] + confusionmatrix[1, 0]))
        print '================================'
        print '||   ||%6d |%6d |       ||' % (posclass, negclass)
        print '================================'
        print '||%3d||%6g |%6g |%6g ||' % (posclass, confusionmatrix[0, 0], confusionmatrix[0, 1], pos)
        print '||%3d||%6g |%6g |%6g ||' % (negclass, confusionmatrix[1, 0], confusionmatrix[1, 1], neg)
        print '||----------------------------||'
        print '||   ||%6g |%6g |%6g ||' % (confusionmatrix[0, 0] + confusionmatrix[1, 0],
                                           confusionmatrix[0, 1] + confusionmatrix[1, 1], pos + neg)
        print '================================'
        if outputpredictions:
            fout = open(predictionslog, 'w')
            for ind in xrange(len(output)):
                label = output[ind]
                value = target[ind]
                oneinputrow = input[ind, :]
                print >> fout, value, label,
                for j in xrange(len(oneinputrow)):
                    print >> fout, '%d:%f' % (j + 1, oneinputrow[j]),
                print >> fout
            fout.close()
        del target
        print 'Optimal gamma = %g\nOptimal c = %g\nOptimal Bias = %g' % (results[-1], results[-2], optphibias)
        print 'Top CV results: AUC = %g, OPTIMIZED ACC = %g, OPTIMIZED PHI = %g' % (auc, topacc, topphi)
        if savemodel:
            param = ('-c %g -g %g -b %d' % (results[-2], results[-1], int(useprob)))
            m = svm_train(output, [list(x) for x in input], param)
            svm_save_model(datafilename + '.model', m)
        if testdatafilename != '':
            param = ('-c %g -g %g -b %d' % (results[-2], results[-1], int(useprob)))
            m = svm_train(output, [list(x) for x in input], param)
            pred_labels, (ACC, MSE, SCC), pred_values = svm_predict(
                output_test, [list(x) for x in input_test], m, '-b %d' % (int(useprob)))
            ACC, PHI, confusionmatrix = mygrid.evaluations_classify(
                output_test, [x[0] for x in pred_values], posclass, results[-3])
            db = array([[output_test[i], pred_values[i][0]] for i in range(len(output_test))])
            neg = len([x for x in output_test if x != posclass])
            pos = len(output_test) - neg
            pdfpages = PdfPages('%s_test.pdf' % (outputlog))
            auctest = 0
            if neg != 0 and pos != 0:
                auctest, topacctest, optaccbias, topphitest, optphibias, top_tps_bias, top_fps = \
                    mygrid.calc_AUC(db, neg, pos, posclass, useprob, [], True, pdfpages,
                                    'Test ROC curve', results[-3])
            pdfpages.close()
            print 'Test accuracy = %g' % (ACC)
            print 'Test Phi statistic = %g' % (PHI)
            print 'TP/RECALL = %g, FP = %g, PRECISION = %g' % (
                confusionmatrix[0, 0] / pos, confusionmatrix[1, 0] / neg,
                confusionmatrix[0, 0] / (confusionmatrix[0, 0] + confusionmatrix[1, 0]))
            print '================================'
            print '||   ||%6d |%6d |       ||' % (m.get_labels()[0], m.get_labels()[1])
            print '================================'
            print '||%3d||%6g |%6g |%6g ||' % (m.get_labels()[0], confusionmatrix[0, 0],
                                               confusionmatrix[0, 1], pos)
            print '||%3d||%6g |%6g |%6g ||' % (m.get_labels()[1], confusionmatrix[1, 0],
                                               confusionmatrix[1, 1], neg)
            print '||----------------------------||'
            print '||   ||%6g |%6g |%6g ||' % (confusionmatrix[0, 0] + confusionmatrix[1, 0],
                                               confusionmatrix[0, 1] + confusionmatrix[1, 1], pos + neg)
            print '================================'
        if outputlog != '':
            fout = open(outputlog, 'a')
            print >> fout, '========================='
            print >> fout, fieldnames
            print >> fout, 'train: AUC=%g,ACC=%g,PHI=%g (g=%g,c=%g,bias=%g)' % (
                auc, topacc, topphi, results[-1], results[-2], results[-3])
            if testdatafilename != '':
                print >> fout, 'test: ACC=%g,AUC=%g,PHI=%g' % (ACC, auctest, PHI)
            fout.close()
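# A minimal sketch of the stock libsvm cross-validation pattern the script
# above builds on. The fold-aware calls used there (svm_cross_validation with a
# fold_start pointer, svm_cross_validation_sepsets) come from a locally patched
# libsvm; stock libsvm only exposes svm_cross_validation(prob, param, nr_fold,
# target). The toy data below is purely illustrative.
from ctypes import c_double
import svm
from svm import libsvm

toy_labels = [1, 1, -1, -1]
toy_vectors = [[0.0, 1.0], [0.2, 0.9], [1.0, 0.0], [0.9, 0.1]]
toy_prob = svm.svm_problem(toy_labels, toy_vectors)
toy_param = svm.svm_parameter('-c 1 -g 0.5 -q')
cv_target = (c_double * toy_prob.l)()  # per-sample CV predictions land here
libsvm.svm_cross_validation(toy_prob, toy_param, 2, cv_target)
cv_acc = sum(1 for i in range(toy_prob.l)
             if toy_labels[i] == cv_target[i]) * 1.0 / toy_prob.l
print 'CV accuracy = %g' % cv_acc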
def train(self, class_examples=None, **kwds):
    """
    Train the supervised classifier model.

    If a model is already loaded, we will raise an exception in order to
    prevent accidental overwrite.

    If the same label is provided to both ``class_examples`` and ``kwds``,
    the examples given to the reference in ``kwds`` will prevail.

    :param class_examples: Dictionary mapping class labels to iterables of
        DescriptorElement training examples.
    :type class_examples: dict[collections.Hashable,
        collections.Iterable[smqtk.representation.DescriptorElement]]

    :param kwds: Keyword assignment of labels to iterables of
        DescriptorElement training examples.
    :type kwds: dict[str,
        collections.Iterable[smqtk.representation.DescriptorElement]]

    :raises ValueError: There were no class examples provided.
    :raises ValueError: Less than 2 classes were given.
    :raises RuntimeError: A model already exists in this instance. Following
        through with training would overwrite this model. Throwing an
        exception for information protection.

    """
    class_examples = \
        super(LibSvmClassifier, self).train(class_examples, **kwds)

    # Offset from 0 for positive class labels to use
    # - not using label of 0 because we think libSVM wants positive labels
    CLASS_LABEL_OFFSET = 1

    # Stuff for debug reporting
    etm_ri = None
    param_debug = {'-q': ''}
    if self._log.getEffectiveLevel() <= logging.DEBUG:
        etm_ri = 1.0
        param_debug = {}

    # Form libSVM problem input values
    self._log.debug("Formatting problem input")
    train_labels = []
    train_vectors = []
    train_group_sizes = []  # number of examples per class
    self.svm_label_map = {}
    # Making SVM label assignment deterministic to alphabetic order
    for i, l in enumerate(sorted(class_examples), CLASS_LABEL_OFFSET):
        # Map integer SVM label to semantic label
        self.svm_label_map[i] = l
        self._log.debug('-- class %d (%s)', i, l)
        # requires a sequence, so making the iterable ``g`` a tuple
        g = class_examples[l]
        if not isinstance(g, collections.Sequence):
            g = tuple(g)
        train_group_sizes.append(float(len(g)))
        x = elements_to_matrix(g, report_interval=etm_ri)
        x = self._norm_vector(x)
        train_labels.extend([i] * x.shape[0])
        train_vectors.extend(x.tolist())
        del g, x

    assert len(train_labels) == len(train_vectors), \
        "Count mismatch between parallel labels and descriptor vectors " \
        "being sent to libSVM (%d != %d)" \
        % (len(train_labels), len(train_vectors))

    self._log.debug("Forming train params")
    #: :type: dict
    params = deepcopy(self.train_params)
    params.update(param_debug)
    # Calculating class weights for C-SVC SVM
    if '-s' not in params or int(params['-s']) == 0:
        total_examples = sum(train_group_sizes)
        for i, n in enumerate(train_group_sizes, CLASS_LABEL_OFFSET):
            # weight is the ratio of the number of other-class examples to
            # the number of examples in this class
            other_class_examples = total_examples - n
            w = max(1.0, other_class_examples / float(n))
            params['-w' + str(i)] = w
            self._log.debug("-- class '%s' weight: %s",
                            self.svm_label_map[i], w)

    self._log.debug("Making parameters obj")
    svm_params = svmutil.svm_parameter(self._gen_param_string(params))
    self._log.debug("Creating SVM problem")
    svm_problem = svm.svm_problem(train_labels, train_vectors)
    self._log.debug("Training SVM model")
    self.svm_model = svmutil.svm_train(svm_problem, svm_params)
    self._log.debug("Training SVM model -- Done")

    if self.svm_label_map_fp:
        self._log.debug("saving file -- labels -- %s", self.svm_label_map_fp)
        with open(self.svm_label_map_fp, 'wb') as f:
            cPickle.dump(self.svm_label_map, f, -1)
    if self.svm_model_fp:
        self._log.debug("saving file -- model -- %s", self.svm_model_fp)
        svmutil.svm_save_model(self.svm_model_fp, self.svm_model)
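# For reference, a hedged illustration of how a params dict like the one built
# above could flatten into a libsvm option string. _gen_param_string is
# SMQTK-internal, so this stand-in helper is an assumption for illustration,
# not the library's API.
def _example_gen_param_string(params):
    # e.g. {'-s': 0, '-w1': 2.0} -> '-s 0 -w1 2.0'
    return ' '.join('%s %s' % (k, v) for k, v in params.items()).strip()

print _example_gen_param_string({'-w1': 3.0, '-w2': 1.0})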
def train(self, class_examples=None, **kwds):
    """
    Train the supervised classifier model.

    If a model is already loaded, we will raise an exception in order to
    prevent accidental overwrite.

    If the same label is provided to both ``class_examples`` and ``kwds``,
    the examples given to the reference in ``kwds`` will prevail.

    :param class_examples: Dictionary mapping class labels to iterables of
        DescriptorElement training examples.
    :type class_examples: dict[collections.Hashable,
        collections.Iterable[smqtk.representation.DescriptorElement]]

    :param kwds: Keyword assignment of labels to iterables of
        DescriptorElement training examples.
    :type kwds: dict[str,
        collections.Iterable[smqtk.representation.DescriptorElement]]

    :raises ValueError: There were no class examples provided.
    :raises ValueError: Less than 2 classes were given.
    :raises RuntimeError: A model already exists in this instance. Following
        through with training would overwrite this model. Throwing an
        exception for information protection.

    """
    class_examples = \
        super(LibSvmClassifier, self).train(class_examples, **kwds)

    # Offset from 0 for positive class labels to use
    # - not using label of 0 because we think libSVM wants positive labels
    CLASS_LABEL_OFFSET = 1

    # Stuff for debug reporting
    etm_ri = None
    param_debug = {'-q': ''}
    if self._log.getEffectiveLevel() <= logging.DEBUG:
        etm_ri = 1.0
        param_debug = {}

    # Form libSVM problem input values
    self._log.debug("Formatting problem input")
    train_labels = []
    train_vectors = []
    train_group_sizes = []  # number of examples per class
    self.svm_label_map = {}
    # Making SVM label assignment deterministic to alphabetic order
    for i, l in enumerate(sorted(class_examples), CLASS_LABEL_OFFSET):
        # Map integer SVM label to semantic label
        self.svm_label_map[i] = l
        self._log.debug('-- class %d (%s)', i, l)
        # requires a sequence, so making the iterable ``g`` a tuple
        g = class_examples[l]
        if not isinstance(g, collections.Sequence):
            self._log.debug('   (expanding iterable into sequence)')
            g = tuple(g)
        train_group_sizes.append(float(len(g)))
        x = elements_to_matrix(g, report_interval=etm_ri)
        x = self._norm_vector(x)
        train_labels.extend([i] * x.shape[0])
        train_vectors.extend(x.tolist())
        del g, x

    assert len(train_labels) == len(train_vectors), \
        "Count mismatch between parallel labels and descriptor vectors " \
        "being sent to libSVM (%d != %d)" \
        % (len(train_labels), len(train_vectors))

    self._log.debug("Forming train params")
    #: :type: dict
    params = deepcopy(self.train_params)
    params.update(param_debug)
    # Calculating class weights for C-SVC SVM
    if '-s' not in params or int(params['-s']) == 0:
        total_examples = sum(train_group_sizes)
        for i, n in enumerate(train_group_sizes, CLASS_LABEL_OFFSET):
            # weight is the ratio of the number of other-class examples to
            # the number of examples in this class
            other_class_examples = total_examples - n
            w = max(1.0, other_class_examples / float(n))
            params['-w' + str(i)] = w
            self._log.debug("-- class '%s' weight: %s",
                            self.svm_label_map[i], w)

    self._log.debug("Making parameters obj")
    svm_params = svmutil.svm_parameter(self._gen_param_string(params))
    self._log.debug("Creating SVM problem")
    svm_problem = svm.svm_problem(train_labels, train_vectors)
    del train_vectors
    self._log.debug("Training SVM model")
    self.svm_model = svmutil.svm_train(svm_problem, svm_params)
    self._log.debug("Training SVM model -- Done")

    if self.svm_label_map_fp:
        self._log.debug("saving file -- labels -- %s", self.svm_label_map_fp)
        with open(self.svm_label_map_fp, 'wb') as f:
            cPickle.dump(self.svm_label_map, f, -1)
    if self.svm_model_fp:
        self._log.debug("saving file -- model -- %s", self.svm_model_fp)
        svmutil.svm_save_model(self.svm_model_fp, self.svm_model)
'''plot data'''
pyplot.plot(data_pos[:, 0], data_pos[:, 1], 'r.')
pyplot.plot(data_neg[:, 0], data_neg[:, 1], 'b.')
pyplot.xlim(-2.5, 2.5)
pyplot.ylim(-2, 2)

'''plot the margin circles'''
angles_circle = [i * pi / 180 for i in range(0, 360)]  # convert each angle i from degrees to radians
# angles_circle = [line / np.pi for line in np.arange(0, 360)]  # <=>
# angles_circle = [line / 180 * pi for line in np.arange(0, 360)]
x = cos(angles_circle)
y = sin(angles_circle)
pyplot.plot(x, y, 'r')
pyplot.plot(2 * x, 2 * y, 'b')
pyplot.show()

'''build a vec for classification'''
data = np.append(data_pos, data_neg, axis=0)  # merge the 2 ndarrays into 1 (axis=0!!!)
# data = [data_pos, data_neg]
data = data.tolist()  # transform the ndarray into a list
data_label = ones((pos_dot_num + neg_dot_num, 1))
data_label[11:20] = -1  # rows assumed to hold the negative examples
data_label = data_label.ravel().tolist()  # data & data_label must be lists
prob = svm_problem(data_label, data)
param = svm_parameter('-c 100 -g 4')
# print(param)
model = svm_train(prob, param)
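# The snippet above assumes data_pos, data_neg, pos_dot_num and neg_dot_num are
# defined earlier. A minimal, assumed reconstruction consistent with the plot:
# positives near the unit circle, negatives near a circle of radius 2.
import numpy as np

pos_dot_num, neg_dot_num = 10, 10
theta_p = np.random.uniform(0, 2 * np.pi, pos_dot_num)
theta_n = np.random.uniform(0, 2 * np.pi, neg_dot_num)
data_pos = np.c_[np.cos(theta_p), np.sin(theta_p)] \
    + np.random.normal(0, 0.1, (pos_dot_num, 2))
data_neg = 2 * np.c_[np.cos(theta_n), np.sin(theta_n)] \
    + np.random.normal(0, 0.1, (neg_dot_num, 2))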
        # split the line on commas, last element is truth label
        values = line.rstrip().split(',')
        arr = []
        # convert elements to floats
        for i in range(0, 4):
            arr.append(float(values[i]))
        x.append(arr)
        if values[4] not in names_to_integers:
            raise TruthLabelMissingInDictError("Could not find \"" + values[4] + "\" in file")
        y.append(names_to_integers[values[4]])
        # print values
    return x, y


if __name__ == "__main__":
    flower_truth = {
        "Iris-setosa": 1,
        "Iris-versicolor": 2,
        "Iris-virginica": 3
    }
    x, y = read_iris_dataset("../datasets/iris.data", flower_truth)
    print x, y
    prob = svm.svm_problem(y, x)
    param = svm.svm_parameter('-t 0 -c 4 -b 1')
    m = svmutil.svm_train(prob, param)
    p = svmutil.svm_predict(y, x, m)
    print "DONE"
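# The trained iris model can be persisted and reloaded with the stock svmutil
# API; the file name here is arbitrary.
svmutil.svm_save_model('iris.model', m)
m2 = svmutil.svm_load_model('iris.model')
p_labels, p_acc, p_vals = svmutil.svm_predict(y, x, m2)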
def rank(self, pos_ids, neg_ids=()):
    """
    Rank the current model, returning a mapping of element IDs to a
    ranking valuation. This valuation should be a probability in the range
    of [0, 1], where 1.0 is the highest rank and 0.0 is the lowest rank.

    :raises RuntimeError: No current model.

    :param pos_ids: List of positive data IDs. Required.
    :type pos_ids: list of int

    :param neg_ids: List of negative data IDs. Optional.
    :type neg_ids: list of int

    :return: Mapping of ingest ID to a rank.
    :rtype: dict of (int, float)

    """
    if not self.has_model():
        raise RuntimeError("No model available for this indexer.")

    # Automatically supplement the negative IDs with the most distant UIDs
    # from the provided positive UIDs.
    # if len(neg_ids) == 0:
    #     neg_ids = self._pick_auto_negatives(pos_ids)
    neg_ids = set(neg_ids).union(self._pick_auto_negatives(pos_ids))

    #
    # SVM model training
    #
    uid_list = sorted(set.union(set(pos_ids), neg_ids))
    feature_len = self._feature_mat.shape[1]
    # positive label: 1, negative label: 0
    bool2label = {1: 1, 0: 0}
    labels = [bool2label[uid in pos_ids] for uid in uid_list]
    train_features = \
        self._feature_mat[list(self._uid2idx_map[uid]
                               for uid in uid_list), :]

    self.log.debug("Creating SVM problem")
    svm_problem = svm.svm_problem(labels, train_features.tolist())
    self.log.debug("Creating SVM model")
    w1_weight = max(1.0, len(neg_ids) / float(len(pos_ids)))
    svm_model = svmutil.svm_train(svm_problem,
                                  self.svm_train_params % w1_weight)
    if svm_model.l == 0:
        raise RuntimeError("SVM Model learning failed")

    # Finding associated clip IDs of trained support vectors
    self.log.debug("Finding clip IDs for support vectors")
    hash2feature_idx = dict([(hash(tuple(f)), r)
                             for r, f in enumerate(self._feature_mat)])
    svm_sv_idxs = []
    tmp_list = [0] * feature_len
    for r in range(svm_model.nSV[0] + svm_model.nSV[1]):
        for c in range(feature_len):
            tmp_list[c] = svm_model.SV[r][c].value
        svm_sv_idxs.append(hash2feature_idx[hash(tuple(tmp_list))])

    #
    # Platt Scaling for probability ranking
    #
    # Features associated to support vectors in trained model
    self.log.debug("Forming data for Platt Scaling")
    # We need the distances between support vectors to all features
    test_kernel = self._distance_mat[svm_sv_idxs, :]
    weights = numpy.array(svm_model.get_sv_coef()).flatten()
    margins = (numpy.mat(weights) * test_kernel).A[0]

    self.log.debug("Performing Platt scaling")
    rho = svm_model.rho[0]
    probA = svm_model.probA[0]
    probB = svm_model.probB[0]
    #: :type: numpy.core.multiarray.ndarray
    probs = 1.0 / (1.0 + numpy.exp((margins - rho) * probA + probB))

    # Test if the probability of an adjudicated positive is below a
    # threshold. If it is, invert probabilities.
    # * Find lowest ranking positive example
    # * Test if the probability valuation falls in the lower 50% of all
    #   probabilities.
    pos_probs = numpy.array(
        [probs[self._uid2idx_map[uid]] for uid in pos_ids]
    )
    pos_mean_prob = pos_probs.sum() / pos_probs.size
    total_mean_prob = probs.sum() / probs.size
    if pos_mean_prob < total_mean_prob:
        probs = 1.0 - probs

    probability_map = dict(zip(self._uid_array, probs))

    return probability_map
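# Standalone view of the Platt-scaling step used in rank() above: libsvm's
# fitted sigmoid parameters (probA, probB) map decision margins to
# probabilities. The numbers here are made up for illustration.
import numpy

example_margins = numpy.array([-1.5, 0.0, 2.0])
rho, probA, probB = 0.1, -1.0, 0.0
example_probs = 1.0 / (1.0 + numpy.exp((example_margins - rho) * probA + probB))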
from pybrain.datasets import ClassificationDataSet

print "Reading data set..."
DS = ClassificationDataSet.loadFromFile('dataset.csv')

# Split validation set
TestDS, TrainDS = DS.splitWithProportion(0.25)

# train svm
from svm import svm_problem, svm_parameter, libsvm, gen_svm_nodearray

# define problem with data from the pybrain dataset.
# best python explanation for libsvm is here:
# https://github.com/arnaudsj/libsvm/tree/master/python
# we have to convert the data to ints and lists because of the low-level c interface
prob = svm_problem([int(t) for t in TrainDS['target']],
                   [list(i) for i in TrainDS['input']])
param = svm_parameter()
# option: -t 0: linear kernel. Best for classification.
# option: -c 0.01: regularization parameter. smaller is more regularization
# see below for all options
param.parse_options('-t 0 -c 0.01')

print "Training svm..."
model = libsvm.svm_train(prob, param)

print "Testing svm with three random inputs"
from random import randrange
for j in range(3):
    i = randrange(0, len(TestDS))
    # again some conversion needed because of low level interface
    x0, m_idx = gen_svm_nodearray(list(TestDS['input'][i]))
    prediction = libsvm.svm_predict(model, x0)
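# If the model had been trained with '-b 1', the same low-level interface can
# also return per-class probability estimates; a sketch under that assumption.
from ctypes import c_double

x0, m_idx = gen_svm_nodearray(list(TestDS['input'][0]))
nr_class = libsvm.svm_get_nr_class(model)
prob_estimates = (c_double * nr_class)()
pred_label = libsvm.svm_predict_probability(model, x0, prob_estimates)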
def train(self, features, labels):
    assert isinstance(labels, np.ndarray), "labels should be numpy array"
    features = self._cleanse_features(features)
    problem = svm.svm_problem(labels.tolist(), features)
    self.model = svm.svm_model(problem, self._svm_parameter)
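# svm.svm_model(problem, param) is the pre-3.0 libsvm Python API. Against a
# modern libsvm the equivalent (an assumption about the intended upgrade, not
# part of the original code) would be:
# from svmutil import svm_train
# self.model = svm_train(problem, self._svm_parameter)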
def _train(self, class_examples, **extra_params):
    """
    Internal method that trains the classifier implementation.

    This method is called after checking that there is not already a model
    trained, thus it can be assumed that no model currently exists.

    The class labels will have already been checked before entering this
    method, so it can be assumed that the ``class_examples`` will contain
    at least two classes.

    :param class_examples: Dictionary mapping class labels to iterables of
        DescriptorElement training examples.
    :type class_examples: dict[collections.abc.Hashable,
        collections.abc.Iterable[smqtk.representation.DescriptorElement]]

    :param extra_params: Dictionary with extra parameters for training.
        This is not used by this implementation.
    :type extra_params: None | dict[basestring, object]

    """
    # Offset from 0 for positive class labels to use
    # - not using label of 0 because we think libSVM wants positive labels
    CLASS_LABEL_OFFSET = 1

    # Stuff for debug reporting
    param_debug = {'-q': ''}
    if self._log.getEffectiveLevel() <= logging.DEBUG:
        param_debug = {}

    # Form libSVM problem input values
    self._log.debug("Formatting problem input")
    train_labels = []
    train_vectors = []
    train_group_sizes = []  # number of examples per class
    self.svm_label_map = {}
    # Making SVM label assignment deterministic to alphabetic order
    for i, l in enumerate(sorted(class_examples), CLASS_LABEL_OFFSET):
        # Map integer SVM label to semantic label
        self.svm_label_map[i] = l
        self._log.debug('-- class %d (%s)', i, l)
        # requires a sequence, so making the iterable ``g`` a tuple
        g = class_examples[l]
        if not isinstance(g, collections.abc.Sequence):
            self._log.debug('   (expanding iterable into sequence)')
            g = tuple(g)
        train_group_sizes.append(float(len(g)))
        x = numpy.array(DescriptorElement.get_many_vectors(g))
        x = self._norm_vector(x)
        train_labels.extend([i] * x.shape[0])
        train_vectors.extend(x.tolist())
        del g, x

    assert len(train_labels) == len(train_vectors), \
        "Count mismatch between parallel labels and descriptor vectors " \
        "being sent to libSVM (%d != %d)" \
        % (len(train_labels), len(train_vectors))

    self._log.debug("Forming train params")
    #: :type: dict
    params = deepcopy(self.train_params)
    params.update(param_debug)
    # Calculating class weights if set to C-SVC type SVM
    if '-s' not in params or int(params['-s']) == 0:
        # (john.moeller): The weighting should probably be the geometric
        # mean of the number of examples over the classes divided by the
        # number of examples for the current class.
        gmean = scipy.stats.gmean(train_group_sizes)
        for i, n in enumerate(train_group_sizes, CLASS_LABEL_OFFSET):
            w = gmean / n
            params['-w' + str(i)] = w
            self._log.debug("-- class '%s' weight: %s",
                            self.svm_label_map[i], w)

    self._log.debug("Making parameters obj")
    svm_params = svmutil.svm_parameter(self._gen_param_string(params))
    self._log.debug("Creating SVM problem")
    svm_problem = svm.svm_problem(train_labels, train_vectors)
    del train_vectors
    self._log.debug("Training SVM model")
    self.svm_model = svmutil.svm_train(svm_problem, svm_params)
    self._log.debug("Training SVM model -- Done")

    if self.svm_label_map_elem and self.svm_label_map_elem.writable():
        self._log.debug("saving labels to element (%s)",
                        self.svm_label_map_elem)
        self.svm_label_map_elem.set_bytes(
            pickle.dumps(self.svm_label_map, -1)
        )
    if self.svm_model_elem and self.svm_model_elem.writable():
        self._log.debug("saving model to element (%s)",
                        self.svm_model_elem)
        # LibSvm I/O only works with filepaths, thus the need for an
        # intermediate temporary file.
        fd, fp = tempfile.mkstemp()
        try:
            svmutil.svm_save_model(fp, self.svm_model)
            # Use the file descriptor to create the file object.
            # This avoids reopening the file and will automatically
            # close the file descriptor on exiting the with block.
            # fdopen() is required because in Python 2 open() does
            # not accept a file descriptor.
            with os.fdopen(fd, 'rb') as f:
                self.svm_model_elem.set_bytes(f.read())
        finally:
            os.remove(fp)
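# Counterpart sketch for reading the serialized model back through libsvm's
# file-only I/O; svm_model_elem.get_bytes() is assumed from the same SMQTK
# element API as set_bytes() above.
fd, fp = tempfile.mkstemp()
try:
    with os.fdopen(fd, 'wb') as f:
        f.write(self.svm_model_elem.get_bytes())
    model = svmutil.svm_load_model(fp)
finally:
    os.remove(fp)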
def result(model, text_list, label_list, NFR, ratio, add_name, string,
           Normalization=False):
    global pre_sum, rec_sum, f1_sum
    SKF = StratifiedKFold(n_splits=5, shuffle=True)
    count = 0
    load_model = 'NO'
    if Normalization:
        load_model = 'word2vec'
    for tra_index, te_index in SKF.split(text_list, label_list):
        count += 1
        train_text, test_text = [], []
        train_label, test_label = [], []
        loop(index=tra_index, X=text_list, y=label_list,
             textlist=train_text, labellist=train_label)
        loop(index=te_index, X=text_list, y=label_list,
             textlist=test_text, labellist=test_label)
        train = []
        test = []
        for j in range(len(train_text)):  # data_train -> train
            train.append(train_text[j][1])
        word_count(model=model, NFR=NFR, text=train, labels=train_label,
                   ratio=ratio, add_name=add_name, string=string,
                   count=count, load_model=load_model)
        for k in range(len(test_text)):  # data_test -> test
            test.append(test_text[k][1])
        dense_all_test = []
        dense_all_train = []
        dictionary = corpora.Dictionary(train)
        if Normalization == False:
            docs_train = train
            docs_test = test
            siki = 9999  # siki = threshold; only set in the non-normalized branch
        # elif Normalization == True:
        #     w1 = wordVecMaker(tokens=train, threshold=siki, nfr=NFR,
        #                       count=count, classify=classification_model,
        #                       path=add_path)
        #     docs_train = w1.synonimTransfer(sentences=train,
        #                                     synonyms=w1.get_synonym())
        #     # w2 = get_synonym(test, siki)
        #     docs_test = w1.synonimTransfer(sentences=test,
        #                                    synonyms=w1.get_synonym())
        bow_corpus_train = [dictionary.doc2bow(d) for d in docs_train]
        bow_corpus_test = [dictionary.doc2bow(d) for d in docs_test]
        for bow in bow_corpus_train:
            dense = list(
                matutils.corpus2dense([bow], num_terms=len(dictionary)).T[0])
            dense_all_train.append(dense)
        for bow2 in bow_corpus_test:
            dense2 = list(
                matutils.corpus2dense([bow2], num_terms=len(dictionary)).T[0])
            dense_all_test.append(dense2)
        if model == 'SMO':
            prob = svm_problem(train_label, dense_all_train)
            param = svm_parameter("-s 0 -t 0")  # C-SVC with a linear kernel
            mdl = svmutil.svm_train(prob, param)
            label_predict, accuracy, dec_values = svmutil.svm_predict(
                test_label, dense_all_test, mdl)
        elif model in modelselection:
            clf = modelselection[model]
            clf.fit(dense_all_train, train_label)
            label_predict = clf.predict(dense_all_test)
        pre_score_ = precision_score(test_label, label_predict, average=None)
        rec_score_ = recall_score(test_label, label_predict, average=None)
        f1_score_ = f1_score(test_label, label_predict, average=None)
        pre_sum += pre_score_[1]
        rec_sum += rec_score_[1]
        f1_sum += f1_score_[1]
        # Column labels: 要件 = requirement text, 正解 = ground truth,
        # 予測 = prediction
        df = pd.DataFrame({
            '要件': test,
            '正解': test_label,
            '予測': label_predict
        }, columns=['要件', '正解', '予測'])
        # Output directory: 実験/NFR分類/予測結果 =
        # experiment/NFR classification/prediction results
        dir_path = ('実験/NFR分類/予測結果/' + model + '_10/' + load_model + '/' +
                    add_name + '/' + NFR + '/' + str(ratio) + '/')
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
        df.to_csv(dir_path + '/' + str(siki) + '_' + str(count) +
                  '(厚[' + string + ']).csv')
def train(self, positive_classes, negatives):
    """
    Train the supervised SVM classifier model.

    The class label ``negative`` is reserved for the negative class.

    If a model is already loaded, we will raise an exception in order to
    prevent accidental overwrite.

    NOTE: This abstract method provides generalized error checking and
    should be called via ``super`` in implementing methods.

    :param positive_classes: Dictionary mapping positive class labels to
        iterables of DescriptorElement training examples.
    :type positive_classes: dict[collections.Hashable,
        collections.Iterable[smqtk.representation.DescriptorElement]]

    :param negatives: Iterable of negative DescriptorElement examples.
    :type negatives:
        collections.Iterable[smqtk.representation.DescriptorElement]

    :raises ValueError: The ``negative`` label was found in the
        ``positive_classes`` dictionary. This is reserved for the negative
        example class.
    :raises ValueError: There were no positive or negative examples.
    :raises RuntimeError: A model already exists in this instance. Following
        through with training would overwrite this model. Throwing an
        exception for information protection.

    """
    super(LibSvmClassifier, self).train(positive_classes, negatives)

    # Offset from 0 for positive class labels to use
    # - not using label of 0 because we think libSVM wants positive labels
    CLASS_LABEL_OFFSET = 1

    # Stuff for debug reporting
    etm_ri = None
    param_debug = {"-q": ""}
    if self._log.getEffectiveLevel() <= logging.DEBUG:
        etm_ri = 1.0
        param_debug = {}

    # Form libSVM problem input values
    self._log.debug("Formatting problem input")
    train_labels = []
    train_vectors = []
    train_group_sizes = []
    self.svm_label_map = {}
    # Making SVM label assignment deterministic to alphabetic order
    for i, l in enumerate(sorted(positive_classes), CLASS_LABEL_OFFSET):
        # Map integer SVM label to semantic label
        self.svm_label_map[i] = l
        self._log.debug("-- class %d (%s)", i, l)
        # requires a sequence, so making the iterable ``g`` a tuple
        g = positive_classes[l]
        if not isinstance(g, collections.Sequence):
            g = tuple(g)
        train_group_sizes.append(float(len(g)))
        x = elements_to_matrix(g, report_interval=etm_ri)
        x = self._norm_vector(x)
        train_labels.extend([i] * x.shape[0])
        train_vectors.extend(x.tolist())
        del g, x

    self._log.debug("-- negatives (-1)")
    # Map integer SVM label to semantic label
    self.svm_label_map[-1] = self.NEGATIVE_LABEL
    # requires a sequence, so making the iterable ``negatives`` a tuple
    if not isinstance(negatives, collections.Sequence):
        negatives = tuple(negatives)
    negatives_size = float(len(negatives))
    x = elements_to_matrix(negatives, report_interval=etm_ri)
    x = self._norm_vector(x)
    train_labels.extend([-1] * x.shape[0])
    train_vectors.extend(x.tolist())
    del negatives, x

    self._log.debug(
        "Training elements: %d labels, %d vectors (should be the same)",
        len(train_labels), len(train_vectors)
    )

    self._log.debug("Forming train params")
    #: :type: dict
    params = deepcopy(self.train_params)
    params.update(param_debug)
    # Only need to calculate positive class weights when C-SVC type
    if "-s" not in params or int(params["-s"]) == 0:
        for i, n in enumerate(train_group_sizes, CLASS_LABEL_OFFSET):
            params["-w" + str(i)] = max(1.0, negatives_size / float(n))

    self._log.debug("Making parameters obj")
    svm_params = svmutil.svm_parameter(self._gen_param_string(params))
    self._log.debug("Creating SVM problem")
    svm_problem = svm.svm_problem(train_labels, train_vectors)
    self._log.debug("Training SVM model")
    self.svm_model = svmutil.svm_train(svm_problem, svm_params)
    self._log.debug("Training SVM model -- Done")

    if self.svm_label_map_fp:
        self._log.debug("saving file -- labels -- %s", self.svm_label_map_fp)
        with open(self.svm_label_map_fp, "wb") as f:
            cPickle.dump(self.svm_label_map, f)
    if self.svm_model_fp:
        self._log.debug("saving file -- model -- %s", self.svm_model_fp)
        svmutil.svm_save_model(self.svm_model_fp, self.svm_model)
def main(args):
    paramsfn = args[0]
    exec(open(paramsfn, 'r').read())
    if len(args) > 1:
        crange = [float(args[1])]
        gammarange = [float(args[2])]
    output, input, fieldnames, fold_inds = load_data(datafilename, use_specific_fold_inds)
    fold_start = [-1]
    if use_specific_fold_inds:
        # Regroup rows so that each fold's samples are contiguous and record
        # where each fold starts.
        unique_fold_ids = unique(fold_inds)
        row_inds = []
        outputcopy = []
        inputcopy = zeros([size(input, 0), size(input, 1)], dtype='float64')
        fold_start = [0]
        curind = 0
        for ind in unique_fold_ids:
            row_inds = [i for i in xrange(len(fold_inds)) if fold_inds[i] == ind]
            inputcopy[curind:curind + len(row_inds), :] = input[row_inds, :]
            outputcopy.extend([output[i] for i in row_inds])
            curind += len(row_inds)
            fold_start.append(fold_start[-1] + len(row_inds))
        input = inputcopy
        output = outputcopy
        nf = len(fold_start) - 1
    if testdatafilename != '':
        output_test, input_test, fieldnames, fold_inds_test = load_data(testdatafilename, False)
    if doscale:
        maxinput = input.max(0)
        mininput = input.min(0)
        input = (input - mininput) / (maxinput - mininput)
        if testdatafilename != '':
            input_test = (input_test - mininput) / (maxinput - mininput)
        if savemodel:
            save_scale_data(datafilename + '_scales.dat', maxinput, mininput)
    if donormalize:
        means = input.mean(0)
        stds = sqrt(input.var(0))
        input = (input - means) / stds
        if testdatafilename != '':
            input_test = (input_test - means) / stds
        if savemodel:
            save_zscore_data(datafilename + '_meansstdevs.dat', means, stds)
    if numcpus == 'auto':
        p = Pool()
    else:
        if numcpus == 1:
            p = ''
        else:
            p = Pool(numcpus)
    if choose_specific_features:
        for specific_selected_choice in specific_selected_features:
            inputfiltered = input[:, specific_selected_choice]
            with Timer():
                if use_specific_fold_inds:
                    results = mygrid.grid_classify_multi(crange, gammarange, output,
                                                         [list(x) for x in inputfiltered],
                                                         nf, useprob, timeout, p, fold_start)
                else:
                    results = mygrid.grid_classify_multi(crange, gammarange, output,
                                                         [list(x) for x in inputfiltered],
                                                         nf, useprob, timeout, p)
            param = svm.svm_parameter('-c %g -g %g -b %d' % (results[-2], results[-1], int(useprob)))
            prob = svm.svm_problem(output, [list(x) for x in inputfiltered])
            target = (c_double * prob.l)()
            fold_start_p = (c_int * len(fold_start))()
            for i in xrange(len(fold_start)):
                fold_start_p[i] = fold_start[i]
            libsvm.svm_cross_validation_labeltargets(prob, fold_start_p, param, nf, target)
            labels = unique(output)
            ACC, confusionmatrix = mygrid.evaluations_classify_multi(output, target, labels)
            probchance = 0
            N = len(output)
            for i in xrange(len(labels)):
                nums_per_class_pred = sum(confusionmatrix[:, i])
                probchance += (sum(confusionmatrix[:, i]) * sum(confusionmatrix[i, :])) * 1.0 / (N * N)
            kappa = (ACC / 100 - probchance) * 1.0 / (1 - probchance)
            print 'Optimal gamma = %g\nOptimal c = %g' % (results[-1], results[-2])
            print 'Top CV ACC = %g' % (ACC)
            print 'Top CV kappa = %g' % (kappa)
            sys.stdout.write('=======')
            for i in xrange(len(labels)):
                sys.stdout.write('=========')
            print '=========='
            print '||   ||',
            for i in xrange(len(labels)):
                print '%6d |' % labels[i],
            print '   ||'
            sys.stdout.write('||=====')
            for i in xrange(len(labels)):
                sys.stdout.write('=========')
            print '========||'
            for i in xrange(len(labels)):
                print '||%3d||' % labels[i],
                for j in xrange(len(labels)):
                    print '%6g |' % confusionmatrix[i, j],
                print '%6g ||' % sum(confusionmatrix[i, :])
            sys.stdout.write('||-----')
            for i in xrange(len(labels)):
                sys.stdout.write('---------')
            print '--------||'
            print '||   ||',
            for i in xrange(len(labels)):
                print '%6g |' % sum(confusionmatrix[:, i]),
            print '%6g ||' % N
            sys.stdout.write('=======')
            for i in xrange(len(labels)):
                sys.stdout.write('=========')
            print '=========='
            if savemodel:
                param = ('-c %g -g %g -b %d' % (results[-2], results[-1], int(useprob)))
                m = svm_train(output, [list(x) for x in inputfiltered], param)
                svm_save_model(datafilename + '.model', m)
            if outputlog != '':
                fout = open(outputlog, 'a')
                print >> fout, '========================='
                print >> fout, datafilename
                print >> fout, doscale, donormalize, dopca, '(scale/norm/pca)'
                print >> fout, crange[0], crange[-1], gammarange[0], gammarange[-1], '(cs,gammas)'
                print >> fout, use_specific_fold_inds, nf, '(use specific folds, numfold)'
                print >> fout, 'SPECIFIC FIELDS:'
                print >> fout, specific_selected_choice
                if fieldnames != []:
                    for i in specific_selected_choice:
                        print >> fout, fieldnames[i],
                    print >> fout
                print >> fout, 'train: '
                print >> fout, '    ACC=%g,kappa=%g (g=%g,c=%g)' % (ACC, kappa, results[-1], results[-2])
                fout.write('    =======')
                for i in xrange(len(labels)):
                    fout.write('=========')
                print >> fout, '=========='
                print >> fout, '    ||   ||',
                for i in xrange(len(labels)):
                    print >> fout, '%6d |' % labels[i],
                print >> fout, '   ||'
                fout.write('    ||=====')
                for i in xrange(len(labels)):
                    fout.write('=========')
                print >> fout, '========||'
                for i in xrange(len(labels)):
                    print >> fout, '    ||%3d||' % labels[i],
                    for j in xrange(len(labels)):
                        print >> fout, '%6g |' % confusionmatrix[i, j],
                    print >> fout, '%6g ||' % sum(confusionmatrix[i, :])
                fout.write('    ||-----')
                for i in xrange(len(labels)):
                    fout.write('---------')
                print >> fout, '--------||'
                print >> fout, '    ||   ||',
                for i in xrange(len(labels)):
                    print >> fout, '%6g |' % sum(confusionmatrix[:, i]),
                print >> fout, '%6g ||' % N
                fout.write('    =======')
                for i in xrange(len(labels)):
                    fout.write('=========')
                print >> fout, '=========='
                fout.close()
            if testdatafilename != '':
                inputfiltered_test = input_test[:, specific_selected_choice]
                param = ('-c %g -g %g -b %d' % (results[-2], results[-1], int(useprob)))
                m = svm_train(output, [list(x) for x in inputfiltered], param)
                pred_labels, (ACC, MSE, SCC), pred_values = svm_predict(
                    output_test, [list(x) for x in inputfiltered_test], m, '-b %d' % (int(useprob)))
                labels = m.get_labels()
                ACC, confusionmatrix = mygrid.evaluations_classify_multi(output_test, pred_labels, labels)
                probchance = 0
                N = len(output_test)
                for i in xrange(len(labels)):
                    nums_per_class_pred = sum(confusionmatrix[:, i])
                    probchance += (sum(confusionmatrix[:, i]) * sum(confusionmatrix[i, :])) * 1.0 / (N * N)
                kappa = (ACC / 100 - probchance) * 1.0 / (1 - probchance)
                print 'Test optimized accuracy = %g' % (ACC)
                print 'Test optimized kappa = %g' % (kappa)
                sys.stdout.write('=======')
                for i in xrange(len(labels)):
                    sys.stdout.write('=========')
                print '=========='
                print '||   ||',
                for i in xrange(len(labels)):
                    print '%6d |' % labels[i],
                print '   ||'
                sys.stdout.write('||=====')
                for i in xrange(len(labels)):
                    sys.stdout.write('=========')
                print '========||'
                for i in xrange(len(labels)):
                    print '||%3d||' % labels[i],
                    for j in xrange(len(labels)):
                        print '%6g |' % confusionmatrix[i, j],
                    print '%6g ||' % sum(confusionmatrix[i, :])
                sys.stdout.write('||-----')
                for i in xrange(len(labels)):
                    sys.stdout.write('---------')
                print '--------||'
                print '||   ||',
                for i in xrange(len(labels)):
                    print '%6g |' % sum(confusionmatrix[:, i]),
                print '%6g ||' % N
                sys.stdout.write('=======')
                for i in xrange(len(labels)):
                    sys.stdout.write('=========')
                print '=========='
    else:
        with Timer():
            results = mygrid.grid_classify_multi(crange, gammarange, output,
                                                 [list(x) for x in input],
                                                 nf, useprob, timeout, p)
        param = svm.svm_parameter('-c %g -g %g -b %d' % (results[-2], results[-1], int(useprob)))
        prob = svm.svm_problem(output, [list(x) for x in input])
        target = (c_double * prob.l)()
        labels = unique(output)
        print 'Optimal gamma = %g\nOptimal c = %g' % (results[-1], results[-2])
        libsvm.svm_cross_validation_labeltargets(prob, param, nf, target)
        ACC, confusionmatrix = mygrid.evaluations_classify_multi(output, target, labels)
        print 'Top CV ACC = %g' % (ACC)
        sys.stdout.write('=======')
        for i in xrange(len(labels)):
            sys.stdout.write('=========')
        print '=========='
        print '||   ||',
        for i in xrange(len(labels)):
            print '%6d |' % labels[i],
        print '   ||'
        sys.stdout.write('||=====')
        for i in xrange(len(labels)):
            sys.stdout.write('=========')
        print '========||'
        for i in xrange(len(labels)):
            print '||%3d||' % labels[i],
            for j in xrange(len(labels)):
                print '%6g |' % confusionmatrix[i, j],
            print '%6g ||' % sum(confusionmatrix[i, :])
        sys.stdout.write('||-----')
        for i in xrange(len(labels)):
            sys.stdout.write('---------')
        print '--------||'
        print '||   ||',
        for i in xrange(len(labels)):
            print '%6g |' % sum(confusionmatrix[:, i]),
        print '%6g ||' % len(output)
        sys.stdout.write('=======')
        for i in xrange(len(labels)):
            sys.stdout.write('=========')
        print '=========='
        del target
        if savemodel:
            param = ('-c %g -g %g -b %d' % (results[-2], results[-1], int(useprob)))
            m = svm_train(output, [list(x) for x in input], param)
            svm_save_model(datafilename + '.model', m)
        if testdatafilename != '':
            param = ('-c %g -g %g -b %d' % (results[-2], results[-1], int(useprob)))
            m = svm_train(output, [list(x) for x in input], param)
            pred_labels, (ACC, MSE, SCC), pred_values = svm_predict(
                output_test, [list(x) for x in input_test], m, '-b %d' % (int(useprob)))
            labels = m.get_labels()
            ACC, confusionmatrix = mygrid.evaluations_classify_multi(output_test, pred_labels, labels)
            print 'Test optimized accuracy = %g' % (ACC)
            sys.stdout.write('=======')
            for i in xrange(len(labels)):
                sys.stdout.write('=========')
            print '=========='
            print '||   ||',
            for i in xrange(len(labels)):
                print '%6d |' % labels[i],
            print '   ||'
            sys.stdout.write('||=====')
            for i in xrange(len(labels)):
                sys.stdout.write('=========')
            print '========||'
            for i in xrange(len(labels)):
                print '||%3d||' % labels[i],
                for j in xrange(len(labels)):
                    print '%6g |' % confusionmatrix[i, j],
                print '%6g ||' % sum(confusionmatrix[i, :])
            sys.stdout.write('||-----')
            for i in xrange(len(labels)):
                sys.stdout.write('---------')
            print '--------||'
            print '||   ||',
            for i in xrange(len(labels)):
                print '%6g |' % sum(confusionmatrix[:, i]),
            print '%6g ||' % len(output_test)
            sys.stdout.write('=======')
            for i in xrange(len(labels)):
                sys.stdout.write('=========')
            print '=========='
        if outputlog != '':
            fout = open(outputlog, 'a')
            print >> fout, results  # [:-1]
            # for key in results[-1].keys():
            #     print >> fout, key, results[-1][key]
            fout.close()
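# Standalone view of the chance-corrected accuracy (Cohen's kappa) computed
# inline above, for a generic confusion matrix C (rows = true, cols = predicted).
import numpy as np

def cohen_kappa(C):
    C = np.asarray(C, dtype=float)
    N = C.sum()
    acc = np.trace(C) / N
    # expected agreement by chance: sum over classes of rowsum * colsum / N^2
    probchance = (C.sum(axis=0) * C.sum(axis=1)).sum() / (N * N)
    return (acc - probchance) / (1 - probchance)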
for i, fileName in enumerate(['./dataset2/avon.csv',
                              './dataset2/brian_merge.csv',
                              './dataset2/mon_merge.csv',
                              './dataset2/nofar_merge.csv']):
    tmp = readDataset(fileName)  # array of Instance
    dataSet = dataSet + tmp
    print 'size:', len(tmp)
    label = label + [i] * len(tmp)

dataSet, label = shuffle(dataSet, label, random_state=0)
cutIndex = int(TRAIN_SET_RATIO * len(dataSet))

## use accel_abs and alpha_abs as input for encoding respectively
print 'learning dictionary'
data_accel = [I.accel_abs() for I in dataSet]
data_alpha = [I.alpha_abs() for I in dataSet]
RPDictionary_accel = Dictionary(PATCH_SIZE, data_accel[:cutIndex])
RPDictionary_alpha = Dictionary(PATCH_SIZE, data_alpha[:cutIndex])
aggregate_feature = [f[0] + f[1] for f in zip(RPDictionary_accel.encoding(data_accel),
                                              RPDictionary_alpha.encoding(data_alpha))]
# aggregate_feature = preprocessing.scale(aggregate_feature)  # scale columns independently to have zero mean and unit variance

writeFeature('./svm_train', aggregate_feature[:cutIndex], label[:cutIndex])
writeFeature('./svm_test', aggregate_feature[cutIndex:], label[cutIndex:])

## SVM training
X_train, Y_train = readFeature('./svm_train', PATCH_SIZE * 2)
prob = svm_problem(Y_train, X_train)
param = svm_parameter('-t 1 -q -d 2')
model = svm_train(prob, param)

## SVM predicting
X_test, Y_test = readFeature('./svm_test', PATCH_SIZE * 2)
p_labels, p_acc, p_vals = svm_predict(Y_test, X_test, model)
print p_acc
print confusion_matrix(Y_test, p_labels)
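# '-t 1 -d 2' above selects a degree-2 polynomial kernel. For reference, the
# stock libsvm kernel codes are 0=linear, 1=polynomial, 2=RBF, 3=sigmoid, so an
# RBF alternative would look like:
# model = svm_train(prob, svm_parameter('-t 2 -g 0.125 -q'))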
#!/usr/bin/python
# encoding: utf-8
import svm
import svmutil

print 'test'
y = [13, 9, 7]
x = [[1, 1, 1], [1, 0, 1], [1, 1, 0]]  # data
prob = svm.svm_problem(y, x)  # training data
param = svm.svm_parameter()
param.kernel_type = svm.LINEAR
param.C = 10
m = svmutil.svm_train(prob, param)  # train

# predict the class of a new input
# res = svmutil.svm_predict([1], [[1,1,1]], m)
res = svmutil.svm_predict([3], [[1, 0, 0]], m)
print res
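# svmutil can also cross-validate directly from the label/vector lists: with
# '-v N', svm_train returns the N-fold CV accuracy instead of a model. Purely
# illustrative on this tiny toy set.
cv_acc = svmutil.svm_train(y, x, '-v 3')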