def train(self, features, grades):
    """Fit a ranking SVM on the essays and build the score curve.

    features - numpy array/matrix with one row per essay
    grades - vector with one entry per essay

    Sets self.min_grade, self.max_grade, self.model, self.grade_probs
    and self.curve.
    """
    self.min_grade = min(grades)
    self.max_grade = max(grades)
    essay_count, _ = features.shape  # unpacking also insists on a 2-D input

    # svmlight rows: (label, [(feature, value), ...], query_id).
    # Feature numbers are 1-based and every essay uses query id 1.
    rows = [
        (grade,
         [(col + 1, val) for col, val in enumerate(features[row, :])],
         1)
        for row, grade in enumerate(grades)
    ]
    self.model = svmlight.learn(rows, type='ranking', verbosity=0, C=100)

    # Relative frequency of every grade seen in training.
    tally = {}
    for grade in grades:
        tally[grade] = tally.get(grade, 0) + 1
    total = float(essay_count)
    self.grade_probs = dict(
        (grade, hits / total) for grade, hits in tally.items())

    scores = self.classify_rank_svm(features)
    self.curve = Curve(scores, probs=self.grade_probs)
def _train_classifier(self):
    """Fit the vectorizer on all site texts and train the classifier.

    Positive sites get label 1 and negative sites label -1; the trained
    model is stored in self.clf.
    """
    print("Training classifier...")

    # Collect the text of every page, positive sites first.
    corpus = []  # list of strings
    for sites in (self.pos_sites, self.neg_sites):
        for site in sites:
            corpus.extend(page.get_text(self.text_type) for page in site)
    self.vectorizer.fit(corpus)
    print(self.vectorizer.vocabulary_)

    pos_vectors = np.array(
        [site.get_vsm(self.vectorizer, self.text_type)
         for site in self.pos_sites])
    pos = self._convert_to_svmlight(pos_vectors, 1)
    neg_vectors = np.array(
        [site.get_vsm(self.vectorizer, self.text_type)
         for site in self.neg_sites])
    neg = self._convert_to_svmlight(neg_vectors, -1)

    train = pos + neg
    print("%s %s" % ("Number of pos: ", len(pos)))
    print("%s %s" % ("Number of neg: ", len(neg)))

    # Earlier setting kept for reference: cost_ratio=0.10, C=10.
    # NOTE(review): confirm the installed binding accepts `cost_ratio`.
    self.clf = svmlight.learn(train, type='classification',
                              C=0.01, cost_ratio=2.0, verbosity=0)
def _classify(data, cond_info):
    """Run a single leave-one-run-out classification.

    Each run (row of ``cond_info``) is held out in turn: a linear SVM is
    trained on the remaining runs and the sign of its output on the
    held-out run is compared with that run's row of ``cond_info``.

    Returns the mean of the per-run accuracies, in percent.
    """
    (n_runs, n_blocks) = cond_info.shape

    # Start from NaN so that a run that was never scored is detectable.
    acc = np.empty(n_runs)
    acc.fill(np.nan)

    all_runs = np.arange(n_runs)
    for i_test_run in range(n_runs):
        # exclude the test run from the training set
        i_train = np.setdiff1d(all_runs, [i_test_run])
        train_data = _format_data(data[:, i_train, :],
                                  cond_info[i_train, :])
        model = svmlight.learn(train_data,
                               type="classification",
                               kernel="linear")

        test_data = _format_data(data[:, [i_test_run], :],
                                 cond_info[[i_test_run], :])
        pred = svmlight.classify(model, test_data)

        n_correct = (np.sign(pred) == cond_info[i_test_run, :]).sum()
        acc[i_test_run] = float(n_correct) / len(pred) * 100.0

    # Internal sanity check: every run must have been scored.
    assert np.sum(np.isnan(acc)) == 0
    return np.mean(acc)
def main_svmlight():
    """Debug driver: fit a regression model on synthetic data and print it.

    NOTE(review): the predictions are computed on the training data but
    printed next to labels taken from test_data -- confirm this is intended.
    NOTE(review): pdb.set_trace() is still active and stops execution.
    """
    # copied:
    import svmlight
    import pdb

    training_data = syntheticData(30, 1)
    test_data = syntheticData(30, 1)
    # Alternative inputs used previously:
    #   training_data = __import__('data').train0
    #   test_data = __import__('data').test0
    print('HERE 0')
    print("%s %s" % ('training_data is', training_data))
    print("%s %s" % ('test_data is', test_data))

    # train a model based on the data
    print('HERE 1')
    regressor = svmlight.learn(training_data, type='regression',
                               kernelType=2, verbosity=3)
    print('HERE 2')

    # model data can be stored in the same format SVM-Light uses, for
    # interoperability with the binaries.
    svmlight.write_model(regressor, 'my_model.dat')
    print('HERE 3')

    # classify: returns a list of numbers, one per example.
    # (the test_data variant is disabled: svmlight.classify(model, test_data))
    pdb.set_trace()
    outputs = svmlight.classify(regressor, training_data)
    print('HERE 4')

    for predicted, example in zip(outputs, test_data):
        print('pred %.8f, actual %.8f' % (predicted, example[0]))
def training_model(ind, n=3):
    """Train a classifier on n-gram features of the pos/neg documents.

    ind: object whose get_pos_train_ind() / get_neg_train_ind() give
        positions into os.listdir("pos") / os.listdir("neg").
    n: n-gram order handed to load_features and ngrams.ngrams.

    Documents under "pos" are labelled 1 and those under "neg" -1.  Only
    n-grams known to the module-level ``fmap`` are kept.  The model is
    written to 'my_model.dat' and returned.
    """
    print("Loading features")
    load_features(n, fmap)
    print("Feature map size: %s" % fmap.getSize())

    def labelled_examples(dirname, indices, label):
        # List the directory once instead of once per document.
        names = os.listdir(dirname)
        rows = []
        for i in indices:
            # Read through a context manager so the handle is closed.
            with open(dirname + "/" + names[i]) as handle:
                text = handle.read()
            rows.append((label, [
                (fmap.getID(gram), count)
                for gram, count in ngrams.ngrams(n, text).items()
                if fmap.hasFeature(gram)
            ]))
        return rows

    print("Getting training data")
    train = labelled_examples("pos", ind.get_pos_train_ind(), 1)
    train.extend(labelled_examples("neg", ind.get_neg_train_ind(), -1))

    print("Training model")
    model = svmlight.learn(train, type='classification', verbosity=0)
    svmlight.write_model(model, 'my_model.dat')
    return model
def train(self, docs):
    """Turn labelled docs into word-count vectors and fit an SVM.

    Each doc carries its label ("POS" or "NEG") at index 0 and its word
    list at index 2.  POS docs are labelled 1, NEG docs -1.  When
    self.presence is set, each distinct word counts once per doc.
    """
    pos_word_lists = [doc[2] for doc in docs if doc[0] == "POS"]
    neg_word_lists = [doc[2] for doc in docs if doc[0] == "NEG"]
    vocabulary = set(flatten(pos_word_lists) + flatten(neg_word_lists))

    # Hand every distinct word the next free feature id.
    self.word_ids = {}
    for word in vocabulary:
        self.word_ids[word] = self.curr_id
        self.curr_id += 1

    features = []
    for label, word_lists in ((1, pos_word_lists), (-1, neg_word_lists)):
        for wordlist in word_lists:
            if self.presence:
                wordlist = set(wordlist)
            counts = Counter(wordlist)
            # svmlight rows are built ordered by feature id.
            featureVec = sorted(
                ((self.word_ids.get(word), v) for word, v in counts.items()),
                key=lambda pair: pair[0])
            features.append((label, featureVec))

    self.model = svmlight.learn(features)
def train_multi_onevsall(self, x, y, unlab_x, strategy=1):
    """Train one RBF-kernel binary SVM per class (one-vs-all).

    The number of classes is max(y) + 1, taken before subsampling.  Each
    trained model is appended to self.models and self.trained is set.
    ``strategy`` is not used by this method.
    """
    num_classes = int(np.max(y) + 1)
    print("%s %s" % (x.shape, y.shape))

    # Fixed subsampling of the first 25000 rows, starting at index 1:
    # every 2nd labelled point and every 1000th unlabelled point.
    x, y = x[1:25000:2], y[1:25000:2]
    unlab_x = unlab_x[1:25000:1000]
    print("%s %s" % ("labelled points number:", x.shape[0]))
    print("%s %s" % ("unlabelled points number:", unlab_x.shape[0]))

    x_feat = self.svmlfeaturise(x)
    # Featurised but currently not passed to the learner (see below).
    unlab_x_feat = self.svmlfeaturise(unlab_x)

    for class_id in range(num_classes):
        y_feat = (y == class_id) * 2 - 1  # +1 for this class, -1 otherwise
        lab_feats = [(y_feat[j], feat) for j, feat in enumerate(x_feat)]
        # Adding the unlabelled rows with label 0 is disabled in this
        # version, so this list stays empty.
        unlab_feats = []
        feats = lab_feats + unlab_feats

        print("======SVM Model Training started=======")
        model = svmlight.learn(feats, type='classification', verbosity=0,
                               kernel='rbf', C=self.C, rbf_gamma=self.gamma)
        print(class_id)
        print("======SVM Model Training terminated======")
        self.models.append(model)

    self.trained = True
def learnModel(self, X, y):
    """Train a ranking SVM on (X, y) and store it in self.model."""
    examples = self.__createData(X, y)
    # NOTE(review): confirm the installed binding accepts `gamma`.
    settings = dict(type='ranking', verbosity=0, kernel=self.kernel,
                    C=self.C, gamma=self.gamma)
    self.model = svmlight.learn(examples, **settings)
def fit_binary(self, X, y):
    """Train a binary SVM.

    Assume 'y' holds only 0 and 1.  The labels are copied and every
    non-positive entry of the copy is replaced by -1 before training;
    the caller's array is left untouched.
    """
    label = np.copy(y)
    non_positive = label <= 0
    label[non_positive] = -1
    examples = self.toSvmlight(X, label)
    self.model = svmlight.learn(examples)
def _train_with_values(self, dataset, poly_degree=2, C=100):
    """Train twelve polynomial-kernel regression SVMs, one per month index.

    For each month index 0..11 the data is prepared by
    self._format_training_data (which is expected to fill
    self.formatted_data) and the resulting model is appended to
    self.svm_list.
    """
    learn_options = dict(type='regression', kernel='polynomial',
                         poly_degree=poly_degree, C=C, verbosity=0)
    self.svm_list = []
    for month_ind in range(12):
        self._format_training_data(dataset, month_ind)
        if self.debug:
            print("Learning on month %d of 12 with %d samples..."
                  % (month_ind + 1, len(self.formatted_data)))
        month_model = svmlight.learn(self.formatted_data, **learn_options)
        self.svm_list.append(month_model)
def run_svm(article_count, feature_functions, kernel='polynomial',
            split=0.9, model_path='svm.model'):
    """Train and evaluate an SVM separating DEF from INDEF tokens.

    Tokens tagged 'DEF' become positive examples and tokens tagged
    'INDEF' negative ones; all other tokens are ignored.  The data is
    split with bifurcate(), the model is written to ``model_path`` and a
    dict of counts (articles, tokens, train/test sizes, correct/wrong)
    is returned.
    """
    # https://bitbucket.org/wcauchois/pysvmlight
    articles, total_token_count = preprocess_wsj(article_count,
                                                 feature_functions)

    dictionary = Dictionary()
    dictionary.add_one('ZZZZZ')  # so that no features are labeled 0

    data = []
    for article in articles:
        for sentence in article:
            for tag, token_features in zip(sentence.def_tags, sentence.data):
                # only use def / indef tokens
                if tag not in ('DEF', 'INDEF'):
                    continue
                feature_ids = sorted(set(dictionary.add(token_features)))
                label = +1 if tag == 'DEF' else -1
                # Binary features: each distinct id gets value 1.
                data.append((label, [(fid, 1) for fid in feature_ids]))

    train, test = bifurcate(data, split, shuffle=True)
    # To dump the corpora instead:
    #   for corpus, name in [(train, 'train'), (test, 'test')]:
    #       write_svm(corpus, 'wsj_svm-%s.data' % name)

    #####################
    # do svm in Python...
    model = svmlight.learn(train, type='classification', kernel=kernel)
    # svmlight.learn options
    # type: select between 'classification', 'regression', 'ranking'
    #       (preference ranking), and 'optimization'.
    # kernel: select between 'linear', 'polynomial', 'rbf', and 'sigmoid'.
    # verbosity: set the verbosity level (default 0).
    # C: trade-off between training error and margin.
    # poly_degree: parameter d in polynomial kernel.
    # rbf_gamma: parameter gamma in rbf kernel.
    # coef_lin
    # coef_const
    # costratio (corresponds to -j option to svm_learn)
    svmlight.write_model(model, model_path)

    gold_labels, test_feature_values = zip(*test)
    # The label slot is a placeholder (0) at classification time.
    test_pairs = [(0, feature_values)
                  for feature_values in test_feature_values]
    predictions = svmlight.classify(model, test_pairs)

    gold_is_def = [(gold > 0) for gold in gold_labels]
    predicted_is_def = [(prediction > 0) for prediction in predictions]
    correct, wrong = matches(gold_is_def, predicted_is_def)

    return {
        'total_articles_count': len(articles),  # int
        'total_token_count': total_token_count,  # int
        'train_count': len(train),  # int
        'test_count': len(test),  # int
        'kernel': kernel,
        'correct': correct,
        'wrong': wrong,
        'total': correct + wrong,
    }
def runSVMLight(trainName, testName, kerneltype, c_param=1.0,
                gamma_param=1.0, verbosity=0):
    """Train an SVM-Light classifier and score it on the test data.

    Data is converted to python format only if it is not already in
    python format (python-format data is a list, anything else is
    treated as a file name and read with sparseToList).

    inputs:
        trainName, either the training data in svm-light format or the
            name of the training data file in LIBSVM/sparse format
        testName, either the test data in svm-light format or the name
            of the test data file in LIBSVM/sparse format
        kerneltype, (str) the type of kernel
            (linear, polynomial, sigmoid, rbf, custom)
        c_param, the C parameter (default 1)
        gamma_param, the gamma parameter (default 1)
        verbosity, 0, 1, or 2 for less or more information (default 0);
            it is also forwarded to the learner
    outputs:
        whatever predictionCompare returns, documented as
        (positiveAccuracy, negativeAccuracy, accuracy)
    """
    if isinstance(trainName, list):
        trainingData = trainName
    else:
        trainingData = sparseToList(trainName)
    if isinstance(testName, list):
        testData = testName
    else:
        testData = sparseToList(testName)

    if verbosity == 2:
        print("Training svm.......")

    # train a model based on the data; honour the caller's verbosity
    # instead of always running the learner at level 2.
    model = svmlight.learn(trainingData, type='classification',
                           verbosity=verbosity, kernel=kerneltype,
                           C=c_param, rbf_gamma=gamma_param)

    # The model could be stored with svmlight.write_model(...) in the
    # format SVM-Light uses, for interoperability with the binaries;
    # that step is disabled here.

    if verbosity == 2:
        print("Classifying........")

    # classify the test data. this function returns a list of numbers,
    # which represent the classifications.
    predictions = svmlight.classify(model, testData)
    correctLabels = correctLabelRemove(testData)
    return predictionCompare(predictions, correctLabels, verbosity)
def performSVMClassification(trainingData, testData):
    """Train a presence-feature SVM and judge every test document.

    Documents are (sentiment, fileName, features) triples.  Returns a
    list of (predicted, sentiment, fileName) where predicted is "POS"
    when the classifier output is positive and "NEG" otherwise.
    """
    featureIndices = getFeatureIndices(trainingData + testData)

    def presenceVector(words):
        # Our baseline should always use presence: every distinct
        # feature gets value 1, regardless of how often it occurs.
        seen = {}
        for f in words:
            seen[f] = 1
        return sorted(((featureIndices[k], v) for k, v in seen.items()),
                      key=lambda x: x[0])

    formattedTrainingData = [
        (1 if doc[0] == "POS" else -1, presenceVector(doc[2]))
        for doc in trainingData
    ]
    model = svmlight.learn(formattedTrainingData)

    # The label slot is a placeholder (0) at classification time.
    formattedTestData = [(0, presenceVector(doc[2])) for doc in testData]
    judgements = svmlight.classify(model, formattedTestData)

    return [
        ("POS" if judgements[i] > 0 else "NEG", sentiment, fileName)
        for i, (sentiment, fileName, features) in enumerate(testData)
    ]
def my_cross_val_score(data_fold, train, c_p):
    """Score a linear ranking SVM on every fold of ``data_fold``.

    Each fold is a pair; both halves are expanded with collect_data_qid
    against ``train``.  The model is fitted on the first half with
    C=c_p and scored on the second half with my_accus.  Returns the
    list of per-fold scores.
    """
    def score_fold(fit_part, eval_part):
        fit_rows = collect_data_qid(fit_part, train)
        eval_rows = collect_data_qid(eval_part, train)
        ranker = SVC.learn(fit_rows, C=c_p, kernel='linear', type='ranking')
        outputs = SVC.classify(ranker, eval_rows)
        return my_accus(eval_rows, outputs)

    return [score_fold(x, y) for x, y in data_fold]
def trainAndTest(training, test):
    """Train on ``training`` and classify ``test``.

    Items expose dataTuple(); test items also expose .name and .label.
    Returns a list of (prediction, label, name) triples.
    """
    # The training names are never used, but might be someday.
    fit_rows = [item.dataTuple() for item in training]
    names = [item.name for item in test]
    eval_rows = [item.dataTuple() for item in test]
    labels = [item.label for item in test]

    classifier = svmlight.learn(fit_rows)
    outputs = svmlight.classify(classifier, eval_rows)
    return list(zip(outputs, labels, names))
def tsvm_test0():
    """Compare an inductive and a transductive SVM on one data set.

    The shuffled data is split 80/20.  The inductive model sees only
    the first part; the transductive model sees every row, with the
    targets of the last part set to 0.  Both are scored on the last
    part and the results are printed.
    """
    # data processing
    data, target = load_svmlight_file('dataset/following.scale')
    data, target = shuffle(data, target)
    target = binarize(target)[:, 0]
    cutoff = int(round(data.shape[0] * 0.8))

    train_data, train_target = data[:cutoff], target[:cutoff]
    test_data, test_target = data[cutoff:], target[cutoff:]

    # Work on a copy so the true test targets stay intact.
    transductive_target = target.copy()
    transductive_target[cutoff:] = 0

    # convert the data into svmlight format
    svm_train_data = npToSVMLightFormat(train_data, train_target)
    svm_transductive_train_data = npToSVMLightFormat(data,
                                                     transductive_target)
    svm_test_data = npToSVMLightFormat(test_data, test_target)
    print('labels in the training data')
    print(countLabels(svm_transductive_train_data).most_common())

    # svmlight routine
    # NOTE(review): confirm the installed binding accepts `j`.
    learn_options = dict(j=3.0, kernel='linear', type='classification',
                         verbosity=0)
    model = svmlight.learn(svm_train_data, **learn_options)
    trans_model = svmlight.learn(svm_transductive_train_data,
                                 **learn_options)
    predictions = svmlight.classify(model, svm_test_data)
    trans_predictions = svmlight.classify(trans_model, svm_test_data)

    def report(title, outputs):
        print(title)
        print(accuracy(outputs, test_target))
        print("%s %s" % ('(recall, precision)',
                         recall_precision(outputs, test_target)))

    report('inductive learning', predictions)
    report('transductive learning', trans_predictions)
def predict(self, X):
    """Retrain and return 0/1 predictions for the rows of X.

    When self.class_dist is set, the model is retrained on the stored
    training data plus the rows of X (given label 0), with
    self.class_dist[1] as the positive ratio; otherwise it is retrained
    on the stored training data only.  Positive outputs map to 1 and
    non-positive outputs to 0.
    """
    placeholder_labels = np.zeros(X.shape[0]).tolist()
    test_data = self.toSvmlight(X, placeholder_labels)
    all_data = self.train_data + test_data

    if self.class_dist:
        self.model = svmlight.learn(all_data, verbosity=1,
                                    transduction_posratio=self.class_dist[1])
    else:
        self.model = svmlight.learn(self.train_data)

    outputs = np.array(svmlight.classify(self.model, test_data))
    # Threshold in place; the second mask is evaluated after the first
    # assignment, exactly as the two statements are written.
    outputs[outputs > 0] = 1
    outputs[outputs <= 0] = 0
    return outputs
def train_svm(self, docs):
    """Train an SVM on doc2vec vectors inferred for each document.

    docs is a list of (pos/neg, filename, wordlist).  "POS" documents
    are labelled 1, everything else -1.  The model is stored in
    self.svm_model.
    """
    def to_row(doc):
        # Feature numbers are 1-based positions in the inferred vector.
        fv = list(enumerate(self.doc2vec_model.infer_vector(doc[2]), 1))
        return (1 if doc[0] == "POS" else -1, fv)

    self.svm_model = learn([to_row(doc) for doc in docs])
    print("*** SVM TRAINED ***")
def trainAndTest(training, test):
    """Train on ``training`` and classify ``test``.

    Items are indexable: index 0 is the name, index 1 the label and
    index 2 the feature data.  Returns a list of
    (prediction, label, name) triples.
    """
    # The training names (index 0) are never used, but might be someday.
    fit_rows = [(item[1], item[2]) for item in training]
    names = [item[0] for item in test]
    eval_rows = [(item[1], item[2]) for item in test]
    labels = [item[1] for item in test]

    classifier = svm.learn(fit_rows)
    outputs = svm.classify(classifier, eval_rows)
    return list(zip(outputs, labels, names))
def five_fold_validation(training_sets, validation_sets, c_value):
    """Return the mean validation accuracy over all folds.

    Fold i trains a classifier with C=c_value on training_sets[i] and
    scores it on validation_sets[i]; the first element of
    find_accuracy's result is what gets averaged.
    """
    total_accuracy = 0.0
    for fold, train_part in enumerate(training_sets):
        model = svmlight.learn(train_part, type='classification', C=c_value)
        held_out = validation_sets[fold]
        predictions = change_to_binary_predictions(
            svmlight.classify(model, held_out))
        total_accuracy += find_accuracy(held_out, predictions)[0]
    return total_accuracy / len(training_sets)
def train(self):
    """Learn model weights from training instances.

    Trains a ranking model on self._training_data, stores it in
    self._svmmodel, then writes it to a temporary file so that
    self._recover_weights can read the weights back.  The temporary
    file is removed even if writing or parsing fails.
    """
    # Train using svmlight
    self._svmmodel = svmlight.learn(self._training_data, type='ranking')

    # Write svmlight output to a temp file and recover weights.  Only
    # the path is needed, so release our own handle before svmlight
    # writes to it.
    modelout = NamedTemporaryFile(delete=False)
    modelout.close()
    try:
        svmlight.write_model(self._svmmodel, modelout.name)
        self._recover_weights(modelout.name)
    finally:
        remove(modelout.name)
def train(self, pos_word_lists, neg_word_lists):
    """Build word-count vectors for both classes and fit an SVM.

    Optional preprocessing, driven by instance flags:
      self.stemming - Porter-stem every word
      self.bigrams  - use adjacent word pairs; with self.unigrams the
                      single words are kept as well
      self.presence - count each distinct feature once per document
    Positive documents are labelled 1, negative ones -1.
    """
    if self.stemming:
        stem = PorterStemmer().stem
        pos_word_lists = [[stem(x) for x in l] for l in pos_word_lists]
        neg_word_lists = [[stem(x) for x in l] for l in neg_word_lists]

    if self.bigrams:
        keep_unigrams = self.unigrams

        def expand(docwords):
            pairs = list(zip(docwords, docwords[1:]))
            return pairs + docwords if keep_unigrams else pairs

        neg_word_lists = [expand(docwords) for docwords in neg_word_lists]
        pos_word_lists = [expand(docwords) for docwords in pos_word_lists]

    vocabulary = set(flatten(pos_word_lists) + flatten(neg_word_lists))

    # Hand every distinct feature the next free id.
    self.word_ids = {}
    for token in vocabulary:
        self.word_ids[token] = self.curr_id
        self.curr_id += 1

    features = []
    for label, word_lists in ((1, pos_word_lists), (-1, neg_word_lists)):
        for wordlist in word_lists:
            if self.presence:
                wordlist = set(wordlist)
            counts = Counter(wordlist)
            featureVec = sorted(
                ((self.word_ids.get(word), v) for word, v in counts.items()),
                key=lambda pair: pair[0])
            features.append((label, featureVec))

    self.model = svmlight.learn(features)
def my_cross_val_score(data_fold, train, c_p):
    """Return one my_accus score per fold for a linear ranking SVM.

    Every element of ``data_fold`` is a pair.  Both members are expanded
    through collect_data_qid(..., train); the first is used to fit the
    model (C=c_p), the second to evaluate it.
    """
    def evaluate(first, second):
        fit_rows = collect_data_qid(first, train)
        held_out = collect_data_qid(second, train)
        fitted = SVC.learn(fit_rows, C=c_p, kernel='linear', type='ranking')
        return my_accus(held_out, SVC.classify(fitted, held_out))

    return [evaluate(x, y) for x, y in data_fold]
def trainall():
    """Train one SVM per digit sample set, for the 10 digits 0-9.

    The model for digit i is written to "model/<i>".

    :return:
    """
    for digit in range(10):
        print("%s %s" % ("training ", digit))
        samples = totrain(digit)
        digit_model = svmlight.learn(samples, type="classification",
                                     verbosity=0)
        # write model
        svmlight.write_model(digit_model, "model/" + str(digit))
def svm(training_set, test_set):
    """Train an SVM on ``training_set`` and label every test document.

    Both sets hold (sentiment, file_name, features) triples.  Features
    are turned into (index, value) pairs where the value is 1 when
    options["usePresence"] is set and the occurrence count otherwise.
    The same encoding is applied to training and test documents.

    Returns a list of (predicted, sentiment, file_name) with predicted
    being "POS" for a positive classifier output and "NEG" otherwise.
    """
    feature_indices = get_feature_indices(training_set + test_set)

    def to_vector(features):
        # Shared by both sets so the test documents are encoded exactly
        # like the ones the model was trained on.
        feature_freqs = {}
        for w in features:
            if options["usePresence"] or w not in feature_freqs:
                feature_freqs[w] = 1
            else:
                feature_freqs[w] += 1
        return sorted(
            ((feature_indices[word], count)
             for word, count in feature_freqs.items()),
            key=lambda x: x[0])

    formatted_training_set = [
        (1 if sentiment == "POS" else -1, to_vector(features))
        for (sentiment, file_name, features) in training_set
    ]
    model = svmlight.learn(formatted_training_set)

    # The label slot is a placeholder (0) at classification time.
    formatted_test_set = [
        (0, to_vector(features))
        for (sentiment, file_name, features) in test_set
    ]
    predictions = svmlight.classify(model, formatted_test_set)

    return [
        ("POS" if predictions[idx] > 0 else "NEG", sentiment, file_name)
        for idx, (sentiment, file_name, features) in enumerate(test_set)
    ]
def train(featuresets):
    """
    given a set of training instances in nltk format:
    [ ( {feature:value, ..}, str(label) ) ]
    train a support vector machine

    :param featuresets: training instances
    :raises ValueError: unless exactly two distinct labels are present
    """
    _raise_if_svmlight_is_missing()

    # build a unique list of labels
    labels = set()
    for (features, label) in featuresets:
        labels.add(label)

    # this is a binary classifier only: more than two labels cannot be
    # mapped, and fewer than two leave nothing to separate.
    if len(labels) != 2:
        raise ValueError('Can only do boolean classification (labels: ' +
                         str(labels) + ')')

    # we need ordering, so a set's no good
    labels = list(labels)

    # next, assign -1 and 1
    labelmapping = {labels[0]: -1, labels[1]: 1}

    # now for feature conversion
    # iter through instances, building a set of feature:type:str(value)
    # triples
    svmfeatures = set()
    for (features, label) in featuresets:
        for k, v in compat.iteritems(features):
            svmfeatures.add(featurename(k, v))
    # svmfeatures is indexable by integer svm feature number
    # svmfeatureindex is the inverse (svm feature name -> number)
    svmfeatures = list(svmfeatures)
    svmfeatureindex = dict(zip(svmfeatures, range(len(svmfeatures))))

    # build svm feature set case by case
    svmfeatureset = [
        map_instance_to_svm(instance, labelmapping, svmfeatureindex)
        for instance in featuresets
    ]

    # train the svm
    # TODO: implement passing of SVMlight parameters from train() to learn()
    return SvmClassifier(
        labels, labelmapping, svmfeatures,
        svmlight.learn(svmfeatureset, type='classification'))
def train(fnames, topics):
    """Build the training data, dump it, and train a linear ranking SVM.

    The pretty-printed training data goes to the file named by
    TRAINING_DATA, the model to 'ef_model.dat'; finally ZC.dump_cache()
    is called.
    """
    training_data = init_train_data(fnames, topics)
    print('[ train ] ===================')

    with open(TRAINING_DATA, 'w') as dump:
        pprint.pprint(training_data, dump)

    # train a model based on the data
    learn_options = dict(type='ranking', kernel='linear', verbosity=0)
    ranker = svmlight.learn(training_data, **learn_options)

    # model data can be stored in the same format SVM-Light uses, for
    # interoperability with the binaries.
    svmlight.write_model(ranker, 'ef_model.dat')
    ZC.dump_cache()
def svm(train_docs, train_labels, test_docs, params):
    """Train an SVM and return 0/1 labels for ``test_docs``.

    params is a (kernel, value) pair: kernel 'rbf' uses value as
    rbf_gamma; 'poly' / 'polynomial' uses it as poly_degree.  Any other
    kernel raises ValueError.  Training labels equal to 1 stay 1 and
    all others become -1.
    """
    kernel, param = params
    train_docs_svm = to_svmlight_format(
        train_docs, (1 if l == 1 else -1 for l in train_labels))
    # Test rows get a placeholder label of 0.
    test_docs_svm = to_svmlight_format(
        test_docs, np.zeros(test_docs.shape[0]))

    if kernel == 'rbf':
        kernel_options = {'kernel': 'rbf', 'rbf_gamma': param}
    elif kernel in ('poly', 'polynomial'):
        kernel_options = {'kernel': 'polynomial', 'poly_degree': param}
    else:
        raise ValueError('Unsupported svm parameters: ' + str(params))

    model = svmlight.learn(train_docs_svm, type='classification',
                           **kernel_options)
    margins = svmlight.classify(model, test_docs_svm)
    return [1 if margin > 0 else 0 for margin in margins]
def SVM_experiment(data_fold, train, test, dumper):
    """Grid-search C, train a linear ranking SVM and write its ranking.

    Candidate C values are 2**-15 .. 2**14.  The chosen value is
    reported through ``dumper.write``; the ranking of ``test`` is
    written to 'svm.ranking' (UTF-8).  Always returns None.
    """
    param = {'C': [pow(2, i) for i in range(-15, 15)]}
    c_best = my_GridSearchCV(data_fold, train, param)
    dumper.write("Classifier: SVM\n")
    dumper.write('Best Parameters: %f' % (c_best))

    model = SVC.learn(train, C=c_best, kernel='linear', type='ranking')
    ret = mySVM(model)
    pred = ranking_test(ret, test)

    # Close the output file deterministically instead of leaving the
    # handle to the garbage collector.
    with codecs.open('svm.ranking', 'w', 'utf-8') as ranking_file:
        output_ranking(pred, ranking_file)
    return None
def SVM_experiment(data_fold, train, test, dumper):
    """Grid-search C, train a linear ranking SVM and write its ranking.

    Candidate C values are 2**-15 .. 2**14.  The chosen value is
    reported through ``dumper.write``; the ranking of ``test`` is
    written to 'svm.ranking' (UTF-8).  Always returns None.
    """
    param = {'C': [pow(2, i) for i in range(-15, 15)]}
    c_best = my_GridSearchCV(data_fold, train, param)
    dumper.write("Classifier: SVM\n")
    dumper.write('Best Parameters: %f' % (c_best))

    model = SVC.learn(train, C=c_best, kernel='linear', type='ranking')
    ret = mySVM(model)
    pred = ranking_test(ret, test)

    # Close the output file deterministically instead of leaving the
    # handle to the garbage collector.
    with codecs.open('svm.ranking', 'w', 'utf-8') as ranking_file:
        output_ranking(pred, ranking_file)
    return None
def find_models(examples, c_value):
    '''
    For each class of example article, create a model. These models will
    be used to determine the likelihood that a test example comes from
    each class's source.

    The class of an example is its first element.  Returns a dict
    mapping each class to a classifier trained (with C=c_value) on the
    examples relabelled by change_to_binary_examples for that class.
    '''
    models = {}
    for example in examples:
        class_num = example[0]
        if class_num in models:
            continue  # this class already has its model
        train = change_to_binary_examples(examples, class_num)
        models[class_num] = svmlight.learn(train, type='classification',
                                           C=c_value)
    return models
def training_model(ind, n=3):
    """Train a classifier on n-gram features of the pos/neg documents.

    ind: object whose get_pos_train_ind() / get_neg_train_ind() give
        positions into os.listdir("pos") / os.listdir("neg").
    n: n-gram order handed to load_features and ngrams.ngrams.

    Documents under "pos" are labelled 1 and those under "neg" -1.  Only
    n-grams known to the module-level ``fmap`` are kept.  The model is
    written to 'my_model.dat' and returned.
    """
    print("Loading features")
    load_features(n, fmap)
    print("Feature map size: %s" % fmap.getSize())

    def labelled_examples(dirname, indices, label):
        # List the directory once instead of once per document.
        names = os.listdir(dirname)
        rows = []
        for i in indices:
            # Read through a context manager so the handle is closed.
            with open(dirname + "/" + names[i]) as handle:
                text = handle.read()
            rows.append((label, [
                (fmap.getID(gram), count)
                for gram, count in ngrams.ngrams(n, text).items()
                if fmap.hasFeature(gram)
            ]))
        return rows

    print("Getting training data")
    train = labelled_examples("pos", ind.get_pos_train_ind(), 1)
    train.extend(labelled_examples("neg", ind.get_neg_train_ind(), -1))

    print("Training model")
    model = svmlight.learn(train, type='classification', verbosity=0)
    svmlight.write_model(model, 'my_model.dat')
    return model
def train(fnames, topics):
    """Create the training data, save a dump of it, and fit a ranking SVM.

    Side effects: pretty-prints the training data into the file named
    by TRAINING_DATA, writes the linear ranking model to 'ef_model.dat'
    and calls ZC.dump_cache().
    """
    training_data = init_train_data(fnames, topics)
    print('[ train ] ===================')

    with open(TRAINING_DATA, 'w') as dump_file:
        pprint.pprint(training_data, dump_file)

    # train a model based on the data
    settings = {'type': 'ranking', 'kernel': 'linear', 'verbosity': 0}
    fitted = svmlight.learn(training_data, **settings)

    # model data can be stored in the same format SVM-Light uses, for
    # interoperability with the binaries.
    svmlight.write_model(fitted, 'ef_model.dat')
    ZC.dump_cache()
def train(featuresets):
    """
    given a set of training instances in nltk format:
    [ ( {feature:value, ..}, str(label) ) ]
    train a support vector machine

    :param featuresets: training instances
    :raises ValueError: unless exactly two distinct labels are present
    """
    _raise_if_svmlight_is_missing()

    # build a unique list of labels
    labels = set()
    for (features, label) in featuresets:
        labels.add(label)

    # this is a binary classifier only: more than two labels cannot be
    # mapped, and fewer than two leave nothing to separate.
    if len(labels) != 2:
        raise ValueError('Can only do boolean classification (labels: ' +
                         str(labels) + ')')

    # we need ordering, so a set's no good
    labels = list(labels)

    # next, assign -1 and 1
    labelmapping = {labels[0]: -1, labels[1]: 1}

    # now for feature conversion
    # iter through instances, building a set of feature:type:str(value)
    # triples
    svmfeatures = set()
    for (features, label) in featuresets:
        for k, v in compat.iteritems(features):
            svmfeatures.add(featurename(k, v))
    # svmfeatures is indexable by integer svm feature number
    # svmfeatureindex is the inverse (svm feature name -> number)
    svmfeatures = list(svmfeatures)
    svmfeatureindex = dict(zip(svmfeatures, range(len(svmfeatures))))

    # build svm feature set case by case
    svmfeatureset = [
        map_instance_to_svm(instance, labelmapping, svmfeatureindex)
        for instance in featuresets
    ]

    # train the svm
    # TODO: implement passing of SVMlight parameters from train() to learn()
    return SvmClassifier(
        labels, labelmapping, svmfeatures,
        svmlight.learn(svmfeatureset, type='classification'))
def fit(self, train_x, train_y, unlabeled_x=None):
    """Train one RBF-kernel binary SVM per class (one-vs-rest).

    train_x: labelled samples, one row per sample.
    train_y: integer class ids; classes 0 .. max(train_y) are trained.
    unlabeled_x: optional extra samples, added to every binary problem
        with label 0.

    If self.rbf_gamma is 0 it is replaced by 1 / number of columns of
    train_x.  The models are stored in self.models (index = class id)
    and self.fitted is set.
    """
    if self.rbf_gamma == 0:
        self.rbf_gamma = 1. / train_x.shape[1]
    n_y = int(np.max(train_y)) + 1
    self.models = []

    feats = toSVMLightFeatures(train_x)
    # `is not None` rather than `!= None`: the latter compares
    # element-wise when an array is passed.  Without unlabeled data
    # there is simply nothing to add.
    if unlabeled_x is not None:
        feats_unlabeled = toSVMLightFeatures(unlabeled_x)
    else:
        feats_unlabeled = []
    unlabeled_examples = [(0, feat) for feat in feats_unlabeled]

    for class_id in range(n_y):
        # +1 for samples of this class, -1 for every other class.
        train_y_binary = (train_y == class_id) * 2 - 1
        examples = [(train_y_binary[j], feat)
                    for j, feat in enumerate(feats)]
        examples.extend(unlabeled_examples)
        _model = svmlight.learn(examples, type='classification',
                                kernel='rbf', C=self.C,
                                rbf_gamma=self.rbf_gamma)
        self.models.append(_model)
    self.fitted = True
def train(self, label_values, converted=False):
    """Build a model.

    Args:
        label_values: Iterable of tuples of label and list-like objects
            Example: [(label, value), ...]
            or the result of using convert_label_values if converted=True.
        converted: If True then the input is in the correct internal format

    Returns:
        self
    """
    if converted:
        prepared = label_values
    else:
        prepared = self.convert_label_values(label_values)
    # The learner is always handed a real list.
    if not isinstance(prepared, list):
        prepared = list(prepared)
    self._m = svmlight.learn(prepared, type='classification', verbosity=1)
    return self
def __makeModel(self):
    """Train one binary classifier per known name.

    For every name in the label list, each image is a positive example
    (1) when the label of its file-name prefix (the part before the
    first "_") equals that name's label, and a negative example (-1)
    otherwise.  The (name, model) pairs are stored in self.__models.
    """
    self.labelList = Labels()
    self.labelList.makeAllLabels(self.__imgdata.namelist)
    self.__models = list()

    for name in self.labelList.getLabellist():
        label = self.labelList.name2label(name)
        traindata = []
        for imageidx, imageweights in enumerate(self.__weights):
            prefix = self.__imgdata.namelist[imageidx].partition("_")[0]
            facelabel = self.labelList.name2label(prefix)
            example = 1 if facelabel == label else -1
            traindata.append(
                (example, self.__makeWeightTuplesList(imageweights)))
        temp_model = svmlight.learn(traindata, type='classification',
                                    verbosity=3)
        self.__models.append((name, temp_model))
def test_svmlight():
    """Smoke test: train on toy data, save the model, print predictions.

    The model is written to 'my_model.dat'; one line per test example
    is printed with eight decimals.
    """
    positives = [
        [(1, 2), (2, 5), (3, 6), (5, 1), (4, 2), (6, 1)],
        [(1, 2), (2, 1), (3, 4), (5, 3), (4, 1), (6, 1)],
        [(1, 2), (2, 2), (3, 4), (5, 1), (4, 1), (6, 1)],
        [(1, 2), (2, 1), (3, 3), (5, 1), (4, 1), (6, 1)],
    ]
    negatives = [
        [(1, 2), (2, 1), (3, 1), (5, 3), (4, 2), (6, 1)],
        [(1, 1), (2, 1), (3, 1), (5, 3), (4, 1), (6, 1)],
        [(1, 1), (2, 2), (3, 1), (5, 3), (4, 1), (6, 1)],
        [(1, 1), (2, 1), (3, 1), (5, 1), (4, 3), (6, 1)],
        [(1, 2), (2, 1), (3, 1), (5, 2), (4, 1), (6, 5)],
        [(7, 10)],
    ]
    training_data = ([(1, row) for row in positives] +
                     [(-1, row) for row in negatives])

    # Test rows carry a placeholder label of 0.
    test_data = [
        (0, [(1, 2), (2, 6), (3, 4), (5, 1), (4, 1), (6, 1)]),
        (0, [(1, 2), (2, 6), (3, 4)]),
    ]

    model = svmlight.learn(training_data, type='classification', verbosity=0)
    svmlight.write_model(model, 'my_model.dat')
    for output in svmlight.classify(model, test_data):
        print('%.8f' % output)
def test_svmlight():
    """Train a classifier on a small fixed data set and print its output.

    Saves the model as 'my_model.dat' and prints the classifier value
    of each of the two test examples with eight decimals.
    """
    labelled_rows = (
        (1, [(1, 2), (2, 5), (3, 6), (5, 1), (4, 2), (6, 1)]),
        (1, [(1, 2), (2, 1), (3, 4), (5, 3), (4, 1), (6, 1)]),
        (1, [(1, 2), (2, 2), (3, 4), (5, 1), (4, 1), (6, 1)]),
        (1, [(1, 2), (2, 1), (3, 3), (5, 1), (4, 1), (6, 1)]),
        (-1, [(1, 2), (2, 1), (3, 1), (5, 3), (4, 2), (6, 1)]),
        (-1, [(1, 1), (2, 1), (3, 1), (5, 3), (4, 1), (6, 1)]),
        (-1, [(1, 1), (2, 2), (3, 1), (5, 3), (4, 1), (6, 1)]),
        (-1, [(1, 1), (2, 1), (3, 1), (5, 1), (4, 3), (6, 1)]),
        (-1, [(1, 2), (2, 1), (3, 1), (5, 2), (4, 1), (6, 5)]),
        (-1, [(7, 10)]),
    )
    training_data = list(labelled_rows)

    # Label 0 is a placeholder for rows that are only classified.
    unlabelled_rows = (
        [(1, 2), (2, 6), (3, 4), (5, 1), (4, 1), (6, 1)],
        [(1, 2), (2, 6), (3, 4)],
    )
    test_data = [(0, row) for row in unlabelled_rows]

    model = svmlight.learn(training_data, type='classification', verbosity=0)
    svmlight.write_model(model, 'my_model.dat')
    predictions = svmlight.classify(model, test_data)
    for value in predictions:
        print('%.8f' % value)
def performDoc2VecJudgement(trainingData, testData):
    """Classify test documents with an SVM over doc2vec vectors.

    Documents are (sentiment, fileName, features) triples; a vector is
    inferred from the features of each one.  'POS' training documents
    are labelled 1 and the rest -1.  Returns a list of
    (judgement, sentiment, fileName), where judgement is the raw
    classifier output.
    """
    doc2vecModel = Doc2Vec.load(
        "/Users/Matteo/Desktop/doc2vec_models/final_model")

    def toSvmlightRow(label, vector):
        # Feature numbers are 1-based positions in the vector.
        return (label, [(i + 1, f) for i, f in enumerate(vector)])

    # All training vectors are inferred before any test vector.
    formattedTrainingFeatureVectors = [
        toSvmlightRow(1 if doc[0] == 'POS' else -1,
                      doc2vecModel.infer_vector(doc[2]))
        for doc in trainingData
    ]
    formattedTestFeatureVectors = [
        toSvmlightRow(0, doc2vecModel.infer_vector(doc[2]))
        for doc in testData
    ]

    svmModel = svmlight.learn(formattedTrainingFeatureVectors)
    judgements = svmlight.classify(svmModel, formattedTestFeatureVectors)

    return [
        (judgements[i], sentiment, fileName)
        for i, (sentiment, fileName, features) in enumerate(testData)
    ]
def fit(self, train_x, train_y, unlabeled_x=None):
    """Train one RBF-kernel binary SVM per class (one-vs-rest).

    train_x: labelled samples, one row per sample.
    train_y: integer class ids; classes 0 .. max(train_y) are trained.
    unlabeled_x: optional extra samples, added to every binary problem
        with label 0.

    If self.rbf_gamma is 0 it is replaced by 1 / number of columns of
    train_x.  The models are stored in self.models (index = class id)
    and self.fitted is set.
    """
    if self.rbf_gamma == 0:
        self.rbf_gamma = 1. / train_x.shape[1]
    n_y = int(np.max(train_y)) + 1
    self.models = []

    feats = toSVMLightFeatures(train_x)
    # `is not None` rather than `!= None`: the latter compares
    # element-wise when an array is passed.  Without unlabeled data
    # there is simply nothing to add.
    if unlabeled_x is not None:
        feats_unlabeled = toSVMLightFeatures(unlabeled_x)
    else:
        feats_unlabeled = []
    unlabeled_examples = [(0, feat) for feat in feats_unlabeled]

    for class_id in range(n_y):
        # +1 for samples of this class, -1 for every other class.
        train_y_binary = (train_y == class_id) * 2 - 1
        examples = [(train_y_binary[j], feat)
                    for j, feat in enumerate(feats)]
        examples.extend(unlabeled_examples)
        _model = svmlight.learn(examples, type='classification',
                                kernel='rbf', C=self.C,
                                rbf_gamma=self.rbf_gamma)
        self.models.append(_model)
    self.fitted = True
def __makeModel(self):
    """Build a one-vs-rest classifier for every name in the label list.

    An image counts as a positive example (1) for a name when the label
    of its file-name prefix (text before the first "_") matches the
    label of that name; otherwise it is a negative example (-1).
    Results are kept in self.__models as (name, model) pairs.
    """
    self.labelList = Labels()
    self.labelList.makeAllLabels(self.__imgdata.namelist)
    self.__models = list()

    for name in self.labelList.getLabellist():
        label = self.labelList.name2label(name)
        traindata = list()
        for imageidx, imageweights in enumerate(self.__weights):
            image_name = self.__imgdata.namelist[imageidx]
            facelabel = self.labelList.name2label(
                image_name.partition("_")[0])
            sign = 1 if facelabel == label else -1
            traindata.append(
                (sign, self.__makeWeightTuplesList(imageweights)))
        temp_model = svmlight.learn(traindata, type='classification',
                                    verbosity=3)
        self.__models.append((name, temp_model))
def fit(self, X, y, unlabeled_data=None):
    """Train one binary classifier per label column.

    X: labelled samples (dense or scipy sparse), one row per sample.
    y: labels for the rows of X, binarized to +1 / -1 columns.
    unlabeled_data: optional extra rows; they are stacked below X and
        given target 0 in every column.  May be omitted.

    Sets self._label_binarizer, self.num_classes_ and self.model_
    (one model per column).
    """
    num_labeled = X.shape[0]

    # Stack labelled and (if any) unlabeled rows into one matrix.
    blocks = (X,) if unlabeled_data is None else (X, unlabeled_data)
    if issparse(X):
        X = vstack(blocks, format='csr')
    else:
        X = np.concatenate(blocks)
    num_data = X.shape[0]

    self._label_binarizer = LabelBinarizer(pos_label=1, neg_label=-1)
    Y_labeled = self._label_binarizer.fit_transform(y)
    self.num_classes_ = Y_labeled.shape[1]

    # Rows past the labelled block keep the initial target of 0.
    Y = np.zeros((num_data, self.num_classes_), dtype=np.float32)
    Y[:num_labeled] = Y_labeled

    self.model_ = []
    for i in range(self.num_classes_):
        y_column = Y[:, i]
        self.model_.append(
            svm.learn(self.__data2docs(X, y_column),
                      type='classification'.encode()))
def main_svmlight(): # copied: import svmlight import pdb training_data = syntheticData(30, 1) test_data = syntheticData(30, 1) #training_data = __import__('data').train0 #test_data = __import__('data').test0 print 'HERE 0' print 'training_data is', training_data print 'test_data is', test_data # train a model based on the data #pdb.set_trace() print 'HERE 1' model = svmlight.learn(training_data, type='regression', kernelType=2, verbosity=3) print 'HERE 2' # model data can be stored in the same format SVM-Light uses, for interoperability # with the binaries. svmlight.write_model(model, 'my_model.dat') print 'HERE 3' # classify the test data. this function returns a list of numbers, which represent # the classifications. #predictions = svmlight.classify(model, test_data) pdb.set_trace() predictions = svmlight.classify(model, training_data) print 'HERE 4' for p, example in zip(predictions, test_data): print 'pred %.8f, actual %.8f' % (p, example[0])
nskipped = 1 if len(sentences) > 1: # because there have to be transitions docModel = DummyDocModel(sentences) grid = TextrazorEntityGrid(docModel.cleanSentences(), 1, textrazorEntities, textrazorSentences) if grid.valid and len(grid.matrixIndices) > 0: grid.printMatrix() featureVector = FeatureVector(grid, clusterIndex) featureVector.printVector() featureVector.printVectorWithIndices() vector = featureVector.getVector(qualityScore) featureVectors.append(vector) docIndex += 1 else: print "SKIPPING (not enough sentences) %s, nskipped=(%d)" % (fileName, nskipped) nskipped += 1 else: print "SKIPPING (no pickle file)%s, nskipped=(%d)" % (fileName, nskipped) nskipped += 1 # pickleFile = open("../cache/svmlightCache/featureVectors.pickle", 'wb') # pickle.dump(featureVectors, pickleFile, pickle.HIGHEST_PROTOCOL) # pickleFile.close() # if docIndex >= maxN: # break numDocsTried += 1 clusterIndex += 1 # now train on the data model = svmlight.learn(featureVectors, type='ranking', verbosity=0) svmlight.write_model(model, '../cache/svmlightCache/svmlightModel.dat')
def svm():
    """Train and run an SVM on summed per-review sentiment scores.

    Loads a ``(word, pos) -> (posScore, negScore)`` dictionary and a list of
    ``(review, sentiment)`` pairs, sums the word scores of every review
    (swapping positive/negative for negated words), turns each review into
    a two-feature example, trains on the first three quarters of every
    sentiment class and classifies the remaining quarter. The model is
    written to ``my_model1.dat``.

    Returns:
        None.
    """
    # NOTE(security): pickle.load executes arbitrary code from the file;
    # only use pickle files produced by this project.
    with open('sentiment_score.pickle', 'rb') as score_file:
        synDict = pickle.load(score_file)
    with open('sent_400_wspos.pickle', 'rb') as annot_file:
        annot = pickle.load(annot_file)
    print(annot)

    # 0 -> pos, 1 -> neg, 2 -> both, 3 -> neut
    data = {'pos': [], 'neg': [], 'both': [], 'neut': []}
    strToNum = {'pos': 0, 'neg': 1, 'both': 2, 'neut': 3}

    for line, sent in annot:
        words = line.split()
        # Plain text of the review: the part of each token before '#'.
        plain = ' '.join(word.split('#')[0] for word in words).strip(' ')
        neg = negate.negating(plain)
        # catch empty case, simpler than re-pickling
        if neg == []:
            continue

        # accumulate (posScore, negScore) over the words of the review
        score = (0, 0)
        for i, word in enumerate(words):
            tri = word.split('#')
            tempscore = (0, 0)
            if len(tri) == 3:
                pair = (tri[0] + '#' + tri[2], tri[1])
                tempscore = synDict.get(pair, (0, 0))
            if 'NOT' in neg[i]:
                # negated word: positive and negative scores trade places
                tempscore = (tempscore[1], tempscore[0])
            score = (score[0] + tempscore[0], score[1] + tempscore[1])
        data[sent.strip(' ')].append(score)

    # convert to feature lists: one list of (label, features) per class
    # NOTE(review): labels are 0-3 while type='classification' below is a
    # binary learner in SVM-light -- confirm this is intended.
    featList = [
        [(strToNum[key], [(1, pos_score), (2, neg_score)])
         for pos_score, neg_score in data[key]]
        for key in data
    ]

    # first 3/4 of every class goes to train, the rest to test
    train = []
    test = []
    for feats in featList:
        cut = 3 * len(feats) // 4
        train += feats[:cut]
        test += feats[cut:]
    for element in train:
        print(element)

    # train and test model
    model = svmlight.learn(train, type='classification', verbosity=0)
    svmlight.write_model(model, 'my_model1.dat')
    # Predictions are computed but not reported.
    predictions = svmlight.classify(model, test)
def run_svm(article_count, feature_functions, kernel='polynomial', split=0.9, model_path='svm.model'):
    """Train and evaluate a DEF-vs-INDEF SVM classifier on WSJ tokens.

    Uses pysvmlight (https://bitbucket.org/wcauchois/pysvmlight). Every
    token tagged 'DEF' or 'INDEF' becomes one example (+1 for 'DEF', -1 for
    'INDEF') whose features are the sorted, de-duplicated dictionary ids of
    its token features, each with value 1. The data is shuffled and split,
    a model is trained on the first part, written to ``model_path`` and
    scored on the second part.

    Returns:
        dict with the keys ``total_articles_count``, ``total_token_count``,
        ``train_count``, ``test_count``, ``kernel``, ``correct``, ``wrong``
        and ``total`` (``correct + wrong``).
    """
    articles, total_token_count = preprocess_wsj(article_count, feature_functions)

    dictionary = Dictionary()
    # Occupy the first slot with a dummy so that no features are labeled 0.
    dictionary.add_one('ZZZZZ')

    data = []
    for article in articles:
        for sentence in article:
            for tag, token_features in zip(sentence.def_tags, sentence.data):
                # only use def / indef tokens
                if tag not in ('DEF', 'INDEF'):
                    continue
                feature_ids = sorted(set(dictionary.add(token_features)))
                label = +1 if tag == 'DEF' else -1
                data.append(
                    (label, [(feature_id, 1) for feature_id in feature_ids]))

    train, test = bifurcate(data, split, shuffle=True)

    # svmlight.learn options:
    #   type: 'classification', 'regression', 'ranking' (preference
    #         ranking) or 'optimization'
    #   kernel: 'linear', 'polynomial', 'rbf' or 'sigmoid'
    #   verbosity: verbosity level (default 0)
    #   C: trade-off between training error and margin
    #   poly_degree: parameter d in polynomial kernel
    #   rbf_gamma: parameter gamma in rbf kernel
    #   coef_lin, coef_const
    #   costratio: corresponds to the -j option of svm_learn
    model = svmlight.learn(train, type='classification', kernel=kernel)
    svmlight.write_model(model, model_path)

    gold_labels, test_feature_values = zip(*test)
    # Hide the gold label from the classifier by passing 0 instead.
    test_pairs = [(0, feature_values) for feature_values in test_feature_values]
    predictions = svmlight.classify(model, test_pairs)

    gold_positive = [(gold > 0) for gold in gold_labels]
    predicted_positive = [(prediction > 0) for prediction in predictions]
    correct, wrong = matches(gold_positive, predicted_positive)

    return {
        'total_articles_count': len(articles),  # int
        'total_token_count': total_token_count,  # int
        'train_count': len(train),  # int
        'test_count': len(test),  # int
        'kernel': kernel,
        'correct': correct,
        'wrong': wrong,
        'total': correct + wrong,
    }
def fit(self, data, target):
    """Train a linear SVM-light classifier and keep it on ``self.model``.

    ``data`` and ``target`` are converted to SVM-light's example format by
    ``npToSVMLightFormat`` before training.
    """
    learn_options = dict(j=1.0, kernel='linear', type='classification',
                         verbosity=0)
    examples = npToSVMLightFormat(data, target)
    self.model = svmlight.learn(examples, **learn_options)
for d, line in enumerate(f): line = line.strip().split(',') words = [int(x.split(':')[0]) + 1 for x in line[1:]] sort = np.argsort(words) counts = [int(x.split(':')[1]) for x in line[1:]] text.append(zip(np.array(words)[sort], np.array(counts)[sort])) text = np.array(text) def save_classifier(clf, clf_i): directory = args.output_directory if not os.path.exists(directory): os.makedirs(directory) svmlight.write_model(clf, os.path.join(directory, str(clf_i))) ######################################### ## Train SVM classifiers classifiers = {} for i in range(I): if i != root and len(inv_codes[i]) > 0: print i, 'of', I clf_child = -np.ones(D, np.int) clf_child[np.array(inv_codes[i])] = 1 clf_text = text[np.array(inv_codes[CtoP[i]])] clf_code = clf_child[np.array(inv_codes[CtoP[i]])] training = zip(clf_code, clf_text) clf = svmlight.learn(training, type='classification') save_classifier(clf, i)
def train_binary(self, x, y, model_path='tsvm_mnist.dat'):
    """Train a binary RBF-kernel SVM and write the model to disk.

    Args:
        x: training samples, converted together with ``y`` by
            ``svmlfeaturisexy``.
        y: targets for ``x``.
        model_path: file the trained model is written to. Defaults to
            ``'tsvm_mnist.dat'``, the path that used to be hard-coded, so
            existing callers are unaffected.

    The regularisation constant and kernel width are read from ``self.C``
    and ``self.gamma``. The model is only written to ``model_path``; it is
    neither returned nor stored on the instance.
    """
    train_data_svml = svmlfeaturisexy(x, y)
    model = svmlight.learn(train_data_svml, type='classification',
                           verbosity=0, kernel='rbf', C=self.C,
                           rbf_gamma=self.gamma)
    svmlight.write_model(model, model_path)
return prepared_data folds = get_n_folds(10) for i, test_fold in enumerate(folds): print("Fold: " + str(i)) training = merge_all_folds_except(i, folds) features_mapping = get_features(training) svm_train = prepare_data_for_svm(training, features_mapping) print("Train Ratio = " + str(get_neg_proportion(training))) print("Test Ratio = " + str(get_neg_proportion(test_fold))) print(len(training)) print(len(test_fold)) model = svmlight.learn(svm_train, type='classification') svm_test = prepare_data_for_svm(test_fold, features_mapping) svm_test_with_unknown_class = [(0, features) for _, features in svm_test] # predictions = svmlight.classify(model, svm_test_with_unknown_class) pos_count = 0 # for i,p in enumerate(predictions): # truth = svm_test[i][0] # if truth*p > 0: # print("Correct: %.8f" % p) # pos_count += 1 # else : # print("Incorre: %.8f" % p) # print(pos_count) # model data can be stored in the same format SVM-Light uses, for
val = int(float(counter) / len(filenames) * 100) if val in percentages and percentages[val]: print " Progress: %i %s" % (val, "%") percentages[val] = False try: source = open(directory + filename, 'r') train_type = int(source.readline()) train_num_dimensions = int(source.readline()) train_dimensions = source.readline().strip().split() source.close() num = 1 vals = [] for val in train_dimensions: vals.append((num, float(val))) num += 1 training_data.append((train_type, vals)) except Exception as e: print "ERROR:", e break counter += 1 print "Imported:", len(training_data), "\n" print "Building Model" model = svmlight.learn(training_data, type='classification', verbosity=0) print "Write Model" svmlight.write_model(model, 'svm-model.dat')
elif not standard: data_test = np.array(list(data_test)) nsamples, nx = data_test.shape data_test = data_test.reshape((nsamples, nx)) dump_svmlight_file(data_test, target_test, 'aux/test_' + ts + '.txt') train = svm_parse('aux/train_' + ts + '.txt') aux = svm_parse('aux/test_' + ts + '.txt') test, val = adapt_to_svmlight_format(aux) print("Training it=", it, "cost-factor=", cost_factor + 1) model = svmlight.learn(list(train), type='classification', verbosity=0, costratio=cost_factor + 1) ## costratio = cost-factor if dump == "yes": svmlight.write_model( model, "models/model_" + dataset + "_" + features + "_it" + str(it) + "_cost_fact" + str(cost_factor + 1) + "_" + ts + ".dat") predictions = svmlight.classify(model, test) print("Predicting it=", it, "cost-factor=", cost_factor + 1) tp, tn, fp, fn = evaluate(predictions) accuracies.append( weighted_accuracy(cost_factor + 1, tn, tp, fn, fp) * 100)