class RankSVM:
    """Python interface to the ranking SVM in svm_light.

    Typical use is ``train(features, grades)`` followed by
    ``grade(features)``.  Training stores the learned model, the empirical
    grade distribution and a ``Curve`` built from the training scores.
    """

    def __init__(self, **kwargs):
        """Create an untrained ranker; keyword arguments are accepted and ignored."""
        self.model = None

    @staticmethod
    def _feature_list(feat_vec):
        """Return one feature row as svmlight ``(index, value)`` pairs, indices starting at 1."""
        return [(feat_ind + 1, feat_val) for feat_ind, feat_val in enumerate(feat_vec)]

    @staticmethod
    def _grade_distribution(grades, num_essays):
        """Return ``{grade: occurrences / num_essays}`` for the given grades."""
        grade_counts = {}
        for grade in grades:
            grade_counts[grade] = grade_counts.get(grade, 0) + 1
        # .items() (not the Python-2-only .iteritems()) works on Python 2 and 3.
        return {grade: count / float(num_essays)
                for grade, count in grade_counts.items()}

    def train(self, features, grades):
        """Train a rank SVM on the specified features and grades.

        features - 2-D array with one row per essay (must expose ``.shape``
                   and ``[row, :]`` indexing, e.g. a numpy array)
        grades   - sequence with one entry per essay

        Sets ``min_grade``, ``max_grade``, ``model``, ``grade_probs`` and
        ``curve`` on the instance.
        """
        self.min_grade = min(grades)
        self.max_grade = max(grades)
        # Unpacking also enforces that ``features`` is two-dimensional.
        num_essays, _num_features = features.shape

        # Convert data into svmlight format
        # [(label, [(feature, value), ...], query_id), ...].
        # Every essay is given query id 1.
        training_data = [
            (grade, self._feature_list(features[essay_ind, :]), 1)
            for essay_ind, grade in enumerate(grades)
        ]
        self.model = svmlight.learn(training_data, type='ranking', verbosity=0, C=100)

        # Build a curve from the training-set scores and the observed grade
        # frequencies (Curve is defined outside this class).
        self.grade_probs = self._grade_distribution(grades, num_essays)
        scores = self.classify_rank_svm(features)
        self.curve = Curve(scores, probs=self.grade_probs)

    def grade(self, features, options=None):
        """Score the essays and pass each score through the trained curve.

        ``options`` is accepted for interface compatibility and is not used.
        Returns a list with one ``self.curve.curve(score)`` result per essay.
        """
        scores = self.classify_rank_svm(features)
        return [self.curve.curve(score) for score in scores]

    def classify_rank_svm(self, features):
        """Run the rank SVM on the specified essay features (one row per essay).

        Returns the scores produced by ``svmlight.classify`` for the essays.
        Raises AssertionError if the model has not been trained yet.
        """
        # Explicit raise instead of ``assert`` so the guard survives ``python -O``.
        if self.model is None:
            raise AssertionError("RankSVM.train must be called before classifying")

        # Convert data into svmlight format
        # [(label, [(feature, value), ...], query_id), ...]; the label is a
        # placeholder 0 because the true grade is unknown at test time.
        test_data = [(0, self._feature_list(feat_vec), 1) for feat_vec in features]
        return svmlight.classify(self.model, test_data)
class SVM:
    """Python interface to rank SVM c code.

    Data is exchanged with the external ``learn/rank_svm`` program through
    fixed files under ``tmp_path``.
    NOTE(review): the paths are shared by every instance and process, so
    concurrent use would overwrite the same files - confirm this is acceptable.
    """

    tmp_path = "/tmp"
    data_file = os.path.join(tmp_path, "svm_rank_features.dat")
    model_file = os.path.join(tmp_path, "svm_rank_model.dat")
    test_file = os.path.join(tmp_path, "svm_rank_test.dat")
    predictions_file = os.path.join(tmp_path, "svm_rank_predictions")

    # Runs once, when the class body is executed (i.e. at import time).
    # The exit status of make is not checked.
    os.system("cd learn && make > /dev/null")

    def __init__(self, **kwargs):
        """Keyword arguments are accepted and ignored."""
        pass

    @staticmethod
    def _write_features(path, features, labels=None):
        """Write ``features`` to ``path`` in the tab-separated rank_svm input format.

        The first line is ``num_essays<TAB>num_features``; each following line
        is ``label<TAB>feat_1<TAB>...<TAB>feat_n``.  When ``labels`` is None
        every row is written with label 0.
        """
        num_essays, num_features = features.shape
        # ``with`` guarantees the file is closed even if a write fails.
        with open(path, "w") as f:
            f.write("%d\t%d\n" % (num_essays, num_features))
            for essay_ind in range(num_essays):
                label = 0 if labels is None else labels[essay_ind]
                feature_str = "\t".join([str(feat) for feat in features[essay_ind, :]])
                f.write("%d\t%s\n" % (label, feature_str))

    @staticmethod
    def _grade_distribution(grades, num_essays):
        """Return ``{grade: occurrences / num_essays}`` for the given grades."""
        grade_counts = {}
        for grade in grades:
            grade_counts[grade] = grade_counts.get(grade, 0) + 1
        # .items() (not the Python-2-only .iteritems()) works on Python 2 and 3.
        return {grade: count / float(num_essays)
                for grade, count in grade_counts.items()}

    def train(self, features, grades):
        """Train a rank SVM on the specified features and grades.

        features - 2-D array with one row per essay (must expose ``.shape``
                   and ``[row, :]`` indexing, e.g. a numpy array)
        grades   - sequence with one entry per essay

        Writes ``SVM.data_file``, runs ``learn/rank_svm`` to produce
        ``SVM.model_file``, then sets ``min_grade``, ``max_grade``,
        ``grade_probs`` and ``curve`` on the instance.
        """
        self.min_grade = min(grades)
        self.max_grade = max(grades)
        num_essays, _num_features = features.shape

        self._write_features(SVM.data_file, features, grades)
        # The exit status of the trainer is not checked.
        os.system("learn/rank_svm %s %s > /dev/null" % (SVM.data_file, SVM.model_file))

        # Set a curve based on the SVM ranking scores
        # (Curve is defined outside this class).
        self.grade_probs = self._grade_distribution(grades, num_essays)
        scores = self.classify_rank_svm(features)
        self.curve = Curve(scores, probs=self.grade_probs)

    def grade(self, features, options=None):
        """Score the essays and pass each score through the trained curve.

        ``options`` is accepted for interface compatibility and is not used.
        Returns a list with one ``self.curve.curve(score)`` result per essay.
        """
        scores = self.classify_rank_svm(features)
        return [self.curve.curve(score) for score in scores]

    def classify_rank_svm(self, features):
        """Run rank_svm to rank the specified essay features (one row per essay).

        Writes ``SVM.test_file``, runs ``learn/rank_svm`` with the stored
        model, and returns the contents of ``SVM.predictions_file`` as a list
        of floats, one per line.
        """
        self._write_features(SVM.test_file, features)
        # The exit status is not checked; if the program fails, the read
        # below sees whatever predictions file (if any) is already on disk.
        os.system("learn/rank_svm %s %s %s > /dev/null"
                  % (SVM.test_file, SVM.model_file, SVM.predictions_file))
        with open(SVM.predictions_file) as f:
            return [float(score) for score in f]