def load_predictions(self, questions):
    """Cache the classifier's soft predictions for every question.

    For each question, a feature vector is built from the solver's answer
    scores (``self.get_answer``) combined with the question features from
    ``solver_utils.get_feature_vector``. The rows returned by
    ``self.classifier.predict_proba`` are then stored in ``self.soft_cache``
    keyed by ``question.id``.

    Args:
        questions: re-iterable collection of question objects exposing an
            ``id`` attribute.

    Raises:
        SystemExit: with status 1, after printing a message, when
            ``self.classifier`` is unset/falsy (i.e. not trained yet).
    """
    if not self.classifier:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("Classifier hasn't been trained yet")
        # Non-zero status: this is a failure, not a clean shutdown.
        sys.exit(1)

    # NOTE(review): `+` concatenates only if both vectors are lists; with
    # numpy arrays it would add element-wise -- confirm against get_answer.
    X = [
        self.get_answer(question) + solver_utils.get_feature_vector(question)
        for question in questions
    ]
    if not X:
        # Nothing to predict; skip predict_proba, which cannot work on an
        # empty feature matrix.
        return

    soft_predictions = self.classifier.predict_proba(X)
    for question, soft_prediction in zip(questions, soft_predictions):
        self.soft_cache[question.id] = soft_prediction
def train_classifier(self, train_questions, prefix):
    """Fit a calibrated random forest that judges the solver's top answer.

    After loading the answer cache for ``train_questions`` (using
    ``prefix``), each question contributes one training row: its answer
    scores from ``self.get_answer`` combined with the features from
    ``solver_utils.get_feature_vector``. The target is 1 when the
    highest-scoring choice (A-D) equals ``question.correct_answer`` and 0
    otherwise. The fitted, sigmoid-calibrated classifier is stored in
    ``self.classifier``.

    Args:
        train_questions: iterable of question objects exposing
            ``correct_answer``.
        prefix: passed through to ``self.load_answer_cache``.
    """
    self.load_answer_cache(train_questions, prefix)

    choice_letters = ("A", "B", "C", "D")
    rows = []
    targets = []
    for question in train_questions:
        scores = self.get_answer(question)
        # Letter of the choice the solver scored highest.
        picked_letter = choice_letters[np.argmax(scores)]
        is_correct = picked_letter == question.correct_answer
        extra_features = solver_utils.get_feature_vector(question)
        # NOTE(review): `+` is concatenation only if both operands are
        # lists -- confirm against get_answer / get_feature_vector.
        rows.append(scores + extra_features)
        targets.append(int(is_correct))

    forest = RandomForestClassifier(n_estimators=300, n_jobs=-1)
    calibrated = CalibratedClassifierCV(forest, method='sigmoid', cv=10)
    calibrated.fit(rows, targets)
    # Publish the model only once fitting has succeeded.
    self.classifier = calibrated