def walk_classifier(name, data_fn, ckargs=None):
    """Evaluate a classifier year by year, then predict the undecided rows.

    Rows are read from ``data_fn`` (converted with ``spreadsheet_to_csv`` and
    parsed with ``read_raw_csv``). Rows whose ``'Won'`` value is known become
    test rows for their own ``'Election'`` year and training rows for every
    later year already registered; rows whose ``'Won'`` value is ``MISSING``
    are kept as the final query.

    Side effects: writes ``training_data_<year>.arff``,
    ``query_data_<year>.arff`` and ``prediction_<year>_<name>.txt`` into the
    current working directory and prints progress to stdout.

    Args:
        name: Classifier name, forwarded to ``Classifier(name=...)``.
        data_fn: Path of the input data file.
        ckargs: Classifier arguments, forwarded to ``Classifier(ckargs=...)``.

    Returns:
        Tuple ``(accuracy, predicted_cls, certainty)``. ``accuracy`` is the
        fraction of correct predictions over all evaluated rows, or ``None``
        when nothing was evaluated. ``predicted_cls`` and ``certainty`` come
        from the last final prediction and stay ``None`` when the classifier
        yields no final prediction.

    Raises:
        Exception: If no year provided training data without test data.
    """
    evaluation_sets = {}  # {year: [training_list, test_list, expected_list]}
    final_query_lines = []

    # Build training data.
    year = date.today().year
    fn = data_fn
    print('Reading %s...' % fn)
    fn2 = spreadsheet_to_csv(fn)
    for i, line in enumerate(read_raw_csv(fn2), 1):
        print('line:', i, line)
        year = int(line['Election'].value)
        evaluation_sets.setdefault(year, [[], [], []])
        if line['Won'].value != MISSING:
            # Add line to test set, hiding the known outcome from the query.
            test_line = copy.deepcopy(line)
            evaluation_sets[year][2].append(test_line['Won'].value)
            test_line['Won'].value = MISSING
            evaluation_sets[year][1].append(test_line)
            # Add line to all future sets.
            for other_year in evaluation_sets:
                if year < other_year:
                    evaluation_sets[other_year][0].append(line)
        else:
            final_query_lines.append(line)

    accuracy = []
    final_training_data = None
    final_year = None

    # Evaluate each evaluation set.
    print('%i evaluation_sets.' % len(evaluation_sets))
    for year, data in sorted(evaluation_sets.items()):
        raw_training_data, raw_testing_data, prediction_values = data
        print(
            'Evaluation set:', year, len(raw_training_data),
            len(raw_testing_data), len(prediction_values))
        if not raw_training_data:
            print('No training data. Skipping.')
            continue

        # Create training set.
        training_data = ArffFile(relation='presidential-candidates')
        for _line in raw_training_data:
            training_data.append(_line)
        training_data.attribute_data['Won'].update([DEMOCRAT, REPUBLICAN])
        # Use a context manager so the handle is flushed and closed.
        with open('training_data_%i.arff' % year, 'w') as fout:
            training_data.write(fout)

        if not raw_testing_data:
            # A year with nothing to test against is the one to forecast.
            final_training_data = training_data
            final_year = year
            print('No testing data. Skipping.')
            continue

        # Create query set.
        query_data = training_data.copy(schema_only=True)
        for _line in raw_testing_data:
            query_data.append(_line)
        with open('query_data_%i.arff' % year, 'w') as fout:
            query_data.write(fout)

        # Train
        print('=' * 80)
        c = Classifier(name=name, ckargs=ckargs)
        print('Training...')
        c.train(training_data, verbose=True)

        # Test
        print('Predicting...')
        predictions = c.predict(query_data, verbose=True, distribution=True)
        print('predictions:')
        for predicted_value, actual_value in zip(predictions, prediction_values):
            print('predicted_value =', predicted_value, 'actual_value =', actual_value)
            accuracy.append(predicted_value.predicted == actual_value)
        print('-' * 80)

    accuracy_history = accuracy
    if accuracy:
        accuracy = sum(accuracy) / float(len(accuracy))
    else:
        accuracy = None
    print('accuracy_history:', accuracy_history)
    print('accuracy:', accuracy)

    # Make final prediction.
    if not final_training_data:
        raise Exception(
            'No final training data! Are there no empty "won" columns?')

    predicted_cls = None
    certainty = None

    # Create final query set. Files are named after the year the final
    # training data belongs to, not after whatever year the evaluation loop
    # happened to end on, so an evaluation set's query file is never clobbered.
    query_data = final_training_data.copy(schema_only=True)
    for _line in final_query_lines:
        query_data.append(_line)
    with open('query_data_%i.arff' % final_year, 'w') as fout:
        query_data.write(fout)

    # Train
    print('!' * 80)
    c = Classifier(name=name, ckargs=ckargs)
    print('Final Training...')
    c.train(final_training_data, verbose=True)

    # Test
    print('~' * 80)
    print('Final Predicting...')
    predictions = c.predict(query_data, verbose=True, distribution=True)
    print('final predictions:')
    # Track the last prediction explicitly so a value left over from the
    # evaluation loop above can never be reported as the final result.
    last_prediction = None
    for last_prediction in predictions:
        print('predicted_value:', last_prediction)

    if last_prediction is not None:
        with open('prediction_%i_%s.txt' % (final_year, name), 'w') as fout:
            print('stdout:', file=fout)
            print(c.last_training_stdout, file=fout)
            print(file=fout)
            print('stderr.begin:', file=fout)
            print(c.last_training_stderr, file=fout)
            print('stderr.end:', file=fout)
            print(file=fout)
            print(
                'predicted_value.probability:',
                last_prediction.probability, file=fout)
        predicted_cls = last_prediction.predicted
        certainty = last_prediction.certainty

    return accuracy, predicted_cls, certainty
class WekaRandomForestClassifier(BaseEstimator, ClassifierMixin):
    """Estimator that delegates to ``weka.classifiers.trees.RandomForest``.

    Data is handed over through a temporary ARFF file created in
    ``/dev/shm``; the file is removed again once training or prediction has
    finished, whether or not it succeeded.

    Args:
        n_estimators: Passed to the classifier as option ``-I``.
        max_depth: Passed as option ``-depth``; ``None`` is sent as ``0``.
        max_features: ``"auto"``/``"sqrt"``, ``"log2"``, ``None``, an integer
            or a float fraction of the feature count; the resolved value is
            passed as option ``-K``.
        random_state: Anything accepted by ``check_random_state``; used to
            draw the value passed as option ``-s``.
    """

    def __init__(self, n_estimators=10, max_depth=None, max_features="auto",
                 random_state=None):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.max_features = max_features
        self.random_state = random_state

    def _resolve_max_features(self):
        """Translate ``self.max_features`` into an absolute feature count."""
        if isinstance(self.max_features, str):
            if self.max_features in ("auto", "sqrt"):
                return max(1, int(np.sqrt(self.n_features_)))
            if self.max_features == "log2":
                return max(1, int(np.log2(self.n_features_)))
            raise ValueError(
                'Invalid value for max_features. Allowed string '
                'values are "auto", "sqrt" or "log2".')
        if self.max_features is None:
            return self.n_features_
        if isinstance(self.max_features, (numbers.Integral, np.integer)):
            return self.max_features
        # Anything else is treated as a fraction of the feature count.
        return int(self.max_features * self.n_features_)

    def fit(self, X, y):
        """Train the forest on ``X`` / ``y`` and return ``self``.

        Raises:
            ValueError: If ``max_features`` is an unsupported string.
        """
        # Check params
        self.n_features_ = X.shape[1]
        random_state = check_random_state(self.random_state)
        max_features = self._resolve_max_features()

        params = {}
        params["-I"] = self.n_estimators
        params["-K"] = max_features
        params["-depth"] = 0 if self.max_depth is None else self.max_depth
        params["-no-cv"] = None
        params["-s"] = random_state.randint(1000000)

        # Convert data: labels are replaced by their index in classes_.
        self.classes_ = np.unique(y)
        self.n_classes_ = len(self.classes_)
        y = np.searchsorted(self.classes_, y)

        tf = tempfile.NamedTemporaryFile(
            mode="w", suffix=".arff", dir="/dev/shm", delete=False)
        try:
            with tf:
                to_arff(X, y, self.n_classes_, tf)

            # Run
            self.model_ = Classifier(
                name="weka.classifiers.trees.RandomForest", ckargs=params)
            self.model_.train(tf.name)
        finally:
            # delete=False means nobody else cleans this up, so do it even
            # when conversion or training fails.
            os.remove(tf.name)

        return self

    def predict(self, X):
        """Return the predicted class label for every row of ``X``."""
        tf = tempfile.NamedTemporaryFile(
            mode="w", suffix=".arff", dir="/dev/shm", delete=False)
        try:
            with tf:
                to_arff(X, None, self.n_classes_, tf)

            pred = np.zeros(len(X), dtype=np.int32)
            for i, r in enumerate(self.model_.predict(tf.name)):
                # NOTE(review): only the single character at index 5 is
                # parsed, so a class index of 10 or more would be truncated.
                # Presumably to_arff names classes with a 5-character
                # prefix -- confirm before widening this to a slice.
                pred[i] = int(r.predicted[5])
        finally:
            os.remove(tf.name)

        return self.classes_[pred]
def test_IBk(self):
    """Train IBk on the abalone fixture and verify its predictions.

    Covers a query read from a file, a malformed query file, a query built
    in memory from a list row, the same query after a save/load round trip
    of the classifier, and a query built from a dict row.
    """

    def new_query():
        # Every query gets its own ArffFile and its own schema list.
        return arff.ArffFile(relation='test', schema=[
            ('Sex', ('M', 'F', 'I')),
            ('Length', 'numeric'),
            ('Diameter', 'numeric'),
            ('Height', 'numeric'),
            ('Whole weight', 'numeric'),
            ('Shucked weight', 'numeric'),
            ('Viscera weight', 'numeric'),
            ('Shell weight', 'numeric'),
            ('Class_Rings', 'integer'),
        ])

    def first_prediction(classifier, source):
        # The whole prediction stream is consumed; only the head is checked.
        return list(classifier.predict(source, verbose=1, cleanup=0))[0]

    # Train a classifier.
    print('Training IBk classifier...')
    knn = Classifier(name='weka.classifiers.lazy.IBk', ckargs={'-K': 1})
    knn.train(os.path.join(BP, 'fixtures/abalone-train.arff'), verbose=1)
    self.assertTrue(knn._model_data)

    # Query from a well-formed fixture file.
    print('Using IBk classifier...')
    good_query_fn = os.path.join(BP, 'fixtures/abalone-query.arff')
    actual = first_prediction(knn, good_query_fn)
    print('pred0:', actual)
    expected = PredictionResult(actual=None, predicted=7, probability=None)
    print('pred1:', expected)
    self.assertEqual(actual, expected)

    # Query from a malformed fixture file must be rejected.
    bad_query_fn = os.path.join(BP, 'fixtures/abalone-query-bad.arff')
    with self.assertRaises(PredictionError):
        list(knn.predict(bad_query_fn, verbose=1, cleanup=0))

    # Query assembled in memory from a list row.
    list_query = new_query()
    list_query.append(
        ['M', 0.35, 0.265, 0.09, 0.2255, 0.0995, 0.0485, 0.07, '?'])
    # NOTE(review): this text is compared byte-for-byte with the DENSE
    # output. The first line is assumed to be a percent sign followed by a
    # single space -- confirm against what ArffFile.write actually emits.
    expected_text = '\n'.join((
        '% ',
        '@relation test',
        "@attribute 'Sex' {F,I,M}",
        "@attribute 'Length' numeric",
        "@attribute 'Diameter' numeric",
        "@attribute 'Height' numeric",
        "@attribute 'Whole weight' numeric",
        "@attribute 'Shucked weight' numeric",
        "@attribute 'Viscera weight' numeric",
        "@attribute 'Shell weight' numeric",
        "@attribute 'Class_Rings' integer",
        '@data',
        'M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,?',
    )) + '\n'
    written_text = list_query.write(fmt=DENSE)
    self.assertEqual(expected_text, written_text)
    self.assertEqual(
        first_prediction(knn, list_query),
        PredictionResult(actual=None, predicted=7, probability=None))

    # Test pickling: a reloaded classifier must answer the same way.
    pickle_fn = os.path.join(BP, 'fixtures/IBk.pkl')
    knn.save(pickle_fn)
    knn = Classifier.load(pickle_fn)
    self.assertEqual(
        first_prediction(knn, list_query),
        PredictionResult(actual=None, predicted=7, probability=None))

    # Query assembled in memory from a dict row.
    dict_query = new_query()
    dict_query.append({
        'Sex': 'M',
        'Length': 0.35,
        'Diameter': 0.265,
        'Height': 0.09,
        'Whole weight': 0.2255,
        'Shucked weight': 0.0995,
        'Viscera weight': 0.0485,
        'Shell weight': 0.07,
        'Class_Rings': arff.MISSING,
    })
    self.assertEqual(
        first_prediction(knn, dict_query),
        PredictionResult(actual=None, predicted=7, probability=None))
class LRWrapper:
    """Logistic-regression helper that trains and scores through Weka.

    Training and test sets are written as ARFF files into the current
    working directory ("trainingweka.arff" / "testingweka.arff") and loaded
    back through ``Loader``.

    NOTE(review): ``predict``, ``getParams`` and the uncertainty helpers call
    scikit-learn style members (``predict``, ``coef_``, ``intercept_``,
    ``predict_proba``) on ``self.classifier``. Whether the ``Classifier``
    built in ``retrain`` provides them is not visible here -- confirm.
    NOTE(review): nothing in this class starts or stops a JVM; presumably
    the caller manages that -- confirm.
    """

    def __init__(self, C):
        """Store the regularisation constant ``C``; no model is built yet."""
        # k is the number of features
        self.k = 4
        self.C = C
        self.trainingData = None
        self.testingData = None
        # Set by retrain(); defined here so the attribute always exists.
        self.classifier = None

    @staticmethod
    def _write_arff(path, examples, labels):
        """Write ``examples`` / ``labels`` to ``path`` as a two-class ARFF file.

        One ``real`` attribute is declared per entry of ``examples[0]``. A
        label equal to 1 is written as ``TRUE``, anything else as ``FALSE``.
        Raises ``IndexError`` if ``examples`` is empty.
        """
        # The context manager closes the file even if formatting fails.
        with open(path, "w") as f:
            f.write("@relation randomset\n")
            for j in range(len(examples[0])):
                f.write("@attribute feature%d real\n" % j)
            f.write("@attribute class {TRUE, FALSE}\n")
            f.write("@data\n")
            for (example, label) in zip(examples, labels):
                for feature in example:
                    f.write("%f," % feature)
                f.write("TRUE\n" if label == 1 else "FALSE\n")

    @staticmethod
    def _entropy(distribution):
        """Return the entropy of one probability distribution.

        A small constant is added inside the logarithm so that a probability
        of exactly zero does not raise.
        """
        entropy = 0.0
        for p in distribution:
            entropy += p * log(p + 0.0000001)
        entropy *= -1
        return entropy

    def retrain(self, examples, labels):
        """Fit a fresh ``weka.classifiers.functions.Logistic`` model.

        The ridge option ``-R`` is set to ``1.0 / self.C``.
        """
        self._write_arff("trainingweka.arff", examples, labels)

        loader = Loader(classname="weka.core.converters.ArffLoader")
        self.trainingData = loader.load_file("trainingweka.arff")
        # The class attribute is the last column written by _write_arff.
        self.trainingData.set_class_index(
            self.trainingData.num_attributes() - 1)

        self.classifier = Classifier(
            classname="weka.classifiers.functions.Logistic",
            options=["-R", "%f" % (1.0 / self.C)])
        self.classifier.build_classifier(self.trainingData)

    def predict(self, testExamples):
        """Return ``self.classifier.predict(testExamples)``."""
        return self.classifier.predict(testExamples)

    def getParams(self):
        """Return ``(coef_, intercept_)`` of the current classifier."""
        return (self.classifier.coef_, self.classifier.intercept_)

    def score(self, testExamples, labels):
        """Evaluate the trained model and return ``percent_correct()``."""
        self._write_arff("testingweka.arff", testExamples, labels)

        loader = Loader(classname="weka.core.converters.ArffLoader")
        self.testingData = loader.load_file("testingweka.arff")
        self.testingData.set_class_index(
            self.testingData.num_attributes() - 1)

        evaluation = Evaluation(self.trainingData)
        evaluation.test_model(self.classifier, self.testingData)
        return evaluation.percent_correct()

    def fscore(self, testExamples, labels):
        """Return 0; the F-score computation is disabled.

        The method already returned 0 unconditionally before doing any work;
        the statements that followed could never run and have been removed.
        """
        return 0

    def getUncertainty(self, example):
        """Return the entropy of the predicted distribution for ``example``."""
        probs = self.classifier.predict_proba([example])
        return self._entropy(probs[0])

    def getAllUncertainties(self, examples):
        """Return one entropy per example, in input order."""
        probs = self.classifier.predict_proba(examples)
        return [self._entropy(prob) for prob in probs]

    def getMostUncertainTask(self, tasks, taskIndices):
        """Find the task whose predicted distribution has the highest entropy.

        Ties are resolved in favour of the first task encountered.

        Returns:
            Tuple ``(distribution, index)`` where ``distribution`` is the
            predicted distribution of the chosen task and ``index`` is its
            entry from ``taskIndices``.

        Raises:
            IndexError: If no task is selected (for example ``tasks`` is
                empty).
        """
        highestUncertainty = -21930123123
        mostUncertainTaskIndices = []
        # Both lists are initialised up front so the tie branch below can
        # never hit an undefined name.
        mostUncertainTasks = []

        entropies = self.getAllUncertainties(tasks)
        for (task, i, uncertainty) in zip(tasks, taskIndices, entropies):
            if uncertainty > highestUncertainty:
                mostUncertainTaskIndices = [i]
                mostUncertainTasks = [task]
                highestUncertainty = uncertainty
            elif uncertainty == highestUncertainty:
                mostUncertainTaskIndices.append(i)
                mostUncertainTasks.append(task)

        mostUncertainTaskIndex = mostUncertainTaskIndices[0]
        mostUncertainTask = mostUncertainTasks[0]

        return (self.classifier.predict_proba([mostUncertainTask])[0],
                mostUncertainTaskIndex)

    def getTotalUncertainty(self, examples):
        """Return the mean entropy over ``examples``.

        Raises:
            ZeroDivisionError: If ``examples`` is empty.
        """
        totalUncertainty = 0.0
        for example in examples:
            totalUncertainty += self.getUncertainty(example)
        totalUncertainty /= len(examples)
        return totalUncertainty
def test_IBk(self):
    """Check IBk predictions for file, list-row, unpickled and dict-row queries.

    The classifier is trained once on the abalone training fixture; every
    valid query is expected to yield a first prediction of 7, and the
    malformed query fixture is expected to raise ``PredictionError``.
    """
    numeric_columns = (
        'Length', 'Diameter', 'Height', 'Whole weight',
        'Shucked weight', 'Viscera weight', 'Shell weight',
    )
    column_names = ('Sex',) + numeric_columns
    row_values = ['M', 0.35, 0.265, 0.09, 0.2255, 0.0995, 0.0485, 0.07]

    def make_query():
        # Build a fresh schema list per query so nothing is shared.
        schema = [('Sex', ('M', 'F', 'I'))]
        schema.extend((column, 'numeric') for column in numeric_columns)
        schema.append(('Class_Rings', 'integer'))
        return arff.ArffFile(relation='test', schema=schema)

    def run(classifier, source):
        return list(classifier.predict(source, verbose=1, cleanup=0))

    # Train a classifier.
    print('Training IBk classifier...')
    c = Classifier(name='weka.classifiers.lazy.IBk', ckargs={'-K': 1})
    training_fn = os.path.join(BP, 'fixtures/abalone-train.arff')
    c.train(training_fn, verbose=1)
    self.assertTrue(c._model_data)

    # Make a valid query from a fixture file.
    print('Using IBk classifier...')
    results = run(c, os.path.join(BP, 'fixtures/abalone-query.arff'))
    pred0 = results[0]
    print('pred0:', pred0)
    pred1 = PredictionResult(actual=None, predicted=7, probability=None)
    print('pred1:', pred1)
    self.assertEqual(pred0, pred1)

    # Make an invalid query; the classifier has to refuse it.
    broken_fn = os.path.join(BP, 'fixtures/abalone-query-bad.arff')
    with self.assertRaises(PredictionError):
        run(c, broken_fn)

    # Make a valid query manually from a list row.
    query = make_query()
    query.append(row_values + ['?'])
    # NOTE(review): compared byte-for-byte with the DENSE output. The
    # leading comment line is assumed to be a percent sign plus one space --
    # confirm against what ArffFile.write actually emits.
    header_lines = ['% ', '@relation test', "@attribute 'Sex' {F,I,M}"]
    header_lines.extend(
        "@attribute '%s' numeric" % column for column in numeric_columns)
    header_lines.append("@attribute 'Class_Rings' integer")
    data_lines = ['@data', 'M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,?']
    data_str0 = '\n'.join(header_lines + data_lines) + '\n'
    data_str1 = query.write(fmt=DENSE)
    self.assertEqual(data_str0, data_str1)
    results = run(c, query)
    self.assertEqual(
        results[0],
        PredictionResult(actual=None, predicted=7, probability=None))

    # Test pickling.
    fn = os.path.join(BP, 'fixtures/IBk.pkl')
    c.save(fn)
    c = Classifier.load(fn)
    results = run(c, query)
    self.assertEqual(
        results[0],
        PredictionResult(actual=None, predicted=7, probability=None))

    # Make a valid dict query manually.
    query = make_query()
    dict_row = dict(zip(column_names, row_values))
    dict_row['Class_Rings'] = arff.MISSING
    query.append(dict_row)
    results = run(c, query)
    self.assertEqual(
        results[0],
        PredictionResult(actual=None, predicted=7, probability=None))