def walk_classifier(name, data_fn, ckargs=None):
    """Evaluate a classifier year by year, then predict the undecided rows.

    Rows are read from ``data_fn`` via ``spreadsheet_to_csv`` and
    ``read_raw_csv``. Every row is filed under its ``'Election'`` year:

    * A row whose ``'Won'`` value is known becomes a test row for its own
      year (a deep copy with ``'Won'`` blanked to ``MISSING``; the real value
      is kept as the expected answer) and a training row for every later
      year that already has an evaluation set at that point.
    * A row whose ``'Won'`` value is ``MISSING`` is kept for the final query.

    Each year that has both training and test rows is trained and scored.
    A year that has training rows but no test rows supplies the training
    data for the final prediction.

    Side effects: writes ``training_data_<year>.arff``,
    ``query_data_<year>.arff`` and ``prediction_<year>_<name>.txt`` using
    relative paths, and prints progress to stdout.

    Args:
        name: Classifier name passed to ``Classifier(name=...)``; also used
            in the prediction report file name.
        data_fn: Path of the spreadsheet to read.
        ckargs: Optional classifier arguments passed to ``Classifier``.

    Returns:
        A tuple ``(accuracy, predicted_cls, certainty)``. ``accuracy`` is the
        fraction of correct predictions over all evaluated years, or ``None``
        when nothing was evaluated. ``predicted_cls`` and ``certainty`` come
        from the last final prediction, or are ``None`` when the classifier
        returned no predictions.

    Raises:
        Exception: If no year provided training data without test data, so
            there is nothing to train the final model on.
    """
    evaluation_sets = {}  # {year: [training_list, test_list, expected_list]}
    final_query_lines = []

    # Build training data.
    print('Reading %s...' % data_fn)
    csv_fn = spreadsheet_to_csv(data_fn)
    for i, line in enumerate(read_raw_csv(csv_fn), 1):
        print('line:', i, line)
        year = int(line['Election'].value)
        evaluation_sets.setdefault(year, [[], [], []])
        if line['Won'].value != MISSING:
            # Add line to test set, hiding the known outcome from the model.
            test_line = copy.deepcopy(line)
            evaluation_sets[year][2].append(test_line['Won'].value)
            test_line['Won'].value = MISSING
            evaluation_sets[year][1].append(test_line)

            # Add line to all future sets. Only years already present in
            # evaluation_sets are reached, so this depends on row order.
            for other_year in evaluation_sets:
                if year < other_year:
                    evaluation_sets[other_year][0].append(line)
        else:
            # NOTE(review): nesting reconstructed from flattened source; this
            # branch is taken to pair with the 'Won' check above -- confirm.
            final_query_lines.append(line)

    hits = []  # one bool per evaluated prediction
    final_training_data = None
    final_year = None

    # Evaluate each evaluation set.
    print('%i evaluation_sets.' % len(evaluation_sets))
    for year, data in sorted(evaluation_sets.items()):
        raw_training_data, raw_testing_data, prediction_values = data
        print('Evaluation set:', year, len(raw_training_data),
              len(raw_testing_data), len(prediction_values))
        if not raw_training_data:
            print('No training data. Skipping.')
            continue

        # Create training set.
        training_data = ArffFile(relation='presidential-candidates')
        for _line in raw_training_data:
            training_data.append(_line)
        training_data.attribute_data['Won'].update([DEMOCRAT, REPUBLICAN])
        # Context manager so the handle is flushed and closed immediately.
        with open('training_data_%i.arff' % year, 'w') as fout:
            training_data.write(fout)

        if not raw_testing_data:
            # Nothing to score against: remember this set for the final run.
            final_training_data = training_data
            final_year = year
            print('No testing data. Skipping.')
            continue

        # Create query set.
        query_data = training_data.copy(schema_only=True)
        for _line in raw_testing_data:
            query_data.append(_line)
        with open('query_data_%i.arff' % year, 'w') as fout:
            query_data.write(fout)

        # Train
        print('=' * 80)
        c = Classifier(name=name, ckargs=ckargs)
        print('Training...')
        c.train(training_data, verbose=True)

        # Test
        print('Predicting...')
        predictions = c.predict(query_data, verbose=True, distribution=True)
        print('predictions:')
        for predicted_value, actual_value in zip(predictions, prediction_values):
            print('predicted_value =', predicted_value,
                  'actual_value =', actual_value)
            hits.append(predicted_value.predicted == actual_value)
        print('-' * 80)

    accuracy = sum(hits) / float(len(hits)) if hits else None
    print('accuracy_history:', hits)
    print('accuracy:', accuracy)

    # Make final prediction.
    if not final_training_data:
        raise Exception(
            'No final training data! Are there no empty "won" columns?')

    predicted_cls = None
    certainty = None

    # Create final query set. Files are named after the year that supplied
    # the final training data, not whichever year the loop above ended on.
    query_data = final_training_data.copy(schema_only=True)
    for _line in final_query_lines:
        query_data.append(_line)
    with open('query_data_%i.arff' % final_year, 'w') as fout:
        query_data.write(fout)

    # Train
    print('!' * 80)
    c = Classifier(name=name, ckargs=ckargs)
    print('Final Training...')
    c.train(final_training_data, verbose=True)

    # Test
    print('~' * 80)
    print('Final Predicting...')
    predictions = c.predict(query_data, verbose=True, distribution=True)
    print('final predictions:')
    predicted_value = None
    for predicted_value in predictions:
        print('predicted_value:', predicted_value)

    # Report on the last prediction; skip cleanly if there was none.
    if predicted_value is not None:
        with open('prediction_%i_%s.txt' % (final_year, name), 'w') as fout:
            print('stdout:', file=fout)
            print(c.last_training_stdout, file=fout)
            print(file=fout)
            print('stderr.begin:', file=fout)
            print(c.last_training_stderr, file=fout)
            print('stderr.end:', file=fout)
            print(file=fout)
            print('predicted_value.probability:',
                  predicted_value.probability, file=fout)
        predicted_cls = predicted_value.predicted
        certainty = predicted_value.certainty

    return accuracy, predicted_cls, certainty
def train(self, trainingSet):
    """Train a Weka J48 classifier and store it in the global ``classifier``.

    Args:
        trainingSet: Training data handed unchanged to ``Classifier.train``
            as both its first and second positional argument.
    """
    # The trained model is published as a module-level global, not on self.
    global classifier
    classifier = Classifier(name='weka.classifiers.trees.J48', ckargs={'-x': 10})
    # NOTE(review): the same data is passed twice; presumably the second
    # positional argument is the evaluation set -- confirm against
    # Classifier.train.
    classifier.train(trainingSet, trainingSet, verbose=0)
    # Parenthesised form: same output on Python 2, valid syntax on Python 3.
    print('Classifier is Trained')
def train(self, trainingSet):
    """Train a Weka J48 classifier and store it in the global ``classifier``.

    Args:
        trainingSet: Training data handed unchanged to ``Classifier.train``
            as both its first and second positional argument.
    """
    # The trained model is published as a module-level global, not on self.
    global classifier
    classifier = Classifier(name="weka.classifiers.trees.J48", ckargs={"-x": 10})
    # NOTE(review): the same data is passed twice; presumably the second
    # positional argument is the evaluation set -- confirm against
    # Classifier.train.
    classifier.train(trainingSet, trainingSet, verbose=0)
    # Parenthesised form: same output on Python 2, valid syntax on Python 3.
    print("Classifier is Trained")
class WekaRandomForestClassifier(BaseEstimator, ClassifierMixin):
    """Estimator that delegates to ``weka.classifiers.trees.RandomForest``.

    Data is exchanged with the wrapped ``Classifier`` through temporary ARFF
    files produced by ``to_arff``. The files are removed again once training
    or prediction finishes, including when it fails.

    Args:
        n_estimators: Value passed to the classifier as option ``-I``.
        max_depth: Value passed as option ``-depth``; ``None`` is sent as 0.
        max_features: Value used to derive option ``-K``. Accepts ``"auto"``
            or ``"sqrt"`` (square root of the feature count), ``"log2"``,
            ``None`` (all features), an integer (used as is) or a float
            (fraction of the feature count).
        random_state: Seed source resolved with ``check_random_state``; one
            integer drawn from it is passed as option ``-s``.
    """

    def __init__(self, n_estimators=10, max_depth=None, max_features="auto", random_state=None):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.max_features = max_features
        self.random_state = random_state

    def _resolve_max_features(self):
        """Translate ``self.max_features`` into an absolute feature count."""
        spec = self.max_features
        if isinstance(spec, str):
            if spec in ("auto", "sqrt"):
                return max(1, int(np.sqrt(self.n_features_)))
            if spec == "log2":
                return max(1, int(np.log2(self.n_features_)))
            raise ValueError(
                'Invalid value for max_features. Allowed string '
                'values are "auto", "sqrt" or "log2".')
        if spec is None:
            return self.n_features_
        if isinstance(spec, (numbers.Integral, np.integer)):
            return spec
        # Anything else is treated as a fraction of the feature count.
        return int(spec * self.n_features_)

    def _write_temp_arff(self, X, y):
        """Dump ``X`` (and ``y``) to a temporary ARFF file; return its path.

        The caller owns the returned file and must delete it.
        """
        # Prefer /dev/shm when the host has it; otherwise use the default
        # temporary directory instead of failing.
        shm_dir = "/dev/shm"
        tf = tempfile.NamedTemporaryFile(
            mode="w", suffix=".arff",
            dir=shm_dir if os.path.isdir(shm_dir) else None,
            delete=False)
        try:
            with tf:
                to_arff(X, y, self.n_classes_, tf)
        except BaseException:
            # Do not leave a half-written file behind.
            os.remove(tf.name)
            raise
        return tf.name

    def fit(self, X, y):
        """Train the wrapped random forest on ``X`` / ``y``; return ``self``.

        Raises:
            ValueError: If ``max_features`` is an unsupported string.
        """
        # Check params
        self.n_features_ = X.shape[1]
        random_state = check_random_state(self.random_state)
        max_features = self._resolve_max_features()

        params = {}
        params["-I"] = self.n_estimators
        params["-K"] = max_features
        params["-depth"] = 0 if self.max_depth is None else self.max_depth
        params["-no-cv"] = None
        params["-s"] = random_state.randint(1000000)

        # Convert data: labels become their index into the sorted classes.
        self.classes_ = np.unique(y)
        self.n_classes_ = len(self.classes_)
        y = np.searchsorted(self.classes_, y)
        arff_path = self._write_temp_arff(X, y)

        # Run
        try:
            self.model_ = Classifier(name="weka.classifiers.trees.RandomForest", ckargs=params)
            self.model_.train(arff_path)
        finally:
            os.remove(arff_path)

        return self

    def predict(self, X):
        """Predict a class for each row of ``X``, as entries of ``classes_``."""
        arff_path = self._write_temp_arff(X, None)
        try:
            pred = np.zeros(len(X), dtype=np.int32)
            for i, r in enumerate(self.model_.predict(arff_path)):
                # NOTE(review): only the single character at index 5 of the
                # predicted label is parsed; presumably labels are a 5-char
                # prefix plus a class index, which would cap this at ten
                # classes -- confirm against to_arff.
                pred[i] = int(r.predicted[5])
        finally:
            os.remove(arff_path)

        return self.classes_[pred]
def test_IBk(self):
    """Train IBk on the abalone fixture and query it through several routes.

    Routes covered, in order: a query ARFF file, the
    ``abalone-query-bad.arff`` fixture (must raise ``PredictionError``), an
    in-memory ``ArffFile`` filled from a list row, the same query after a
    save/load round trip of the classifier, and an ``ArffFile`` filled from
    a dict row. Every successful query is expected to predict 7.
    """

    def expected_result():
        # The answer every valid query in this test should produce.
        return PredictionResult(actual=None, predicted=7, probability=None)

    def empty_query():
        # Fresh query file carrying the abalone schema and no rows.
        return arff.ArffFile(relation='test', schema=[
            ('Sex', ('M', 'F', 'I')),
            ('Length', 'numeric'),
            ('Diameter', 'numeric'),
            ('Height', 'numeric'),
            ('Whole weight', 'numeric'),
            ('Shucked weight', 'numeric'),
            ('Viscera weight', 'numeric'),
            ('Shell weight', 'numeric'),
            ('Class_Rings', 'integer'),
        ])

    def run_query(model, source):
        # Materialise all predictions for one query source.
        return list(model.predict(source, verbose=1, cleanup=0))

    # Train a classifier.
    print('Training IBk classifier...')
    model = Classifier(name='weka.classifiers.lazy.IBk', ckargs={'-K': 1})
    model.train(os.path.join(BP, 'fixtures/abalone-train.arff'), verbose=1)
    self.assertTrue(model._model_data)

    # Query from a valid ARFF file.
    print('Using IBk classifier...')
    results = run_query(model, os.path.join(BP, 'fixtures/abalone-query.arff'))
    pred0 = results[0]
    print('pred0:', pred0)
    pred1 = expected_result()
    print('pred1:', pred1)
    self.assertEqual(pred0, pred1)

    # Query from the "bad" fixture: prediction must fail.
    with self.assertRaises(PredictionError):
        bad_fn = os.path.join(BP, 'fixtures/abalone-query-bad.arff')
        results = run_query(model, bad_fn)

    # Build a valid query by hand from a list row.
    list_query = empty_query()
    list_query.append(
        ['M', 0.35, 0.265, 0.09, 0.2255, 0.0995, 0.0485, 0.07, '?'])
    # NOTE(review): this expected text is reproduced exactly as it appears in
    # the reviewed source, where it sits on a single line. ARFF is normally
    # line-oriented, so the original line breaks may have been lost --
    # confirm against the output of ArffFile.write(fmt=DENSE).
    data_str0 = """% @relation test @attribute 'Sex' {F,I,M} @attribute 'Length' numeric @attribute 'Diameter' numeric @attribute 'Height' numeric @attribute 'Whole weight' numeric @attribute 'Shucked weight' numeric @attribute 'Viscera weight' numeric @attribute 'Shell weight' numeric @attribute 'Class_Rings' integer @data M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,? """
    data_str1 = list_query.write(fmt=DENSE)
    self.assertEqual(data_str0, data_str1)
    results = run_query(model, list_query)
    self.assertEqual(results[0], expected_result())

    # Test pickling: a reloaded classifier must answer the same way.
    pickle_fn = os.path.join(BP, 'fixtures/IBk.pkl')
    model.save(pickle_fn)
    model = Classifier.load(pickle_fn)
    results = run_query(model, list_query)
    self.assertEqual(results[0], expected_result())

    # Build a valid query by hand from a dict row.
    dict_query = empty_query()
    dict_query.append({
        'Sex': 'M',
        'Length': 0.35,
        'Diameter': 0.265,
        'Height': 0.09,
        'Whole weight': 0.2255,
        'Shucked weight': 0.0995,
        'Viscera weight': 0.0485,
        'Shell weight': 0.07,
        'Class_Rings': arff.MISSING,
    })
    results = run_query(model, dict_query)
    self.assertEqual(results[0], expected_result())
def test_IBk(self):
    """Train IBk on the abalone fixture and query it through several routes.

    Routes covered, in order: a query ARFF file, the
    ``abalone-query-bad.arff`` fixture (must raise ``PredictionError``), an
    in-memory ``ArffFile`` filled from a list row, the same query after a
    save/load round trip of the classifier, and an ``ArffFile`` filled from
    a dict row. Every successful query is expected to predict 7.
    """

    def expected_result():
        # The answer every valid query in this test should produce.
        return PredictionResult(actual=None, predicted=7, probability=None)

    def empty_query():
        # Fresh query file carrying the abalone schema and no rows.
        return arff.ArffFile(relation='test', schema=[
            ('Sex', ('M', 'F', 'I')),
            ('Length', 'numeric'),
            ('Diameter', 'numeric'),
            ('Height', 'numeric'),
            ('Whole weight', 'numeric'),
            ('Shucked weight', 'numeric'),
            ('Viscera weight', 'numeric'),
            ('Shell weight', 'numeric'),
            ('Class_Rings', 'integer'),
        ])

    def run_query(model, source):
        # Materialise all predictions for one query source.
        return list(model.predict(source, verbose=1, cleanup=0))

    # Train a classifier.
    print('Training IBk classifier...')
    model = Classifier(name='weka.classifiers.lazy.IBk', ckargs={'-K': 1})
    model.train(os.path.join(BP, 'fixtures/abalone-train.arff'), verbose=1)
    self.assertTrue(model._model_data)

    # Query from a valid ARFF file.
    print('Using IBk classifier...')
    results = run_query(model, os.path.join(BP, 'fixtures/abalone-query.arff'))
    pred0 = results[0]
    print('pred0:', pred0)
    pred1 = expected_result()
    print('pred1:', pred1)
    self.assertEqual(pred0, pred1)

    # Query from the "bad" fixture: prediction must fail.
    with self.assertRaises(PredictionError):
        bad_fn = os.path.join(BP, 'fixtures/abalone-query-bad.arff')
        results = run_query(model, bad_fn)

    # Build a valid query by hand from a list row.
    list_query = empty_query()
    list_query.append(['M', 0.35, 0.265, 0.09, 0.2255, 0.0995, 0.0485, 0.07, '?'])
    # NOTE(review): this expected text is reproduced exactly as it appears in
    # the reviewed source, where it sits on a single line. ARFF is normally
    # line-oriented, so the original line breaks may have been lost --
    # confirm against the output of ArffFile.write(fmt=DENSE).
    data_str0 = """% @relation test @attribute 'Sex' {F,I,M} @attribute 'Length' numeric @attribute 'Diameter' numeric @attribute 'Height' numeric @attribute 'Whole weight' numeric @attribute 'Shucked weight' numeric @attribute 'Viscera weight' numeric @attribute 'Shell weight' numeric @attribute 'Class_Rings' integer @data M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,? """
    data_str1 = list_query.write(fmt=DENSE)
    self.assertEqual(data_str0, data_str1)
    results = run_query(model, list_query)
    self.assertEqual(results[0], expected_result())

    # Test pickling: a reloaded classifier must answer the same way.
    pickle_fn = os.path.join(BP, 'fixtures/IBk.pkl')
    model.save(pickle_fn)
    model = Classifier.load(pickle_fn)
    results = run_query(model, list_query)
    self.assertEqual(results[0], expected_result())

    # Build a valid query by hand from a dict row.
    dict_query = empty_query()
    dict_query.append({
        'Sex': 'M',
        'Length': 0.35,
        'Diameter': 0.265,
        'Height': 0.09,
        'Whole weight': 0.2255,
        'Shucked weight': 0.0995,
        'Viscera weight': 0.0485,
        'Shell weight': 0.07,
        'Class_Rings': arff.MISSING,
    })
    results = run_query(model, dict_query)
    self.assertEqual(results[0], expected_result())