Code Example #1
def walk_classifier(name, data_fn, ckargs=None):
    # Assumed context (not shown in this listing): copy, datetime.date,
    # Classifier, ArffFile, MISSING, DEMOCRAT, REPUBLICAN,
    # spreadsheet_to_csv() and read_raw_csv() are imported or defined
    # by the surrounding script.

    evaluation_sets = {}  # {year: [training_list, test_list, expected_list]}

    final_query_lines = []

    # Build training data.
    year = date.today().year
    fn = data_fn
    print('Reading %s...' % fn)
    fn2 = spreadsheet_to_csv(fn)
    i = 0
    for line in read_raw_csv(fn2):
        i += 1
        print('line:', i, line)

        year = int(line['Election'].value)

        evaluation_sets.setdefault(year, [[], [], []])

        if line['Won'].value != MISSING:

            # Add line to test set.
            test_line = copy.deepcopy(line)
            evaluation_sets[year][2].append(test_line['Won'].value)
            test_line['Won'].value = MISSING
            evaluation_sets[year][1].append(test_line)

            # Add line to all future sets.
            for other_year in evaluation_sets:
                if year < other_year:
                    evaluation_sets[other_year][0].append(line)

        else:
            final_query_lines.append(line)

    accuracy = []

    final_training_data = None
    final_year = None

    # Evaluate each evaluation set.
    #pprint(evaluation_sets, indent=4)
    print('%i evaluation_sets.' % len(evaluation_sets))
    for year, data in sorted(evaluation_sets.items()):
        raw_training_data, raw_testing_data, prediction_values = data
        print('Evaluation set:', year, len(raw_training_data),
              len(raw_testing_data), len(prediction_values))

        if not raw_training_data:
            print('No training data. Skipping.')
            continue

        # Create training set.
        training_data = ArffFile(relation='presidential-candidates')
        for _line in raw_training_data:
            training_data.append(_line)
        training_data.attribute_data['Won'].update([DEMOCRAT, REPUBLICAN])
        training_data.write(open('training_data_%i.arff' % year, 'w'))

        if not raw_testing_data:
            final_training_data = training_data
            final_year = year
            print('No testing data. Skipping.')
            continue

        # Create query set.
        query_data = training_data.copy(schema_only=True)
        for _line in raw_testing_data:
            query_data.append(_line)
        query_data.write(open('query_data_%i.arff' % year, 'w'))

        # Train
        print('=' * 80)
        c = Classifier(name=name, ckargs=ckargs)
        print('Training...')
        c.train(training_data, verbose=True)

        # Test
        print('Predicting...')
        predictions = c.predict(query_data, verbose=True, distribution=True)
        print('predictions:')
        for predicted_value, actual_value in zip(predictions,
                                                 prediction_values):
            print('predicted_value =', predicted_value, 'actual_value =',
                  actual_value)
            accuracy.append(predicted_value.predicted == actual_value)

    print('-' * 80)
    accuracy_history = accuracy
    if accuracy:
        accuracy = sum(accuracy) / float(len(accuracy))
    else:
        accuracy = None
    print('accuracy_history:', accuracy_history)
    print('accuracy:', accuracy)

    # Make final prediction.
    predicted_cls = None
    certainty = None
    if final_training_data:

        # Create final query set.
        query_data = final_training_data.copy(schema_only=True)
        for _line in final_query_lines:
            query_data.append(_line)
        query_data.write(open('query_data_%i.arff' % year, 'w'))

        # Train
        print('!' * 80)
        c = Classifier(name=name, ckargs=ckargs)
        print('Final Training...')
        c.train(final_training_data, verbose=True)

        # Test
        print('~' * 80)
        print('Final Predicting...')
        predictions = c.predict(query_data, verbose=True, distribution=True)
        print('final predictions:')
        for predicted_value in predictions:
            print('predicted_value:', predicted_value)
            with open('prediction_%i_%s.txt' % (year, name), 'w') as fout:
                print('stdout:', file=fout)
                print(c.last_training_stdout, file=fout)
                print(file=fout)
                print('stderr.begin:', file=fout)
                print(c.last_training_stderr, file=fout)
                print('stderr.end:', file=fout)
                print(file=fout)
                print('predicted_value.probability:',
                      predicted_value.probability,
                      file=fout)
                predicted_cls = predicted_value.predicted
                certainty = predicted_value.certainty

    else:
        raise Exception(
            'No final training data! Are there no empty "won" columns?')

    return accuracy, predicted_cls, certainty
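
A minimal usage sketch (hypothetical file name and classifier options; the
data file is assumed to be a spreadsheet with 'Election' and 'Won' columns,
where rows with a missing 'Won' value form the final query):

acc, winner, certainty = walk_classifier(
    name='weka.classifiers.trees.J48',
    data_fn='elections.xls',
    ckargs={'-C': 0.25})
print('historical accuracy:', acc)
print('predicted winner:', winner, 'certainty:', certainty)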
Code Example #2
def train(self, trainingSet):
    global classifier
    # '-x 10' asks Weka for 10-fold cross-validation during training;
    # passing trainingSet twice supplies it as both the training and
    # the testing set.
    classifier = Classifier(name='weka.classifiers.trees.J48',
                            ckargs={'-x': 10})
    classifier.train(trainingSet, trainingSet, verbose=0)
    print("Classifier is Trained")
Code Example #3
File: wrapper.py  Project: ASBoldt/phd-thesis (an identical copy appears in thiyangt/phd-thesis-1)
# Assumed context: numbers, os, tempfile, numpy as np,
# sklearn.base.BaseEstimator / ClassifierMixin,
# sklearn.utils.check_random_state, the Weka Classifier wrapper, and a
# project-local to_arff() helper are imported elsewhere in wrapper.py.
class WekaRandomForestClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, n_estimators=10, max_depth=None,
                 max_features="auto", random_state=None):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.max_features = max_features
        self.random_state = random_state

    def fit(self, X, y):
        # Check params
        self.n_features_ = X.shape[1]
        random_state = check_random_state(self.random_state)

        if isinstance(self.max_features, str):
            # "auto" and "sqrt" are synonyms: both use sqrt(n_features).
            if self.max_features in ("auto", "sqrt"):
                max_features = max(1, int(np.sqrt(self.n_features_)))
            elif self.max_features == "log2":
                max_features = max(1, int(np.log2(self.n_features_)))
            else:
                raise ValueError(
                    'Invalid value for max_features. Allowed string '
                    'values are "auto", "sqrt" or "log2".')
        elif self.max_features is None:
            max_features = self.n_features_
        elif isinstance(self.max_features, (numbers.Integral, np.integer)):
            max_features = self.max_features
        else:  # float
            max_features = int(self.max_features * self.n_features_)

        # Map the scikit-learn style parameters onto Weka RandomForest
        # command-line flags: -I trees, -K features per split,
        # -depth (0 = unlimited), -no-cv to skip cross-validation
        # (a bare flag, hence the None value), -s random seed.
        params = {}
        params["-I"] = self.n_estimators
        params["-K"] = max_features
        params["-depth"] = 0 if self.max_depth is None else self.max_depth
        params["-no-cv"] = None
        params["-s"] = random_state.randint(1000000)

        # Convert data
        self.classes_ = np.unique(y)
        self.n_classes_ = len(self.classes_)
        y = np.searchsorted(self.classes_, y)

        tf = tempfile.NamedTemporaryFile(mode="w", suffix=".arff", dir="/dev/shm", delete=False)
        to_arff(X, y, self.n_classes_, tf)
        tf.close()

        # Run
        self.model_ = Classifier(name="weka.classifiers.trees.RandomForest", ckargs=params)
        self.model_.train(tf.name)
        os.remove(tf.name)

        return self

    def predict(self, X):
        tf = tempfile.NamedTemporaryFile(mode="w", suffix=".arff", dir="/dev/shm", delete=False)
        to_arff(X, None, self.n_classes_, tf)
        tf.close()

        pred = np.zeros(len(X), dtype=np.int32)

        for i, r in enumerate(self.model_.predict(tf.name)):
            # r.predicted is a nominal label string; this assumes to_arff()
            # names classes so the class digit sits at index 5 (e.g.
            # 'classK'), which only holds for fewer than 10 classes.
            pred[i] = int(r.predicted[5])

        os.remove(tf.name)

        return self.classes_[pred]
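
A scikit-learn style usage sketch (synthetic data; assumes the class above
is importable and a Weka installation is available):

import numpy as np

X = np.random.rand(100, 4)
y = np.random.randint(0, 2, size=100)

clf = WekaRandomForestClassifier(n_estimators=50, random_state=0)
clf.fit(X, y)
print(clf.predict(X[:5]))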
Code Example #4
File: tests.py  Project: chrisspen/weka
    def test_IBk(self):
        # Assumed context: os, arff (providing ArffFile and MISSING),
        # DENSE, BP (the fixtures base path), Classifier,
        # PredictionResult and PredictionError are imported at module
        # level in tests.py.

        # Train a classifier.
        print('Training IBk classifier...')
        c = Classifier(name='weka.classifiers.lazy.IBk', ckargs={'-K': 1})
        training_fn = os.path.join(BP, 'fixtures/abalone-train.arff')
        c.train(training_fn, verbose=1)
        self.assertTrue(c._model_data)

        # Make a valid query.
        print('Using IBk classifier...')
        query_fn = os.path.join(BP, 'fixtures/abalone-query.arff')
        predictions = list(c.predict(query_fn, verbose=1, cleanup=0))
        pred0 = predictions[0]
        print('pred0:', pred0)
        pred1 = PredictionResult(actual=None, predicted=7, probability=None)
        print('pred1:', pred1)
        self.assertEqual(pred0, pred1)

        # Make an invalid query: a malformed file should raise PredictionError.
        with self.assertRaises(PredictionError):
            query_fn = os.path.join(BP, 'fixtures/abalone-query-bad.arff')
            predictions = list(c.predict(query_fn, verbose=1, cleanup=0))

        # Make a valid query manually.
        query = arff.ArffFile(relation='test',
                              schema=[
                                  ('Sex', ('M', 'F', 'I')),
                                  ('Length', 'numeric'),
                                  ('Diameter', 'numeric'),
                                  ('Height', 'numeric'),
                                  ('Whole weight', 'numeric'),
                                  ('Shucked weight', 'numeric'),
                                  ('Viscera weight', 'numeric'),
                                  ('Shell weight', 'numeric'),
                                  ('Class_Rings', 'integer'),
                              ])
        query.append(
            ['M', 0.35, 0.265, 0.09, 0.2255, 0.0995, 0.0485, 0.07, '?'])
        data_str0 = """% 
@relation test
@attribute 'Sex' {F,I,M}
@attribute 'Length' numeric
@attribute 'Diameter' numeric
@attribute 'Height' numeric
@attribute 'Whole weight' numeric
@attribute 'Shucked weight' numeric
@attribute 'Viscera weight' numeric
@attribute 'Shell weight' numeric
@attribute 'Class_Rings' integer
@data
M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,?
"""
        data_str1 = query.write(fmt=DENSE)
        # print(data_str0)
        # print(data_str1)
        self.assertEqual(data_str0, data_str1)
        predictions = list(c.predict(query, verbose=1, cleanup=0))
        self.assertEqual(
            predictions[0],
            PredictionResult(actual=None, predicted=7, probability=None))

        # Test pickling.
        fn = os.path.join(BP, 'fixtures/IBk.pkl')
        c.save(fn)
        c = Classifier.load(fn)
        predictions = list(c.predict(query, verbose=1, cleanup=0))
        self.assertEqual(
            predictions[0],
            PredictionResult(actual=None, predicted=7, probability=None))
        #print('Pickle verified.')

        # Make a valid dict query manually.
        query = arff.ArffFile(relation='test',
                              schema=[
                                  ('Sex', ('M', 'F', 'I')),
                                  ('Length', 'numeric'),
                                  ('Diameter', 'numeric'),
                                  ('Height', 'numeric'),
                                  ('Whole weight', 'numeric'),
                                  ('Shucked weight', 'numeric'),
                                  ('Viscera weight', 'numeric'),
                                  ('Shell weight', 'numeric'),
                                  ('Class_Rings', 'integer'),
                              ])
        query.append({
            'Sex': 'M',
            'Length': 0.35,
            'Diameter': 0.265,
            'Height': 0.09,
            'Whole weight': 0.2255,
            'Shucked weight': 0.0995,
            'Viscera weight': 0.0485,
            'Shell weight': 0.07,
            'Class_Rings': arff.MISSING,
        })
        predictions = list(c.predict(query, verbose=1, cleanup=0))
        self.assertEqual(
            predictions[0],
            PredictionResult(actual=None, predicted=7, probability=None))
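
The test exercises the wrapper's full round trip: train from an ARFF file,
predict from files, from in-memory ArffFile objects and from dicts, and
pickle the trained model. A condensed sketch of the same flow outside the
test harness (paths are illustrative):

from weka.classifiers import Classifier

c = Classifier(name='weka.classifiers.lazy.IBk', ckargs={'-K': 1})
c.train('fixtures/abalone-train.arff')
for result in c.predict('fixtures/abalone-query.arff'):
    # Each result is a PredictionResult with .actual, .predicted and
    # .probability attributes.
    print(result.predicted, result.probability)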