コード例 #1
0
ファイル: model.py プロジェクト: jonmagal/recsys_challenge
    def test_model(self, test_data, empty_solution, evaluate = False):
        model_weka = None
        if os.path.isfile(self.prediction_file):
            print 'Model ' + self.name + ' already tested.'
        elif not os.path.isfile(self.model_file):
            print 'Impossible testing this model. It should be trained first.'
            return
        else: 
            print 'Starting to test_model model ' + self.name + '.'
            model_weka = Classifier(jobject = serialization.read(self.model_file)) 
            evaluation = Evaluation(data = test_data)
            evaluation.test_model(classifier = model_weka, data = test_data)
            
            predictions = evaluation.predictions()
            rows        = read_sheet(file_name = empty_solution)
            solutions   = []

            for row in rows:
                solution = [row['userid'], row['tweetid'], predictions.pop(0).predicted()]
                solutions.append(solution)
            write_the_solution_file(solutions, self.prediction_file)
            print 'Model ' + self.name + ' tested.'
        
        if evaluate == True:
            if os.path.isfile(self.evaluation_file):
                print 'Model ' + self.name + ' already evaluated.'
                return
            elif model_weka == None:
                model_weka = Classifier(jobject = serialization.read(self.model_file)) 
                evaluation = Evaluation(data = test_data)
                evaluation.test_model(classifier = model_weka, data = test_data)
            save_file(file_name = self.evaluation_file, content = evaluation.to_summary())
            print 'Model ' + self.name + ' evaluated.'
コード例 #2
0
ファイル: classify.py プロジェクト: cdw2/data_mining_cw2
    def run_naive_bayes_split(self, output_directory):
        """Train NaiveBayes on the training split, evaluate it on the testing
        split and save the collected report under the name "Naive_Bayes".

        :param output_directory: directory handed to ``self.save_results``
        """
        # --- train ---
        print("\nBuilding Classifier on training data.")
        phase_start = time.time()
        naive_bayes = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
        naive_bayes.build_classifier(self.training_data)

        report = self.print_both(str(naive_bayes), "")

        build_message = ("NB Split Classifier Built in "
                         + str(time.time() - phase_start) + " secs.\n")
        report = self.print_both(build_message, report)

        # --- evaluate ---
        report = self.print_both("\nEvaluating on test data.", report)

        phase_start = time.time()
        evaluation = Evaluation(self.training_data)
        evaluation.test_model(naive_bayes, self.testing_data)

        summary_text = str(evaluation.summary())
        report = self.print_both(summary_text, report)
        details_text = str(evaluation.class_details())
        report = self.print_both(details_text, report)
        matrix_text = str(evaluation.confusion_matrix)
        report = self.print_both(matrix_text, report)

        eval_message = ("\nNB Split Classifier Evaluated in "
                        + str(time.time() - phase_start) + " secs.\n")
        report = self.print_both(eval_message, report)

        # --- persist ---
        self.save_results("Naive_Bayes", report, output_directory)
コード例 #3
0
def testNB(training_data, testing_data):
    """Train NaiveBayes on a copy of the training data, test it on a copy of
    the testing data and print summary, accuracy and per-class metrics for
    class indices 0 and 1 (labelled "<=50K" and ">50K") plus their mean.
    """
    train_data = Instances.copy_instances(training_data)
    test_data = Instances.copy_instances(testing_data)

    evaluation = Evaluation(train_data)
    nb = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
    nb.build_classifier(train_data)        # fit on the training data
    evaluation.test_model(nb, test_data)   # score on the test set

    print("")
    print("")
    banner = "--------------Naive Bayes Evaluation--------------"
    print(evaluation.summary(banner))
    print("Accuracy: " + str(evaluation.percent_correct))
    print("")
    print("Label\tPrecision\t\tRecall\t\t\tF-Measure")

    # (precision, recall, f-measure) for each class index
    low = (evaluation.precision(0), evaluation.recall(0),
           evaluation.f_measure(0))
    print("<=50K\t" + "\t".join(str(metric) for metric in low))

    high = (evaluation.precision(1), evaluation.recall(1),
            evaluation.f_measure(1))
    print(">50K\t" + "\t".join(str(metric) for metric in high))

    means = [(evaluation.precision(1) + evaluation.precision(0)) / 2,
             (evaluation.recall(1) + evaluation.recall(0)) / 2,
             (evaluation.f_measure(1) + evaluation.f_measure(0)) / 2]
    print("Mean\t" + "\t".join(str(metric) for metric in means))
コード例 #4
0
def main(args):
    """
    Load an ARFF dataset, split it 66/34 into train/test sets with a fixed
    random seed, train J48 on the training part and print the evaluation
    summary for the test part.

    :param args: the commandline arguments (optional, can be dataset filename)
    :type args: list
    """

    # pick the dataset: second argument if present, bundled vote.arff otherwise
    data_file = args[1] if len(args) > 1 else helper.get_data_dir() + os.sep + "vote.arff"
    helper.print_info("Loading dataset: " + data_file)
    arff_loader = Loader(classname="weka.core.converters.ArffLoader")
    dataset = arff_loader.load_file(data_file)
    dataset.class_is_last()

    # train/test split of the data
    train_set, test_set = dataset.train_test_split(66.0, Random(1))

    # fit the decision tree and show it
    tree = Classifier(classname="weka.classifiers.trees.J48")
    tree.build_classifier(train_set)
    print(tree)

    # score on the held-out part
    evaluation = Evaluation(train_set)
    evaluation.test_model(tree, test_set)
    print(evaluation.summary())
コード例 #5
0
    def testDataEvaluate(self, testDataArffFileName):
        """
        Evaluation using test data
        :param testDataArffFileName: File name for testing ARFF
        :return: TRUE if evaluation was achievable
        """
        if self.classifierInstance is not None:
            print '[Using test data for evaluation]'
            try:
                testFileFullPath = dirconfig.arffPath + testDataArffFileName + '.arff'
                testData = self.loadArffData(testFileFullPath)

                if testData is not None:
                    # Evaluate using test data
                    evaluatorInstance = Evaluation(
                        data=self.classificationData)
                    evaluatorInstance.test_model(
                        classifier=self.classifierInstance, data=testData)

                    # Store evaluation results
                    self.setEvaluationResults(evaluatorInstance)

                    return True
            except:
                return False

        return False
コード例 #6
0
    def predict_proba(self, X):
        """Return the class distributions predicted for the rows of ``X``.

        ``X`` is exported to an ARFF file through ``self.to_arff`` (with a
        temporary empty ``class`` column), loaded back and run through
        ``self.classifier``.

        :param X: feature table; it is modified in place only temporarily
        :return: ``None`` if there are no predictions, the single distribution
                 as-is if there is exactly one, otherwise the distributions
                 stacked row-wise with ``np.vstack``
        """
        evaluation = Evaluation(self.train_data)

        # Add class column (we can't copy X, because this is a large object,
        # so we add the column and remove it later)
        X['class'] = None
        try:
            filename = self.to_arff(X, True)
        finally:
            # Always restore X, even when the export fails.
            del X['class']

        loader = Loader("weka.core.converters.ArffLoader")
        test_data = loader.load_file(filename)
        test_data.class_is_last()

        evaluation.test_model(self.classifier, test_data)

        # Collect first and stack once; stacking inside the loop re-copied the
        # accumulated array for every prediction.
        distributions = [pred.distribution for pred in evaluation.predictions]
        if not distributions:
            return None
        if len(distributions) == 1:
            # NOTE(review): kept for backward compatibility - a single
            # prediction has always been returned unstacked.
            return distributions[0]
        return np.vstack(distributions)
コード例 #7
0
ファイル: common_defs.py プロジェクト: silvianunes/hyperband
def train_and_eval_weka_classifier(clf, train, valid, n_instances):
    """Train ``clf`` on a subset of ``train`` and evaluate it on ``valid``.

    :param clf: classifier to build
    :param train: training dataset
    :param valid: validation dataset the built classifier is tested on
    :param n_instances: number of training instances to use; it is converted
        to a percentage of ``train`` for the split
    :return: dict with the keys ``loss``, ``accuracy``, ``auc`` and ``err``
    """
    total_train_inst = train.num_instances

    percentage = (n_instances * 100) / total_train_inst

    # Use the whole training set when the budget covers it; ``>=`` (rather
    # than ``==``) also handles budgets larger than the dataset, which would
    # otherwise request a split of more than 100 %.
    if percentage >= 100:
        opt = train
    else:
        opt, _ = train.train_test_split(percentage, Random(1))

    print('total_train_inst:    ', total_train_inst, '| percentage:    ',
          percentage, '| used_inst:     ', opt.num_instances)

    clf.build_classifier(opt)

    evl = Evaluation(opt)
    evl.test_model(clf, valid)

    acc = evl.percent_correct
    auc = evl.weighted_area_under_roc
    err = evl.error_rate
    log = evl.sf_mean_scheme_entropy

    print(
        "# validating  | loss: {:.2}, accuracy: {:.4}, AUC: {:.2}, error: {:.2}"
        .format(log, acc, auc, err))

    return {'loss': log, 'accuracy': acc, 'auc': auc, 'err': err}
コード例 #8
0
def test_classifier(dataset: Instances, classifier: Classifier, params: dict):
    """Grid-evaluate ``classifier`` over every combination of ``params``.

    For each combination the properties are set on the classifier, it is
    built on ``dataset`` and both its training-set accuracy and its 10-fold
    cross-validation accuracy are recorded, together with the value returned
    by the classifier's Java ``measureTreeSize`` method.

    :param dataset: data used for training, testing and cross-validation
    :param classifier: classifier whose properties are varied
    :param params: maps a property name to the list of values to try
    :return: defaultdict mapping column names to lists, one entry per combination
    """
    names = list(params.keys())
    candidate_values = list(params.values())

    results = defaultdict(list)

    for val_combo in itertools.product(*candidate_values):
        results["numInstances"].append(dataset.num_instances)
        results["numAttributes"].append(dataset.num_attributes)

        for name, value in dict(zip(names, val_combo)).items():
            results[name].append(value)
            classifier.set_property(
                name, typeconv.double_to_float(value)
                if isinstance(value, float) else value)

        classifier.build_classifier(dataset)

        train_evl = Evaluation(dataset)
        train_evl.test_model(classifier, dataset)
        results["Training_Accuracy"].append(train_evl.percent_correct)
        results["size"].append(
            int(javabridge.call(classifier.jobject, "measureTreeSize", "()D")))

        # Use a fresh Evaluation for cross-validation: an Evaluation object
        # accumulates statistics across calls, so reusing the one above mixed
        # the training-set predictions into the reported CV accuracy.
        cv_evl = Evaluation(dataset)
        cv_evl.crossvalidate_model(classifier, dataset, 10, Random(1))
        results["CV_Accuracy"].append(cv_evl.percent_correct)

    return results
コード例 #9
0
 def evaluation(self, classifier, trainingData, testingData = None):
     """Evaluate ``classifier`` by cross-validation or on a separate test set.

     The last attribute of ``trainingData`` is set as its class attribute.
     Without ``testingData`` a 10-fold cross-validation (seed 42) is run on
     the training data. With ``testingData`` the classifier is built on the
     training data and tested on the testing data, provided both have the
     same number of attributes.

     :return: the Evaluation object, or None when the attribute counts differ
              (the attributes of both datasets are printed in that case)
     """
     trainingData.set_class_index(trainingData.num_attributes() - 1)
     if testingData is None:
         evaluation = Evaluation(trainingData)  # initialize with priors
         evaluation.crossvalidate_model(classifier, trainingData, 10, Random(42))  # 10-fold CV
         return evaluation

     print("testing data exists")
     if testingData.num_attributes() == trainingData.num_attributes():
         testingData.set_class_index(testingData.num_attributes() - 1)
         evaluation = Evaluation(trainingData)

         classifier.build_classifier(trainingData)
         evaluation.test_model(classifier, testingData)
         return evaluation

     # Attribute counts differ: dump both attribute lists to help diagnose.
     print("testing Data doesn't have same attribute with training data")
     for attribute in trainingData.attributes():
         print("train:" + str(attribute))
     for attribute in testingData.attributes():
         print("test:" + str(attribute))
     return None
コード例 #10
0
    def score(self, testExamples, labels):
        """Write the examples to ``testingweka.arff`` and score the classifier on them.

        Every feature is declared as a ``real`` attribute and the label as a
        nominal ``{TRUE, FALSE}`` class (``TRUE`` when the label equals 1).

        :param testExamples: sequence of feature sequences; the first example
            determines the number of attributes
        :param labels: one label per example
        :return: the value of ``percent_correct()`` of the evaluation
        """
        # ``with`` guarantees the file is closed (and flushed) before Weka
        # reads it back, even if a write raises.
        with open("testingweka.arff", "w") as f:
            f.write("@relation randomset\n")
            for j in range(len(testExamples[0])):
                f.write("@attribute feature%d real\n" % j)
            f.write("@attribute class {TRUE, FALSE}\n")
            f.write("@data\n")
            for (example, label) in zip(testExamples, labels):
                for feature in example:
                    f.write("%f," % feature)
                if label == 1:
                    f.write("TRUE\n")
                else:
                    f.write("FALSE\n")

        loader = Loader(classname="weka.core.converters.ArffLoader")
        self.testingData = loader.load_file("testingweka.arff")
        self.testingData.set_class_index(self.testingData.num_attributes() - 1)

        evaluation = Evaluation(self.trainingData)
        evaluation.test_model(self.classifier, self.testingData)

        return evaluation.percent_correct()
コード例 #11
0
def main(args):
    """
    Load an ARFF dataset, split it 66/34 into train/test sets with a fixed
    random seed, train J48 on the training part and print the evaluation
    summary for the test part.

    :param args: the commandline arguments (optional, can be dataset filename)
    :type args: list
    """

    # pick the dataset: second argument if present, bundled vote.arff otherwise
    data_file = args[1] if len(args) > 1 else helper.get_data_dir() + os.sep + "vote.arff"
    helper.print_info("Loading dataset: " + data_file)
    arff_loader = Loader(classname="weka.core.converters.ArffLoader")
    dataset = arff_loader.load_file(data_file)
    dataset.class_is_last()

    # train/test split of the data
    train_set, test_set = dataset.train_test_split(66.0, Random(1))

    # fit the decision tree and show it
    tree = Classifier(classname="weka.classifiers.trees.J48")
    tree.build_classifier(train_set)
    print(tree)

    # score on the held-out part
    evaluation = Evaluation(train_set)
    evaluation.test_model(tree, test_set)
    print(evaluation.summary())
コード例 #12
0
    def calcError(self, newModel, test_data_of_kfold):
        '''Return the error (100 minus the percentage of correctly classified
        instances) of the model on the test data of a k-fold cross validation.

        :param newModel: classifier to test
        :param test_data_of_kfold: test fold, also used to initialize the evaluation
        '''
        evl = Evaluation(test_data_of_kfold)
        evl.test_model(newModel, test_data_of_kfold)

        return 100 - evl.percent_correct
コード例 #13
0
    def evaluation(self, classifier, trainingData, testingData=None):
        """Evaluate ``classifier`` by cross-validation or on a separate test set.

        The last attribute of ``trainingData`` is set as its class attribute.
        Without ``testingData`` a 10-fold cross-validation (seed 42) is run on
        the training data. With ``testingData`` the classifier is built on the
        training data and tested on the testing data, provided both have the
        same number of attributes.

        :return: the Evaluation object, or None when the attribute counts
                 differ (the attributes of both datasets are printed then)
        """
        trainingData.set_class_index(trainingData.num_attributes() - 1)
        if testingData is None:
            evaluation = Evaluation(trainingData)  # initialize with priors
            evaluation.crossvalidate_model(classifier, trainingData, 10,
                                           Random(42))  # 10-fold CV
            return evaluation

        print("testing data exists")
        if testingData.num_attributes() == trainingData.num_attributes():
            testingData.set_class_index(testingData.num_attributes() - 1)
            evaluation = Evaluation(trainingData)

            classifier.build_classifier(trainingData)
            evaluation.test_model(classifier, testingData)
            return evaluation

        # Attribute counts differ: dump both attribute lists to help diagnose.
        print("testing Data doesn't have same attribute with training data")
        for attribute in trainingData.attributes():
            print("train:" + str(attribute))
        for attribute in testingData.attributes():
            print("test:" + str(attribute))
        return None
コード例 #14
0
ファイル: wekaExperiments.py プロジェクト: aascode/DDV
    def crossTest(this, trainingFile, classifier, testFile):
        """Train on one ARFF file, test on another and store the results.

        After the call ``this.header`` holds the column names ("Accuracy" plus
        "<class> TP", "<class> FP" and "<class> AUC ROC" per class of the test
        data) and ``this.values`` the matching numbers; the TP/FP rates are
        multiplied by 100.

        :param trainingFile: ARFF file the classifier is built on
        :param classifier: Weka class name of the classifier
        :param testFile: ARFF file the classifier is tested on
        """
        loader = Loader(classname="weka.core.converters.ArffLoader")
        data1 = loader.load_file(trainingFile)
        data1.class_is_last()

        cls = Classifier(classname=classifier)
        cls.build_classifier(data1)

        data2 = loader.load_file(testFile)
        data2.class_is_last()

        classes = [str(code) for code in data2.class_attribute.values]
        header = ["Accuracy"]
        for name in classes:
            header += [name + " TP", name + " FP", name + " AUC ROC"]

        evl = Evaluation(data2)
        evl.test_model(cls, data2)

        values = [evl.percent_correct]
        # enumerate yields the class index directly instead of searching the
        # list again with classes.index(name) on every iteration.
        for index, _name in enumerate(classes):
            values += [
                evl.true_positive_rate(index) * 100,
                evl.false_positive_rate(index) * 100,
                evl.area_under_roc(index)
            ]

        this.values = values
        this.header = header
コード例 #15
0
def f_smote():
    """Oversample the minority class of the first 10000 training rows with
    SMOTE, train an LMT classifier on the result, then report 5-fold
    cross-validation metrics on the oversampled data and test metrics on
    ``test_data.csv``, plotting a ROC curve for each.
    """
    jvm.start()

    # only the training frame is used; the test set is re-read from CSV below
    raw_train, _ = b_i_impute_data()

    raw_train = raw_train[:10000]
    labels = raw_train["class"]
    features = raw_train.drop("class", axis=1)

    oversampler = SMOTE(ratio="minority")
    features_sm, labels_sm = oversampler.fit_sample(features, labels)

    # reassemble a frame with the class column first and dump it for Weka
    features_sm_df = pd.DataFrame(features_sm, columns=features.columns)
    labels_sm_df = pd.DataFrame(labels_sm, columns=["class"])
    oversampled_df = pd.concat([labels_sm_df, features_sm_df], axis=1)
    print_f("smote train data shape", oversampled_df.shape)
    oversampled_df.to_csv("./train_data_sm.csv", index=False)

    weka_train = converters.load_any_file("train_data_sm.csv")
    weka_train.class_is_first()

    weka_test = converters.load_any_file("test_data.csv")
    weka_test.class_is_first()

    print_f("1")
    lmt = Classifier(classname="weka.classifiers.trees.LMT")
    print_f("bulding classifier")
    lmt.build_classifier(weka_train)
    print_f("Evaluating")

    # cross-validated metrics on the oversampled training data
    train_eval = Evaluation(weka_train)
    train_eval.crossvalidate_model(lmt, weka_train, 5, Random(1))
    print_f("Train Accuracy:", train_eval.percent_correct)
    print_f("Train summary")
    print_f(train_eval.summary())
    print_f("Train class details")
    print_f(train_eval.class_details())
    print_f("Train confusion matrix")
    print_f(train_eval.confusion_matrix)
    plcls.plot_roc(train_eval,
                   class_index=[0, 1],
                   wait=True,
                   outfile="./plots/2_f_smote_10k.png")
    plt.suptitle("Train ROC Curve", fontsize=20, y=0.95)

    # metrics on the separate test file
    test_eval = Evaluation(weka_test)
    print_f("testing model")
    test_eval.test_model(lmt, weka_test)
    print_f("Test Accuracy:", test_eval.percent_correct)
    print_f("Test summary")
    print_f(test_eval.summary())
    print_f(" Testclass details")
    print_f(test_eval.class_details())
    print_f("Testconfusion matrix")
    print_f(test_eval.confusion_matrix)
    plcls.plot_roc(test_eval, class_index=[0, 1], wait=True)
    plt.suptitle("Test ROC Curve", fontsize=20, y=0.95)
    savefig("./plots/f_test_roc_curve.png")
コード例 #16
0
def ClassifyParam(mode, binWidths):
	"""Train and evaluate a Multinomial Naive Bayes text classifier per bin width.

	When "normal" is in ``mode``, for every bin width the matching train/test
	ARFF files under ``Data/arff`` are loaded, a FilteredClassifier
	(StringToWordVector + NaiveBayesMultinomial) is trained and evaluated, the
	timings are printed and the accuracy is appended to
	``classificationResults/AllVsAll.csv``. For any other mode only the result
	directory is created.

	:param mode: container/string checked for "normal"
	:param binWidths: iterable of bin widths used in the dataset file names
	"""
	if not os.path.exists("classificationResults"):
		os.makedirs("classificationResults")

	if "normal" not in mode:
		return

	# ``with`` closes the results file even if loading or training fails.
	with open("classificationResults/AllVsAll.csv", "w") as results_file:
		results_file.write("BinWidth, Accuracy\n")

		for binWidth in binWidths:

			train_set = "Data/arff/TrainSet_%s.arff"%(binWidth)
			test_set = "Data/arff/TestSet_%s.arff"%(binWidth)
			print("Loading Datasets...")

			train_data = converters.load_any_file(train_set)
			test_data = converters.load_any_file(test_set)
			# Set class attribute
			train_data.class_is_last()
			test_data.class_is_last()
			print("Dataset Loaded!")

			classifier_name = "weka.classifiers.meta.FilteredClassifier"

			classifier = Classifier(classname=classifier_name, options=[
				"-F", "weka.filters.unsupervised.attribute.StringToWordVector -R first-last -W 1000 -C -T -N 1 -stemmer weka.core.stemmers.NullStemmer -M 1 -tokenizer \"weka.core.tokenizers.WordTokenizer -delimiters \\\" \\\\r\\\\n\\\\t.,;:\\\\\\\'\\\\\\\"()?!\\\"\"",
				"-W", "weka.classifiers.bayes.NaiveBayesMultinomial"])

			start_train = time.time()
			classifier.build_classifier(train_data)
			end_train = time.time()
			print("Train\t%s\t%s"%(binWidth, end_train-start_train))

			# Time the classification of the first test instance only; stop
			# right after it instead of walking the whole dataset.
			for inst in test_data:
				start_sample = time.time()
				classifier.classify_instance(inst)
				end_sample = time.time()
				print("Sample\t%s\t%s"%(binWidth, end_sample-start_sample))
				break

			print("Evaluating w/ Multinomial Naive Bayes classifier. BinWidth = %s"%(binWidth))
			evaluation = Evaluation(test_data)
			start_batch = time.time()
			evaluation.test_model(classifier, test_data)
			end_batch = time.time()
			print("Batch\t%s\t%s"%(binWidth,end_batch-start_batch))

			print(evaluation.summary())
			acc = evaluation.percent_correct/100.0
			print("Percent correct: " + str(acc))

			results_file.write("%s, %s\n"%(binWidth, acc))
コード例 #17
0
def evaluate_classifier(cls, train_data, test_data):
    """
    Test an already trained classifier on a separate dataset.

    :param cls: trained classifier
    :param train_data: data to initialize priors with
    :param test_data: data the classifier is tested on
    :return: evaluation object
    """
    evaluation = Evaluation(train_data)
    evaluation.test_model(cls, test_data)
    return evaluation
コード例 #18
0
def main(args):
    """
    Load an ARFF dataset, split it 66/34 into train/test sets with a fixed
    random seed, train J48 on the training part and evaluate it on the test
    part. The predictions are recorded in two different ways:
    1. in-memory via the test_model method
    2. directly to file (more memory efficient), but a separate run of making predictions

    :param args: the commandline arguments (optional, can be dataset filename)
    :type args: list
    """

    # pick the dataset: second argument if present, bundled vote.arff otherwise
    data_file = args[1] if len(args) > 1 else helper.get_data_dir() + os.sep + "vote.arff"
    helper.print_info("Loading dataset: " + data_file)
    arff_loader = Loader(classname="weka.core.converters.ArffLoader")
    dataset = arff_loader.load_file(data_file)
    dataset.class_is_last()

    # train/test split of the data
    train_set, test_set = dataset.train_test_split(66.0, Random(1))

    # fit the decision tree and show it
    tree = Classifier(classname="weka.classifiers.trees.J48")
    tree.build_classifier(train_set)
    print(tree)

    csv_output_class = "weka.classifiers.evaluation.output.prediction.CSV"

    # variant 1: predictions are buffered in memory during evaluation
    helper.print_title("recording predictions in-memory")
    memory_output = PredictionOutput(
        classname=csv_output_class,
        options=["-distribution"])
    evaluation = Evaluation(train_set)
    evaluation.test_model(tree, test_set, output=memory_output)
    print(evaluation.summary())
    helper.print_info("Predictions:")
    print(memory_output.buffer_content())

    # variant 2: predictions are written straight to a file
    helper.print_title("recording/outputting predictions separately")
    outputfile = helper.get_tmp_dir() + "/j48_vote.csv"
    file_output = PredictionOutput(
        classname=csv_output_class,
        options=["-distribution", "-suppress", "-file", outputfile])
    file_output.header = test_set
    file_output.print_all(tree, test_set)
    helper.print_info("Predictions stored in:" + outputfile)
    # by using "-suppress" we don't store the output in memory, the following statement won't output anything
    print(file_output.buffer_content())
def ClassifyWithDT(f3, test, tree , fileOut) :
    """Build ``tree`` on ``f3``, test it on ``test`` and log the metrics.

    Writes the accuracy, the error rate and a precision / recall / area under
    ROC row per class to ``fileOut``.

    :param f3: training dataset
    :param test: test dataset; its first instance supplies the class count
    :param tree: classifier to build and test
    :param fileOut: writable file-like object receiving the report
    :return: the Evaluation object
    """
    # Named ``evaluation`` rather than ``eval`` to avoid shadowing the builtin.
    evaluation = Evaluation(f3)
    tree.build_classifier(f3)

    evaluation.test_model(tree, test)

    fileOut.write("\n\nSelf-Training   data========"+str((1-evaluation.error_rate)*100)+" number of instances=="+str(f3.num_instances)+"\n")
    fileOut.write("\n Error Rate=="+str(evaluation.error_rate) + "\n")

    fileOut.write("\n     precision   recall     areaUnderROC            \n\n")
    for i in range(test.get_instance(0).num_classes) :
        fileOut.write(str(evaluation.precision(i)) +"  "+str(evaluation.recall(i)) + "  "  +  str(evaluation.area_under_roc(i))+"\n")

    return evaluation
コード例 #20
0
def evaluate_classifier(cls, data, crossvalidate=False, n_folds=10):
    """
    Evaluate a classifier on a dataset, directly or by cross-validation.

    :param cls: trained classifier
    :param data: data to test the model on
    :param crossvalidate: True to use crossvalidation
    :param n_folds: number of folds to cross validate for
    :return: evaluation object
    """
    evaluation = Evaluation(data)
    if not crossvalidate:
        evaluation.test_model(cls, data)
        return evaluation

    evaluation.crossvalidate_model(cls, data, n_folds, Random(5))
    return evaluation
def build_and_classify(classifier, classifier_name, approach_name, infile, percentage='10'):
    """
    Build the classifier on a percentage of the CSV data and test it on the
    whole (nominalized, shuffled) data.

    Returns ``[approach_name, classifier_name, percentage, percent_correct,
    weighted_f_measure]``.
    """

    def run_filter(classname, options, dataset):
        # configure a Weka filter for ``dataset`` and return the filtered copy
        weka_filter = Filter(classname=classname, options=options)
        weka_filter.inputformat(dataset)
        return weka_filter.filter(dataset)

    # fixed seed so results are consistent
    random.seed('iot')

    # read the CSV; the class is the last column
    data = Loader(classname='weka.core.converters.CSVLoader').load_file(infile)
    data.class_is_last()

    # numeric -> nominal for every attribute
    data = run_filter('weka.filters.unsupervised.attribute.NumericToNominal',
                      ['-R', 'first-last'], data)

    # shuffle with a constant seed
    data = run_filter('weka.filters.unsupervised.instance.Randomize',
                      ['-S', '42'], data)

    # training set: RemovePercentage with inverted selection (-V)
    train = run_filter('weka.filters.unsupervised.instance.RemovePercentage',
                       ['-P', percentage, '-V'], data)
    # NOTE(review): the test set is the full data, so it also contains the
    # training rows - confirm this is intended.
    test = data

    # fit and score
    classifier.build_classifier(train)
    evaluation = Evaluation(train)
    evaluation.test_model(classifier, test)

    return [
        approach_name,
        classifier_name,
        percentage,
        evaluation.percent_correct,
        evaluation.weighted_f_measure,
    ]
コード例 #22
0
def test_weka_classifier(clf, train, test):
    """Build ``clf`` on ``train``, test it on ``test``, print the metrics and
    return them as a dict with the keys ``loss``, ``accuracy``, ``auc`` and
    ``err``.
    """
    clf.build_classifier(train)

    evaluation = Evaluation(train)
    evaluation.test_model(clf, test)

    accuracy = evaluation.percent_correct
    area_under_roc = evaluation.weighted_area_under_roc
    error_rate = evaluation.error_rate
    loss = evaluation.sf_mean_scheme_entropy

    template = "# testing  | loss: {:.2}, accuracy: {:.4}, AUC: {:.2}, error: {:.2}"
    print(template.format(loss, accuracy, area_under_roc, error_rate))

    return {
        'loss': loss,
        'accuracy': accuracy,
        'auc': area_under_roc,
        'err': error_rate,
    }
コード例 #23
0
def case2():
    """Ask for a test ARFF file, test the classifier on it and print the
    confusion matrix.
    """
    loader1 = Loader(classname="weka.core.converters.ArffLoader")
    test_file = input("Enter the name of the test file:")
    data1 = loader1.load_file(test_file)
    data1.class_is_last()
    evaluation = Evaluation(data1)
    # NOTE(review): ``cls`` is not defined in this function; it is expected to
    # be a module-level classifier - confirm against the rest of the file.
    evaluation.test_model(cls, data1)
    print(evaluation.matrix("=== (confusion matrix) ==="))
def index():
    """Serve the form page on GET; on POST append the submitted record to
    ``instances.arff``, classify it with the model stored in ``J48.model``
    and return the prediction of the last instance as a JSON response.
    """
    if request.method == "GET":
        return render_template('bot.html')
    if request.method == "POST":
        jvm.start()
        args = request.form.to_dict()
        weight_lb = float(args['weight']) * 2.20462
        bmi = (weight_lb / pow(float(args['height']), 2)) * 703
        hypertensive_status = '1' if args['hypertensive_status'] == "Yes" else '0'
        heart_disease_status = '1' if args['heart_disease_status'] == "Yes" else '0'

        # NOTE(review): form values are written into the ARFF file unescaped;
        # a value containing a comma, quote or newline would corrupt the file.
        st = "\n" + ",".join([
            args['gender'],
            args['age'],
            hypertensive_status,
            heart_disease_status,
            args['marrital_status'],
            args['work_type'],
            args['residence'],
            args['hypertension'],
            str(bmi),
            "'" + args['smoking_status'].lower() + "'",
            "?",
        ])
        print(st)
        # Open the file only once the row is complete and let ``with`` close
        # it, so a bad request can no longer leak an open handle.
        with open("instances.arff", "a") as f:
            f.write(st)

        objects = serialization.read_all("J48.model")
        loader = Loader(classname="weka.core.converters.ArffLoader")
        csr = Classifier(jobject=objects[0])
        output_results = PredictionOutput(
            classname="weka.classifiers.evaluation.output.prediction.CSV")
        data1 = loader.load_file("instances.arff")
        data1.class_is_last()
        ev2 = Evaluation(data1)
        ev2.test_model(csr, data1, output_results)

        TESTDATA = StringIO("Instance,Actual,Predicted," +
                            output_results.buffer_content())
        df = pd.read_csv(TESTDATA)
        # The last row belongs to the record appended above.
        prediction = list(df.Predicted).pop().split(":")[1]
        print(prediction)
        response = {"status": "200", "prediction": prediction}
        return Response(json.dumps(response, indent=2),
                        mimetype="application/json")
コード例 #25
0
ファイル: classify.py プロジェクト: cdw2/data_mining_cw2
    def run_bayes_hill_split(self, output_directory, parents=1):
        """Train a BayesNet (HillClimber search, SimpleEstimator) on the
        training split, evaluate it on the testing split and save both the
        textual report and the classifier graph.

        :param output_directory: directory handed to ``self.save_results``
        :param parents: value passed to the search algorithm's ``-P`` option
        """
        parents_text = str(parents)

        # --- train ---
        print("\nBuilding Bayes Classifier on training data. Parents = " +
              parents_text + "\n")
        phase_start = time.time()
        bayes_options = [
            "-D", "-Q",
            "weka.classifiers.bayes.net.search.local.HillClimber", "--",
            "-P", parents_text, "-S", "BAYES", "-E",
            "weka.classifiers.bayes.net.estimate.SimpleEstimator", "--",
            "-A", "0.5",
        ]
        bayes_net = Classifier(classname="weka.classifiers.bayes.BayesNet",
                               options=bayes_options)
        bayes_net.build_classifier(self.training_data)

        report = self.print_both(str(bayes_net), "")

        build_message = ("Bayes Split Classifier Built in "
                         + str(time.time() - phase_start) + " secs.\n")
        report = self.print_both(build_message, report)

        # --- evaluate ---
        report = self.print_both("\nEvaluating on test data.", report)

        phase_start = time.time()
        evaluation = Evaluation(self.training_data)
        evaluation.test_model(bayes_net, self.testing_data)

        summary_text = str(evaluation.summary())
        report = self.print_both(summary_text, report)
        details_text = str(evaluation.class_details())
        report = self.print_both(details_text, report)
        matrix_text = str(evaluation.confusion_matrix)
        report = self.print_both(matrix_text, report)

        eval_message = ("\nBayes Split Classifier Evaluated in "
                        + str(time.time() - phase_start) + " secs.\n")
        report = self.print_both(eval_message, report)

        # --- persist report and graph ---
        result_prefix = "Bayes_Hill_P" + parents_text + "_"
        self.save_results(result_prefix, report, output_directory)
        self.save_results(result_prefix + "Graph", bayes_net.graph,
                          output_directory, True)
コード例 #26
0
    def train_weka_model(self,
                         training_data_dir,
                         save_model_dir,
                         log_file,
                         mimic_env=None):
        """
        Train an M5P model tree on a CSV file, save it and score it on its
        own training data.

        :param training_data_dir: path of the CSV training file (class is last)
        :param save_model_dir: path the trained classifier is serialized to
        :param log_file: not used by this method
        :param mimic_env: object providing ``get_return(state=..., ...)``;
            when None the three return values are None instead of the call
            failing with an AttributeError
        :return: tuple ``(return_value_log, return_value_log_struct,
            return_value_var_reduction, mae, rmse, leaves_number)``
        """
        loader = Loader(classname="weka.core.converters.CSVLoader")
        training_data = loader.load_file(training_data_dir)
        training_data.class_is_last()

        self.classifier = Classifier(classname="weka.classifiers.trees.M5P",
                                     options=self.options)
        # classifier help, check https://weka.sourceforge.io/doc.dev/weka/classifiers/trees/M5P.html
        self.classifier.build_classifier(training_data)
        graph = self.classifier.graph
        # The node count is parsed from the third-to-last line of the graph
        # text (first token with 'N' stripped); presumably that line names
        # the last node - TODO confirm against the M5P graph format.
        node_number = float(graph.split('\n')[-3].split()[0].replace('N', ''))
        leaves_number = node_number / 2
        serialization.write(save_model_dir, self.classifier)

        evaluation = Evaluation(training_data)
        predicts = evaluation.test_model(self.classifier, training_data)

        # Group the instance indices by their predicted value.
        predict_dictionary = {}
        for predict_index, predict_value in enumerate(predicts):
            predict_dictionary.setdefault(predict_value, []).append(predict_index)

        if mimic_env is None:
            return_value_log = None
            return_value_log_struct = None
            return_value_var_reduction = None
        else:
            return_value_log = mimic_env.get_return(
                state=list(predict_dictionary.values()))
            return_value_log_struct = mimic_env.get_return(
                state=list(predict_dictionary.values()),
                apply_structure_cost=True)
            return_value_var_reduction = mimic_env.get_return(
                state=list(predict_dictionary.values()),
                apply_variance_reduction=True)

        # The error figures are read from fixed lines of the textual summary.
        summary = evaluation.summary()
        numbers = summary.split('\n')
        mae = float(numbers[2].split()[-1])
        rmse = float(numbers[3].split()[-1])

        return return_value_log, return_value_log_struct, \
               return_value_var_reduction, mae, rmse, leaves_number
コード例 #27
0
ファイル: classify.py プロジェクト: cdw2/data_mining_cw3
    def run_split(self, output_directory, classifier_name,
                  classifier_weka_spec, options_list):
        """Train the given Weka classifier on the training split, evaluate it
        on the testing split and save the collected report.

        :param output_directory: directory handed to ``self.save_results``
        :param classifier_name: label used in messages and the result name
        :param classifier_weka_spec: Weka class name of the classifier
        :param options_list: classifier options; also appended to the result name
        """
        # --- train ---
        print("\nBuilding " + classifier_name +
              " Classifier on training data.")
        phase_start = time.time()
        model = Classifier(classname=classifier_weka_spec, options=options_list)
        model.build_classifier(self.training_data)

        report = self.print_both(str(model), "")

        build_message = (classifier_name + "Split Classifier Built in "
                         + str(time.time() - phase_start) + " secs.\n")
        report = self.print_both(build_message, report)

        # --- evaluate ---
        report = self.print_both("\nEvaluating on test data.", report)

        phase_start = time.time()
        evaluation = Evaluation(self.training_data)
        evaluation.test_model(model, self.testing_data)

        summary_text = str(evaluation.summary())
        report = self.print_both(summary_text, report) + "\n"
        details_text = str(evaluation.class_details())
        report = self.print_both(details_text, report) + "\n"
        matrix_text = str(evaluation.confusion_matrix)
        report = self.print_both(matrix_text, report)

        eval_message = ("\n\n" + classifier_name + "Classifier Evaluated in "
                        + str(time.time() - phase_start) + " secs.\n")
        report = self.print_both(eval_message, report)

        # Result name suffix: all options concatenated, with '.' and '-'
        # both ending up as '_'.
        options_string = "".join(str(option) for option in options_list)
        options_string = options_string.replace(".", "-").replace("-", "_")

        # --- persist ---
        self.save_results(classifier_name + options_string + "_Split",
                          report, output_directory)
コード例 #28
0
def e_model_tree():
    """Train a Weka LMT model and report train and test metrics with ROC plots.

    Loads ``train_data.csv`` and ``test_data.csv`` (class attribute first),
    builds an LMT classifier on the training data, prints 5-fold
    cross-validation metrics for the training data and hold-out metrics for
    the test data, and saves one ROC plot for each under ``./plots/``.

    The JVM is started here and left running.
    """
    # The CSV files are presumably exported beforehand from
    # b_i_impute_data() (the export used to live here) - TODO confirm.
    jvm.start()
    train_data = converters.load_any_file("train_data.csv")
    train_data.class_is_first()

    test_data = converters.load_any_file("test_data.csv")
    test_data.class_is_first()

    print("1")
    cls = Classifier(classname="weka.classifiers.trees.LMT")
    print("2")
    cls.build_classifier(train_data)

    print("3")
    evl = Evaluation(train_data)
    evl.crossvalidate_model(cls, train_data, 5, Random(1))
    print("Train Accuracy:", evl.percent_correct)
    print("Train summary")
    print(evl.summary())
    print("Train class details")
    print(evl.class_details())
    print("Train confusion matrix")
    print(evl.confusion_matrix)
    # Draw without blocking: when plot_roc() waits for the window to be
    # closed, the figure no longer exists by the time the title is set and
    # the file is written, so the saved image would be empty.
    plcls.plot_roc(evl, class_index=[0, 1], wait=False)
    plt.suptitle("Train ROC Curve", fontsize=20, y=0.95)
    savefig("./plots/e_train_roc_curve.png")
    plt.show()  # still display the plot, but only after it has been saved

    evl = Evaluation(test_data)
    evl.test_model(cls, test_data)
    print("Test Accuracy:", evl.percent_correct)
    print("Test summary")
    print(evl.summary())
    print(" Testclass details")
    print(evl.class_details())
    print("Testconfusion matrix")
    print(evl.confusion_matrix)
    plcls.plot_roc(evl, class_index=[0, 1], wait=False)
    plt.suptitle("Test ROC Curve", fontsize=20, y=0.95)
    savefig("./plots/e_test_roc_curve.png")
    plt.show()
コード例 #29
0
def case2():
    """Ask for a serialized model and an ARFF test file and print the matrix.

    The model is deserialized, evaluated on the test set (class attribute
    last) and the resulting confusion matrix is printed.
    """
    arff_loader = Loader(classname="weka.core.converters.ArffLoader")
    model_path = input("Enter the name of the  model file:")
    model = Classifier(jobject=serialization.read(model_path))
    test_path = input("Enter the name of the test file:")
    test_set = arff_loader.load_file(test_path)
    test_set.class_is_last()
    evaluation = Evaluation(test_set)
    evaluation.test_model(model, test_set)
    print(evaluation.matrix("=== (confusion matrix) ==="))
コード例 #30
0
ファイル: ui.py プロジェクト: desaianeri/BTechProject
def HOV(dataset, algo, num_datasets):
	"""Evaluate ``algo`` on a 70/30 split of ``dataset`` and return the AUC.

	HOV presumably stands for hold-out validation - TODO confirm.

	:param dataset: path of the ARFF file to load (class attribute last)
	:param algo: Weka classifier class name
	:param num_datasets: not used by this function
	:return: area under the ROC curve for class index 1
	"""
	loader = Loader(classname="weka.core.converters.ArffLoader")
	data = loader.load_file(dataset)
	data.class_is_last()

	# fixed seed, so the split is the same on every call
	train, test = data.train_test_split(70.0, Random(10))

	cls = Classifier(classname=algo)
	cls.build_classifier(train)

	evl = Evaluation(train)
	evl.test_model(cls, test)

	print(evl.summary("=== " +str(algo)+ " on" + str(dataset) + " ===",False))
	print(evl.matrix("=== on click prediction(confusion matrix) ==="))
	auc = evl.area_under_roc(1)
	print("For Algo"+ str(algo)+"areaUnderROC/1: for HOV " + str(auc))

	return auc
コード例 #31
0
def train_and_eval_weka_classifier(clf, train, valid, n_instances):
    """Train ``clf`` on about ``n_instances`` rows of ``train``, score on ``valid``.

    :param clf: Weka classifier wrapper; it is (re)built by this call
    :param train: training dataset
    :param valid: validation dataset
    :param n_instances: number of training instances to use; values at or
        above the size of ``train`` use the whole training set
    :return: dict with the keys ``'loss'`` (``sf_mean_scheme_entropy``),
        ``'accuracy'``, ``'auc'`` (weighted) and ``'err'``
    """
    total_train_inst = train.num_instances

    percentage = (n_instances * 100) / total_train_inst

    # train_test_split() only accepts percentages strictly between 0 and
    # 100, so a request for the whole set (or more) skips the split.
    if percentage >= 100:
        opt = train
    else:
        opt, _ = train.train_test_split(percentage, Random(1))

    print('total_train_inst:    ', total_train_inst, '| percentage:    ',
          percentage, '| used_inst:     ', opt.num_instances)

    clf.build_classifier(opt)

    evl = Evaluation(opt)
    evl.test_model(clf, valid)

    acc = evl.percent_correct
    auc = evl.weighted_area_under_roc
    err = evl.error_rate
    log = evl.sf_mean_scheme_entropy

    print(
        "# validating  | loss: {:.2}, accuracy: {:.4}, AUC: {:.2}, error: {:.2}"
        .format(log, acc, auc, err))

    return {'loss': log, 'accuracy': acc, 'auc': auc, 'err': err}
コード例 #32
0
def HOV(dataset, algo):
    """Evaluate ``algo`` on a 70/30 split of ``dataset``.

    HOV presumably stands for hold-out validation - TODO confirm.

    :param dataset: path of the ARFF file to load (class attribute last)
    :param algo: Weka classifier class name
    :return: area under the ROC curve for class index 1, as a string
    """
    print("inside hov")
    print("dataset ----" + dataset)
    print("algorithm ----" + algo)

    # The JVM is not started here; it has to be running already.
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(dataset)
    data.class_is_last()

    # fixed seed, so the split is the same on every call
    train, test = data.train_test_split(70.0, Random(10))

    cls = Classifier(classname=algo)
    cls.build_classifier(train)

    evl = Evaluation(train)
    evl.test_model(cls, test)

    return str(evl.area_under_roc(1))
コード例 #33
0
def get_action(temp_data_dir, classifier, loader):
    """Choose an action from the classifier's outputs for the given data.

    The data at ``temp_data_dir`` is loaded with ``loader`` (class attribute
    last) and run through ``Evaluation.test_model``; the position of the
    largest returned value is looked up in ``ACTION_LIST``.

    :return: ``[1, 0]`` when the selected action equals 0, else ``[0, 1]``
    """
    dataset = loader.load_file(temp_data_dir)
    dataset.class_is_last()

    q_values = Evaluation(dataset).test_model(classifier, dataset).tolist()
    best_index = q_values.index(max(q_values))
    return [1, 0] if ACTION_LIST[best_index] == 0 else [0, 1]
コード例 #34
0
def DecisionTree(rnd_data, folds, seed, data):
    """Cross-validate a J48 tree on manually built folds and print the summary.

    The dataset is cut into ``folds`` consecutive slices; each slice is the
    test set once while the remaining instances form the training set.

    :param rnd_data: dataset that is split into folds, in its current order
    :param folds: number of folds
    :param seed: only printed in the report
    :param data: dataset whose relation name is printed in the report
    """
    data_size = rnd_data.num_instances
    fold_size = data_size // folds

    # cross-validation
    evaluation = Evaluation(rnd_data)
    for i in range(folds):
        test_start = i * fold_size
        test_end = test_start + fold_size
        # The last fold absorbs whatever the integer division left over.
        # The test range and the training range must use the same end index,
        # otherwise the leftover instances are trained on and tested on at
        # the same time (or never tested at all).
        if i == folds - 1:
            test_end = data_size
        test = Instances.copy_instances(
            rnd_data, test_start,
            test_end - test_start)  # generate validation fold
        if i == 0:
            train = Instances.copy_instances(rnd_data, test_end,
                                             data_size - test_end)
        else:
            train_1 = Instances.copy_instances(rnd_data, 0, test_start)
            train_2 = Instances.copy_instances(rnd_data, test_end,
                                               data_size - test_end)
            train = Instances.append_instances(
                train_1, train_2)  # generate training fold

        # build and evaluate classifier
        cls = Classifier(classname="weka.classifiers.trees.J48")
        cls.build_classifier(train)  # build classifier on training set
        evaluation.test_model(cls,
                              test)  # test classifier on validation/test set

    print("")
    print("=== Decision Tree ===")
    print("Classifier: " + cls.to_commandline())
    print("Dataset: " + data.relationname)
    print("Folds: " + str(folds))
    print("Seed: " + str(seed))
    print("")
    print(
        evaluation.summary("=== " + str(folds) + "-fold Cross-Validation ==="))
def main():
    """
    Cross-validates J48 on the vote dataset fold by fold and collects the
    predictions made on every test fold.

    Per fold, a copy of the classifier is built on the training split and
    evaluated on the test split, and an AddClassification filter trained on
    the same training split adds classification, distribution and error
    attributes to the test split. The evaluation summary and the combined
    predictions are printed at the end.
    """

    # load a dataset
    data_file = helper.get_data_dir() + os.sep + "vote.arff"
    helper.print_info("Loading dataset: " + data_file)
    loader = Loader("weka.core.converters.ArffLoader")
    data = loader.load_file(data_file)
    data.class_is_last()

    # classifier
    classifier = Classifier(classname="weka.classifiers.trees.J48")

    # randomize data
    folds = 10
    seed = 1
    rnd = Random(seed)
    rand_data = Instances.copy_instances(data)
    rand_data.randomize(rnd)
    if rand_data.class_attribute.is_nominal:
        rand_data.stratify(folds)

    # perform cross-validation and add predictions
    predicted_data = None
    evaluation = Evaluation(rand_data)
    # range() instead of xrange(): the latter does not exist on Python 3
    for i in range(folds):
        train = rand_data.train_cv(folds, i)
        # the above code is used by the StratifiedRemoveFolds filter,
        # the following code is used by the Explorer/Experimenter
        # train = rand_data.train_cv(folds, i, rnd)
        test = rand_data.test_cv(folds, i)

        # build and evaluate classifier
        cls = Classifier.make_copy(classifier)
        cls.build_classifier(train)
        evaluation.test_model(cls, test)

        # add predictions
        addcls = Filter(
            classname="weka.filters.supervised.attribute.AddClassification",
            options=["-classification", "-distribution", "-error"])
        # setting the java object directory avoids issues with correct quoting in option array
        addcls.set_property("classifier", Classifier.make_copy(classifier))
        addcls.inputformat(train)
        addcls.filter(train)  # trains the classifier
        pred = addcls.filter(test)
        if predicted_data is None:
            # empty dataset with the header of the filtered output
            predicted_data = Instances.template_instances(pred, 0)
        for n in range(pred.num_instances):
            predicted_data.add_instance(pred.get_instance(n))

    print("")
    print("=== Setup ===")
    print("Classifier: " + classifier.to_commandline())
    print("Dataset: " + data.relationname)
    print("Folds: " + str(folds))
    print("Seed: " + str(seed))
    print("")
    print(evaluation.summary("=== " + str(folds) + " -fold Cross-Validation ==="))
    print("")
    print(predicted_data)
コード例 #36
0
ファイル: pipeline.py プロジェクト: niruhan/candis
    def runner(self, cdat, heap_size = 16384, seed = None, verbose = True):
        """
        Runs the configured pipeline: splits the data into a training and a
        testing set, performs the enabled attribute selections, then builds,
        serializes and evaluates every enabled model. Progress is appended to
        ``self.logs``, stage statuses are updated along the way and the model
        summaries are written to a ``.cgist`` file next to the input.

        :param cdat: input data object; unused while the ARFF export below
                     stays disabled
        :param heap_size: maximum JVM heap size in megabytes
        :param seed: seed for the train/test split; a random value between 0
                     and 1000 is used when None
        :param verbose: unused while the ARFF export below stays disabled
        """
        self.set_status(Pipeline.RUNNING)

        self.logs.append('Initializing Pipeline')

        para = self.config

        self.logs.append('Reading Pipeline Configuration')

        head = ''
        name = get_rand_uuid_str()

        self.logs.append('Reading Input File')

        for i, stage in enumerate(self.stages):
            if stage.code in ('dat.fle', 'prp.bgc', 'prp.nrm', 'prp.pmc', 'prp.sum'):
                self.stages[i].status = Pipeline.RUNNING
            if stage.code ==  'dat.fle':
                head    = os.path.abspath(stage.value.path)
                name, _ = os.path.splitext(stage.value.name)

        self.logs.append('Parsing to ARFF')

        path = os.path.join(head, '{name}.arff'.format(name = name))
        # This bug, I don't know why, using Config.schema instead.
        # cdat.toARFF(path, express_config = para.Preprocess.schema, verbose = verbose)

        for i, stage in enumerate(self.stages):
            if stage.code in ('dat.fle', 'prp.bgc', 'prp.nrm', 'prp.pmc', 'prp.sum'):
                self.stages[i].status = Pipeline.COMPLETE

        self.logs.append('Saved ARFF at {path}'.format(path = path))
        self.logs.append('Splitting to Training and Testing Sets')

        JVM.start(max_heap_size = '{size}m'.format(size = heap_size))

        load = Loader(classname = 'weka.core.converters.ArffLoader')
        # data = load.load_file(path)
        # save =  Saver(classname = 'weka.core.converters.ArffSaver')
        data = load.load_file(os.path.join(head, 'iris.arff')) # For Debugging Purposes Only
        data.class_is_last() # For Debugging Purposes Only
        # data.class_index = cdat.iclss

        for i, stage in enumerate(self.stages):
            if stage.code == 'prp.kcv':
                self.stages[i].status = Pipeline.RUNNING

        self.logs.append('Splitting Training Set')

        # TODO - Check if this seed is worth it.
        seed = assign_if_none(seed, random.randint(0, 1000))
        opts = ['-S', str(seed), '-N', str(para.Preprocess.FOLDS)]
        # The same filter is run twice: with '-V' for the training set and
        # without it for the testing set.
        wobj = Filter(classname = 'weka.filters.supervised.instance.StratifiedRemoveFolds', options = opts + ['-V'])
        wobj.inputformat(data)

        tran = wobj.filter(data)

        self.logs.append('Splitting Testing Set')

        wobj.options = opts
        test = wobj.filter(data)

        for i, stage in enumerate(self.stages):
            if stage.code == 'prp.kcv':
                self.stages[i].status = Pipeline.COMPLETE

        self.logs.append('Performing Feature Selection')

        feat = [ ]
        for comb in para.FEATURE_SELECTION:
            if comb.USE:
                for i, stage in enumerate(self.stages):
                    if stage.code == 'ats':
                        search    = stage.value.search.name
                        evaluator = stage.value.evaluator.name

                        if search == comb.Search.NAME and evaluator == comb.Evaluator.NAME:
                            self.stages[i].status = Pipeline.RUNNING

                # The options belong to the Weka object, not to str.format(),
                # where they used to be silently ignored.
                srch = ASSearch(
                    classname = 'weka.attributeSelection.{classname}'.format(classname = comb.Search.NAME),
                    options   = assign_if_none(comb.Search.OPTIONS, [ ])
                )
                ewal = ASEvaluation(
                    classname = 'weka.attributeSelection.{classname}'.format(classname = comb.Evaluator.NAME),
                    options   = assign_if_none(comb.Evaluator.OPTIONS, [ ])
                )

                attr = AttributeSelection()
                attr.search(srch)
                attr.evaluator(ewal)
                attr.select_attributes(tran)

                meta = addict.Dict()
                meta.search    = comb.Search.NAME
                meta.evaluator = comb.Evaluator.NAME
                meta.features  = [tran.attribute(index).name for index in attr.selected_attributes]

                # NOTE(review): feat is collected but not used later in this
                # method - confirm whether it should be stored in self.gist.
                feat.append(meta)

                for i, stage in enumerate(self.stages):
                    if stage.code == 'ats':
                        search    = stage.value.search.name
                        evaluator = stage.value.evaluator.name

                        if search == comb.Search.NAME and evaluator == comb.Evaluator.NAME:
                            self.stages[i].status = Pipeline.COMPLETE

        models = [ ]
        for model in para.MODEL:
            if model.USE:
                summary         = addict.Dict()

                self.logs.append('Modelling {model}'.format(model = model.LABEL))

                summary.label   = model.LABEL
                summary.name    = model.NAME
                summary.options = assign_if_none(model.OPTIONS, [ ])

                for i, stage in enumerate(self.stages):
                    if stage.code == 'lrn' and stage.value.name == model.NAME:
                        self.stages[i].status = Pipeline.RUNNING

                # class indices handed to the ROC/PRC plots below
                for i, instance in enumerate(data):
                    iclass = list(range(instance.num_classes))

                options    = assign_if_none(model.OPTIONS, [ ])
                classifier = Classifier(classname = 'weka.classifiers.{classname}'.format(classname = model.NAME), options = options)
                classifier.build_classifier(tran)

                serializer.write(os.path.join(head, '{name}.{classname}.model'.format(
                        name = name,
                    classname = model.NAME
                )), classifier)

                self.logs.append('Testing model {model}'.format(model = model.LABEL))

                evaluation       = Evaluation(tran)
                evaluation.test_model(classifier, test)

                summary.summary  = evaluation.summary()

                frame  = pd.DataFrame(data = evaluation.confusion_matrix)
                axes   = sns.heatmap(frame, cbar = False, annot = True)
                b64str = get_b64_plot(axes)

                summary.confusion_matrix = addict.Dict({
                    'value': evaluation.confusion_matrix.tolist(),
                     'plot': b64str
                })

                self.logs.append('Plotting Learning Curve for {model}'.format(model = model.LABEL))

                # Each plot is rendered into an in-memory buffer and stored
                # base64-encoded in the summary.
                buffer = io.BytesIO()
                plot_classifier_errors(evaluation.predictions, tran, test, outfile = buffer, wait = False)
                b64str = buffer_to_b64(buffer)

                summary.learning_curve   = b64str

                buffer = io.BytesIO()
                plot_roc(evaluation, class_index = iclass, outfile = buffer, wait = False)
                b64str = buffer_to_b64(buffer)

                summary.roc_curve        = b64str

                buffer = io.BytesIO()
                plot_prc(evaluation, class_index = iclass, outfile = buffer, wait = False)
                b64str = buffer_to_b64(buffer)

                summary.prc_curve        = b64str

                if classifier.graph:
                    summary.graph = classifier.graph

                # NOTE(review): the predictions computed here are discarded.
                for i, instance in enumerate(test):
                    prediction = classifier.classify_instance(instance)

                for i, stage in enumerate(self.stages):
                    if stage.code == 'lrn' and stage.value.name == model.NAME:
                        self.stages[i].status = Pipeline.COMPLETE

                models.append(summary)

        self.gist.models = models

        JVM.stop()

        JSON.write(os.path.join(head, '{name}.cgist'.format(name = name)), self.gist)

        self.logs.append('Pipeline Complete')

        self.set_status(Pipeline.COMPLETE)
コード例 #37
0
ファイル: class-1.6.py プロジェクト: echavarria/wekamooc
# Load the dataset named by fname and use its last attribute as the class.
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file(fname)
data.set_class_index(data.num_attributes() - 1)

# scatter plot of petalwidth against petallength; does not block
pld.scatter_plot(
    data, data.get_attribute_by_name("petalwidth").get_index(),
    data.get_attribute_by_name("petallength").get_index(),
    wait=False)

# add J48's classification and error attributes to the dataset and print it
addcls = Filter(
    classname="weka.filters.supervised.attribute.AddClassification",
    options=["-W", "weka.classifiers.trees.J48", "-classification", "-error"])
addcls.set_inputformat(data)
filtered = addcls.filter(data)
print(filtered)

# build J48 and evaluate it on the same data it was trained on
cls = Classifier(classname="weka.classifiers.trees.J48")
cls.build_classifier(data)
evl = Evaluation(data)
evl.test_model(cls, data)

# plot classifier errors; blocks until the plot window is closed
plc.plot_classifier_errors(evl.predictions(), wait=True)

jvm.stop()

コード例 #38
0
ファイル: wekaWrapper.py プロジェクト: dtalbert3/nemo
	def run(self):
		"""Train and test the configured classifier and optionally predict a patient.

		Builds ``self.classifier`` on the learner data, evaluates it on the
		test data and stores the accuracy (``self.acc``), the confusion matrix
		as JSON (``self.matrix``), the serialized model bytes (``self.model``)
		and, for predictions, ``self.prediction``.

		:return: False when the learner or test data is empty, True otherwise
		         (including the early exit when a prediction is requested but
		         no patient is defined)
		"""
		# Attach JVM
		javabridge.attach()

		# Debug
		print("Classifier")
		print(self.classifier)
		print("Params")
		print(self.parameters)
		print("Model Params")
		print(self.modelParams)

		# Get data for testing and learning
		learnerData = self.retrieveData(self.questionID, "learner")
		testData = self.retrieveData(self.questionID, 'test')
		masterData = self.retrieveData(self.questionID, 'all')
		masterData = self.addNominals(masterData)

		# Check if there is enough correct data to run
		if (learnerData.num_instances < 1 or testData.num_instances < 1):
			self.status = self.config.NOT_ENOUGH_DATA
			return False

		# If this is a prediction and there is a valid patient, change masterData header
		patientObj = self.buildPatientObject()
		patientInstance = None
		if ((patientObj is not None) and (self.predict == 1)):
			masterData = self.addPatientNominals(patientObj, masterData)
			patientInstance = self.createPatientInstance(patientObj, masterData)
			masterData.add_instance(patientInstance)

		elif (patientObj is None) and (self.predict == 1):
			print('No patient defined for prediction. Exiting')
			return True

		# Fix dataset headers up to match and fix instances to match headers:
		# masterData is emptied so that only its header is copied, then the
		# learner and test instances are added to datasets with that header.
		masterData.delete()
		learner = masterData.copy_instances(masterData, 0, 0)
		test = masterData.copy_instances(masterData, 0, 0)
		self.addInstancesToDataset(learnerData, learner)
		self.addInstancesToDataset(testData, test)

		# Instantiate classifier
		self.cls = Classifier(classname=self.classifier, options=self.parameters)

		# Run classifier
		self.cls.build_classifier(learner)

		# Test classifier
		evl = Evaluation(learner)
		evl.test_model(self.cls, test)

		# Store information about matrix
		self.acc = evl.percent_correct
		self.val = None

		# Convert numpy array into simple array; only the first two rows and
		# columns of the confusion matrix are kept.
		matrix = evl.confusion_matrix
		confusionMatrix = [
			[matrix[0][0], matrix[0][1]],
			[matrix[1][0], matrix[1][1]],
		]

		# Convert matrix into json format
		self.matrix = json.dumps(confusionMatrix)

		# If this is a prediction... make the prediction
		if ((patientObj is not None) and (self.predict == 1)):
			# masterData was emptied above, so the patient is instance 0
			masterData.add_instance(patientInstance)
			print("Running prediction on patient: ")
			print(masterData.get_instance(0))
			self.prediction = self.cls.classify_instance(masterData.get_instance(0))
			#self.uploadPrediction()

		# Temporarily store file to serialize to
		fileName = str(self.questionID) + self.algorithm + ".model"
		serialization.write(fileName, self.cls)

		# Open that file and store it; the temporary file is removed even
		# when reading it back fails.
		self.model = None
		try:
			with open(fileName, 'rb') as f:
				self.model = f.read()
		finally:
			os.remove(fileName)

		# Set status to awaiting feedback
		self.status = self.config.AWAITING_FEEDBACK_STATUS
		return True
コード例 #39
0
ファイル: class-2.6.py プロジェクト: echavarria/wekamooc
# Load the test set and use its last attribute as the class.
fname = data_dir + os.sep + "ReutersGrain-test.arff"
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
test = loader.load_file(fname)
test.set_class_index(test.num_attributes() - 1)

# (classifier class name, StringToWordVector filter options) pairs to compare
setups = (
    ("weka.classifiers.trees.J48", []),
    ("weka.classifiers.bayes.NaiveBayes", []),
    ("weka.classifiers.bayes.NaiveBayesMultinomial", []),
    ("weka.classifiers.bayes.NaiveBayesMultinomial", ["-C"]),
    ("weka.classifiers.bayes.NaiveBayesMultinomial", ["-C", "-L", "-S"])
)

# For every setup: build a filtered classifier on `data` (defined earlier in
# the script) and evaluate it on the separate test set loaded above.
for setup in setups:
    classifier, opt = setup
    print("\n--> %s (filter options: %s)\n" % (classifier, " ".join(opt)))
    cls = FilteredClassifier()
    cls.set_classifier(Classifier(classname=classifier))
    cls.set_filter(Filter(classname="weka.filters.unsupervised.attribute.StringToWordVector", options=opt))
    cls.build_classifier(data)
    evl = Evaluation(test)
    evl.test_model(cls, test)
    print("Accuracy: %0.0f%%" % evl.percent_correct())
    # threshold curve data for class index 0, used to compute the AUC
    tcdata = plc.generate_thresholdcurve_data(evl, 0)
    print("AUC: %0.3f" % plc.get_auc(tcdata))
    print(evl.to_matrix("Matrix:"))

jvm.stop()
コード例 #40
0
ファイル: class-2.4.py プロジェクト: echavarria/wekamooc
# Load the dataset named by fname and use its last attribute as the class.
data = loader.load_file(fname)
data.set_class_index(data.num_attributes() - 1)

# 1a filter data
print("Filtering data...")
fltr = Filter("weka.filters.unsupervised.attribute.StringToWordVector")
fltr.set_inputformat(data)
filtered = fltr.filter(data)
# in the filtered output the class is set to the first attribute
filtered.set_class_index(0)

# 1b build classifier and evaluate it on the data it was trained on
print("Building/evaluating classifier...")
cls = Classifier(classname="weka.classifiers.trees.J48")
cls.build_classifier(filtered)
evl = Evaluation(filtered)
evl.test_model(cls, filtered)
print(evl.to_summary())
print(str(cls))
plg.plot_dot_graph(cls.graph())

# 2. filtered classifier: the StringToWordVector filter is wrapped together
# with J48, so the classifier is built on the unfiltered data directly
fname = data_dir + os.sep + "simpletext-test.arff"
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
test = loader.load_file(fname)
test.set_class_index(test.num_attributes() - 1)
print("Building/evaluating filtered classifier...")
cls = FilteredClassifier()
cls.set_classifier(Classifier(classname="weka.classifiers.trees.J48"))
cls.set_filter(Filter(classname="weka.filters.unsupervised.attribute.StringToWordVector"))
cls.build_classifier(data)
コード例 #41
0
def plot_learning_curve(classifiers, train, test=None, increments=100, metric="percent_correct",
                        title="Learning curve", label_template="[#] @ $", key_loc="lower right",
                        outfile=None, wait=True):
    """
    Plots a learning curve for each classifier: the classifiers are trained on
    a growing prefix of the training set and the chosen metric is recorded
    after every increment.

    :param classifiers: list of Classifier template objects
    :type classifiers: list of Classifier
    :param train: dataset to use for building the classifiers; also used for
                  evaluating them if no test set is given
    :type train: Instances
    :param test: optional dataset to use for testing the built classifiers
    :type test: Instances
    :param increments: the increments (>= 1: # of instances, <1: percentage of dataset)
    :type increments: float
    :param metric: the name of the numeric metric to plot (Evaluation.<metric>)
    :type metric: str
    :param title: the title for the plot
    :type title: str
    :param label_template: the template for the label in the plot
                           (#: 1-based index, @: full classname, !: simple classname, $: options)
    :type label_template: str
    :param key_loc: the location string for the key
    :type key_loc: str
    :param outfile: the output file, ignored if None
    :type outfile: str
    :param wait: whether to wait for the user to close the plot
    :type wait: bool
    """

    if not plot.matplotlib_available:
        logger.error("Matplotlib is not installed, plotting unavailable!")
        return
    if not train.has_class():
        logger.error("Training set has no class attribute set!")
        return
    if (test is not None) and (train.equal_headers(test) is not None):
        logger.error("Training and test set are not compatible: " + train.equal_headers(test))
        return

    if increments >= 1:
        inc = increments
    else:
        # A small fraction of a small dataset can round down to 0, which
        # would make the modulo below divide by zero; use at least 1.
        inc = max(1, round(train.num_instances * increments))

    steps = []
    cls = []
    evls = {}
    for classifier in classifiers:
        # work on copies so the supplied templates stay untouched
        cl = Classifier.make_copy(classifier)
        cls.append(cl)
        evls[cl] = []
    if test is None:
        tst = train
    else:
        tst = test

    # range() instead of xrange(): the latter does not exist on Python 3
    for i in range(train.num_instances):
        if (i > 0) and (i % inc == 0):
            steps.append(i+1)
        for cl in cls:
            # train: updateable classifiers are fed one instance at a time,
            # all others are rebuilt on the first i+1 instances at each step
            if cl.is_updateable:
                if i == 0:
                    tr = Instances.copy_instances(train, 0, 1)
                    cl.build_classifier(tr)
                else:
                    cl.update_classifier(train.get_instance(i))
            else:
                if (i > 0) and (i % inc == 0):
                    tr = Instances.copy_instances(train, 0, i + 1)
                    cl.build_classifier(tr)
            # evaluate
            if (i > 0) and (i % inc == 0):
                evl = Evaluation(tst)
                evl.test_model(cl, tst)
                evls[cl].append(getattr(evl, metric))

    fig, ax = plt.subplots()
    ax.set_xlabel("# of instances")
    ax.set_ylabel(metric)
    ax.set_title(title)
    fig.canvas.set_window_title(title)
    ax.grid(True)
    i = 0
    for cl in cls:
        evl = evls[cl]
        i += 1
        plot_label = label_template.\
            replace("#", str(i)).\
            replace("@", cl.classname).\
            replace("!", cl.classname[cl.classname.rfind(".") + 1:]).\
            replace("$", join_options(cl.config))
        ax.plot(steps, evl, label=plot_label)
    plt.draw()
    plt.legend(loc=key_loc, shadow=True)
    if outfile is not None:
        plt.savefig(outfile)
    if wait:
        plt.show()
コード例 #42
0
# Load the training and test sets; the class is the last attribute in both.
trainData = loader.load_file('segment-challenge.arff')
trainData.class_is_last()
testData = loader.load_file('segment-test.arff')
testData.class_is_last()

# Default C4.5 tree
classifier = Classifier(classname="weka.classifiers.trees.J48")

# Build the classifier on the training data (no parameter search happens
# here; J48 is used as created above)
classifier.build_classifier(trainData)

print("\n\n=========== Classifier information ================\n\n")
print(classifier.options)
print(classifier)

# Evaluate on the data the classifier was trained on
print("\n\n=========== Train results ================\n\n")
evaluation = Evaluation(trainData)
evaluation.test_model(classifier, trainData)
print(classifier.to_commandline())
print(evaluation.matrix())
print("Train recognition: %0.2f%%" % evaluation.percent_correct)

# Evaluate on the separate test set
print("\n\n=========== Test results ================\n\n")
evaluation = Evaluation(testData)
evaluation.test_model(classifier, testData)
print(classifier.to_commandline())
print(evaluation.matrix())
print("Test recognition: %0.2f%%" % evaluation.percent_correct)

jvm.stop()
コード例 #43
0
ファイル: irdc.py プロジェクト: fracpete/wekamooc
print("Train/test/predict...")

groups = ["DataSet1", "DataSet2"]
# groups = ["DataSet2"]

# For each group: load the calibration, test and validation files, build a
# filtered linear regression on the calibration data and evaluate it on the
# test data.
for group in groups:
    print(group)
    train = data_dir + os.sep + group + "_Cal.arff"
    test = data_dir + os.sep + group + "_Test.arff"
    pred = data_dir + os.sep + group + "_Val.arff"

    # the class attribute is looked up by name in every dataset
    loader = Loader(classname="weka.core.converters.ArffLoader")
    print(train)
    train_data = loader.load_file(train)
    train_data.class_index = train_data.attribute_by_name("reference value").index
    print(test)
    test_data = loader.load_file(test)
    test_data.class_index = test_data.attribute_by_name("reference value").index
    print(pred)
    pred_data = loader.load_file(pred)
    pred_data.class_index = pred_data.attribute_by_name("reference value").index
    # NOTE(review): pred_data is loaded but not used in the visible part of
    # this loop.

    # the Remove filter drops the first attribute before the regression
    cls = FilteredClassifier()
    cls.classifier = Classifier(classname="weka.classifiers.functions.LinearRegression", options=["-S", "1", "-C"])
    cls.filter = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "first"])
    cls.build_classifier(train_data)
    evl = Evaluation(train_data)
    evl.test_model(cls, test_data)
    print(evl.summary())

jvm.stop()
コード例 #44
0
ファイル: class-2.6.py プロジェクト: echavarria/wekamooc
from weka.classifiers import Classifier, Evaluation

jvm.start()

# load diabetes; the class is the last attribute
loader = Loader(classname="weka.core.converters.ArffLoader")
fname = data_dir + os.sep + "diabetes.arff"
print("\nLoading dataset: " + fname + "\n")
data = loader.load_file(fname)
data.set_class_index(data.num_attributes() - 1)

# determine baseline with ZeroR, evaluated on the data it was built on
zeror = Classifier(classname="weka.classifiers.rules.ZeroR")
zeror.build_classifier(data)
evl = Evaluation(data)
evl.test_model(zeror, data)
print("Baseline accuracy (ZeroR): %0.1f%%" % evl.percent_correct())

print("\nHoldout 10%...")
# use seed 1-10 and perform random split with 90%
perc = []
for i in xrange(1, 11):
    # a fresh Evaluation and a fresh J48 for every seed
    evl = Evaluation(data)
    evl.evaluate_train_test_split(
        Classifier(classname="weka.classifiers.trees.J48"), data, 90.0, Random(i))
    # the rounded value is what goes into the statistics below
    perc.append(round(evl.percent_correct(), 1))
    print("Accuracy with seed %i: %0.1f%%" % (i, evl.percent_correct()))

# calculate mean and standard deviation
nperc = numpy.array(perc)
print("mean=%0.2f stdev=%0.2f" % (numpy.mean(nperc), numpy.std(nperc)))
コード例 #45
0
def classify_and_save(classifier, name, outfile):
    """Evaluate ``classifier`` with 500 different class attributes and save a CSV.

    The first 50 attribute indices plus 450 randomly sampled later ones are
    used as the class in turn; for each, the classifier is built on
    ``data/final_train.arff`` and tested on ``data/final_test.arff``.

    :param classifier: Weka classifier wrapper; rebuilt for every attribute
    :param name: algorithm name written to the "Algorithm" column
    :param outfile: path of the CSV file to write
    """
    import sys  # local import: only needed to choose the CSV open mode

    random.seed("ML349")

    csv_header = [
        "Game Name",
        "SteamID",
        "Algorithm",
        "Number Players",
        "%Players of Training Set",
        "Accuracy",
        "Precision (0)",
        "Recall (0)",
        "F1 (0)",
        "Precision (1)",
        "Recall (1)",
        "F1 (1)",
    ]
    game_results = []

    # only the first line of the file (the game names) is needed
    with open("data/games_by_username_all.csv", "r") as f:
        game_list = next(f).rstrip().split(",")

    loader = Loader(classname="weka.core.converters.ArffLoader")
    train = loader.load_file("data/final_train.arff")
    test = loader.load_file("data/final_test.arff")

    class_indices = itertools.chain(
        range(0, 50), random.sample(range(50, len(game_list)), 450))
    for count, i in enumerate(class_indices, 1):
        train.class_index = i
        test.class_index = i

        classifier.build_classifier(train)

        evaluation = Evaluation(train)
        evaluation.test_model(classifier, test)

        confusion = evaluation.confusion_matrix
        num_players = sum(confusion[1])
        steam_id = repr(train.class_attribute).split(" ")[1]
        game_results.append([
            game_list[i],
            steam_id,
            name,
            int(num_players),
            # NOTE(review): 1955 is hard-coded; presumably the number of
            # players in the data set - confirm.
            num_players / 1955.0,
            evaluation.percent_correct,
            evaluation.precision(0),
            evaluation.recall(0),
            evaluation.f_measure(0),
            evaluation.precision(1),
            evaluation.recall(1),
            evaluation.f_measure(1),
        ])

        # one print() call with a single string behaves the same on
        # Python 2 and Python 3
        print("\nResult #{2}/500 for {0} (SteamID {1}):".format(
            game_list[i], steam_id, count) + " " + evaluation.summary())

    # The csv module wants a binary file on Python 2 and a text file opened
    # with newline="" on Python 3.
    if sys.version_info[0] < 3:
        out = open(outfile, "wb")
    else:
        out = open(outfile, "w", newline="")
    with out as f:
        csv_writer = csv.writer(f, delimiter=",")
        csv_writer.writerow(csv_header)
        csv_writer.writerows(game_results)
コード例 #46
0
ファイル: class-2.2.py プロジェクト: echavarria/wekamooc
# Load the training set and use its last attribute as the class.
fname = data_dir + os.sep + "segment-challenge.arff"
print("\nLoading dataset: " + fname + "\n")
train = loader.load_file(fname)
train.set_class_index(train.num_attributes() - 1)

# Load the test set.
# NOTE(review): the class index is derived from train's attribute count,
# which is only right while both files have the same attributes - confirm.
fname = data_dir + os.sep + "segment-test.arff"
print("\nLoading dataset: " + fname + "\n")
test = loader.load_file(fname)
test.set_class_index(train.num_attributes() - 1)

# build J48
cls = Classifier(classname="weka.classifiers.trees.J48")
cls.build_classifier(train)

# evaluate on test
evl = Evaluation(train)
evl.test_model(cls, test)
print("Test set accuracy: %0.0f%%" % evl.percent_correct())

# evaluate on train
evl = Evaluation(train)
evl.test_model(cls, train)
print("Train set accuracy: %0.0f%%" % evl.percent_correct())

# evaluate on random split (66% split of the training set, seed 1)
evl = Evaluation(train)
evl.evaluate_train_test_split(cls, train, 66.0, Random(1))
print("Random split accuracy: %0.0f%%" % evl.percent_correct())

jvm.stop()
コード例 #47
0
def main():
    """
    Run the classifier demo from start to finish.

    The steps, in order: load iris; print J48's help text; resolve a partial
    classname; build SMO from a command-line string and as a KernelClassifier;
    train, print, export and plot a J48 tree; evaluate it on iris (full data
    and a 66% random split); build NaiveBayesUpdateable incrementally;
    assemble FilteredClassifier and Vote meta-classifiers; cross-validate
    NaiveBayes on diabetes and print its statistics plus ROC/PRC plots; train
    and cross-validate LinearRegression on bolts; plot a learning curve; and
    list JRip's rules on labor through the Java API.

    All results are printed or plotted; nothing is returned.
    """

    def load_arff(path, **load_args):
        # Announce the file, read it with a fresh ArffLoader and call
        # class_is_last() on the result. The loader is returned as well
        # because incremental loading iterates over it afterwards.
        helper.print_info("Loading dataset: " + path)
        arff_loader = Loader("weka.core.converters.ArffLoader")
        dataset = arff_loader.load_file(path, **load_args)
        dataset.class_is_last()
        return arff_loader, dataset

    # iris is reused by most of the steps below
    iris_path = helper.get_data_dir() + os.sep + "iris.arff"
    _, iris_data = load_arff(iris_path)

    # help text of a classifier
    helper.print_title("Creating help string")
    j48_for_help = Classifier(classname="weka.classifiers.trees.J48")
    print(j48_for_help.to_help())

    # a classname given only by its suffix
    helper.print_title("Creating classifier from partial classname")
    partial_name = ".J48"
    resolved = Classifier(classname=partial_name)
    print(partial_name + " --> " + resolved.classname)

    # SMO instantiated from a full command-line string
    helper.print_title("Creating SMO from command-line string")
    smo_cmdline = 'weka.classifiers.functions.SMO -K "weka.classifiers.functions.supportVector.NormalizedPolyKernel -E 3.0"'
    smo = from_commandline(smo_cmdline, classname="weka.classifiers.Classifier")
    smo.build_classifier(iris_data)
    print("input: " + smo_cmdline)
    print("output: " + smo.to_commandline())
    print("model:\n" + str(smo))

    # SMO wrapped as KernelClassifier with an explicitly assigned kernel
    helper.print_title("Creating SMO as KernelClassifier")
    rbf_kernel = Kernel(classname="weka.classifiers.functions.supportVector.RBFKernel", options=["-G", "0.001"])
    kernel_smo = KernelClassifier(classname="weka.classifiers.functions.SMO", options=["-M"])
    kernel_smo.kernel = rbf_kernel
    kernel_smo.build_classifier(iris_data)
    print("classifier: " + kernel_smo.to_commandline())
    print("model:\n" + str(kernel_smo))

    # train J48 and output the model in several forms
    helper.print_title("Training J48 classifier on iris")
    tree = Classifier(classname="weka.classifiers.trees.J48")
    # Passing 'options=["-C", "0.3"]' to the constructor would also work. Here the
    # "confidenceFactor" property is set directly instead; it is a float rather than
    # a double, so the value goes through double_to_float first.
    tree.set_property("confidenceFactor", typeconv.double_to_float(0.3))
    tree.build_classifier(iris_data)
    print(tree)
    print(tree.graph)
    print(tree.to_source("MyJ48"))
    plot_graph.plot_dot_graph(tree.graph)

    # evaluate the trained tree on the data it was built from
    helper.print_title("Evaluating J48 classifier on iris")
    iris_eval = Evaluation(iris_data)
    print(iris_eval.test_model(tree, iris_data))
    print(iris_eval.summary())

    # evaluate a fresh J48 on a random 66% split
    helper.print_title("Evaluating J48 classifier on iris (random split 66%)")
    split_tree = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.3"])
    split_eval = Evaluation(iris_data)
    split_eval.evaluate_train_test_split(split_tree, iris_data, 66.0, Random(1))
    print(split_eval.summary())

    # incremental loading: build on the incrementally loaded dataset, then update per instance
    helper.print_title("Build classifier incrementally on iris")
    inc_loader, iris_inc = load_arff(iris_path, incremental=True)
    updateable_nb = Classifier(classname="weka.classifiers.bayes.NaiveBayesUpdateable")
    updateable_nb.build_classifier(iris_inc)
    for inst in inc_loader:
        updateable_nb.update_classifier(inst)
    print(updateable_nb)

    # meta-classifiers
    helper.print_title("Meta classifiers")
    # FilteredClassifier through the generic single-classifier wrapper
    print("generic FilteredClassifier instantiation")
    generic_filtered = SingleClassifierEnhancer(classname="weka.classifiers.meta.FilteredClassifier")
    generic_filtered.classifier = Classifier(classname="weka.classifiers.functions.LinearRegression")
    remove_first = Filter("weka.filters.unsupervised.attribute.Remove")
    remove_first.options = ["-R", "first"]
    generic_filtered.set_property("filter", remove_first.jobject)
    print(generic_filtered.to_commandline())
    # FilteredClassifier through its dedicated wrapper class
    print("direct FilteredClassifier instantiation")
    direct_filtered = FilteredClassifier()
    direct_filtered.classifier = Classifier(classname="weka.classifiers.functions.LinearRegression")
    remove_first = Filter("weka.filters.unsupervised.attribute.Remove")
    remove_first.options = ["-R", "first"]
    direct_filtered.filter = remove_first
    print(direct_filtered.to_commandline())
    # Vote through the generic multi-classifier wrapper
    print("generic Vote instantiation")
    vote = MultipleClassifiersCombiner(classname="weka.classifiers.meta.Vote")
    vote.classifiers = [
        Classifier(classname="weka.classifiers.functions.SMO"),
        Classifier(classname="weka.classifiers.trees.J48")
    ]
    print(vote.to_commandline())

    # cross-validate a classifier on a nominal class
    helper.print_title("Cross-validating NaiveBayes on diabetes")
    diabetes_path = helper.get_data_dir() + os.sep + "diabetes.arff"
    _, diabetes_data = load_arff(diabetes_path)
    naive_bayes = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
    pred_output = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText", options=["-distribution"])
    diabetes_eval = Evaluation(diabetes_data)
    diabetes_eval.crossvalidate_model(naive_bayes, diabetes_data, 10, Random(42), output=pred_output)
    print(diabetes_eval.summary())
    print(diabetes_eval.class_details())
    print(diabetes_eval.matrix())

    # (printed prefix, Evaluation attribute, class index or None).
    # Entries with a class index are called with it; the others are read as
    # plain attributes. Each value is fetched immediately before it is printed.
    diabetes_stats = [
        ("areaUnderPRC/0: ", "area_under_prc", 0),
        ("weightedAreaUnderPRC: ", "weighted_area_under_prc", None),
        ("areaUnderROC/1: ", "area_under_roc", 1),
        ("weightedAreaUnderROC: ", "weighted_area_under_roc", None),
        ("avgCost: ", "avg_cost", None),
        ("totalCost: ", "total_cost", None),
        ("confusionMatrix: ", "confusion_matrix", None),
        ("correct: ", "correct", None),
        ("pctCorrect: ", "percent_correct", None),
        ("incorrect: ", "incorrect", None),
        ("pctIncorrect: ", "percent_incorrect", None),
        ("unclassified: ", "unclassified", None),
        ("pctUnclassified: ", "percent_unclassified", None),
        ("coverageOfTestCasesByPredictedRegions: ", "coverage_of_test_cases_by_predicted_regions", None),
        ("sizeOfPredictedRegions: ", "size_of_predicted_regions", None),
        ("falseNegativeRate: ", "false_negative_rate", 1),
        ("weightedFalseNegativeRate: ", "weighted_false_negative_rate", None),
        ("numFalseNegatives: ", "num_false_negatives", 1),
        ("trueNegativeRate: ", "true_negative_rate", 1),
        ("weightedTrueNegativeRate: ", "weighted_true_negative_rate", None),
        ("numTrueNegatives: ", "num_true_negatives", 1),
        ("falsePositiveRate: ", "false_positive_rate", 1),
        ("weightedFalsePositiveRate: ", "weighted_false_positive_rate", None),
        ("numFalsePositives: ", "num_false_positives", 1),
        ("truePositiveRate: ", "true_positive_rate", 1),
        ("weightedTruePositiveRate: ", "weighted_true_positive_rate", None),
        ("numTruePositives: ", "num_true_positives", 1),
        ("fMeasure: ", "f_measure", 1),
        ("weightedFMeasure: ", "weighted_f_measure", None),
        ("unweightedMacroFmeasure: ", "unweighted_macro_f_measure", None),
        ("unweightedMicroFmeasure: ", "unweighted_micro_f_measure", None),
        ("precision: ", "precision", 1),
        ("weightedPrecision: ", "weighted_precision", None),
        ("recall: ", "recall", 1),
        ("weightedRecall: ", "weighted_recall", None),
        ("kappa: ", "kappa", None),
        ("KBInformation: ", "kb_information", None),
        ("KBMeanInformation: ", "kb_mean_information", None),
        ("KBRelativeInformation: ", "kb_relative_information", None),
        ("SFEntropyGain: ", "sf_entropy_gain", None),
        ("SFMeanEntropyGain: ", "sf_mean_entropy_gain", None),
        ("SFMeanPriorEntropy: ", "sf_mean_prior_entropy", None),
        ("SFMeanSchemeEntropy: ", "sf_mean_scheme_entropy", None),
        ("matthewsCorrelationCoefficient: ", "matthews_correlation_coefficient", 1),
        ("weightedMatthewsCorrelation: ", "weighted_matthews_correlation", None),
        ("class priors: ", "class_priors", None),
        ("numInstances: ", "num_instances", None),
        ("meanAbsoluteError: ", "mean_absolute_error", None),
        ("meanPriorAbsoluteError: ", "mean_prior_absolute_error", None),
        ("relativeAbsoluteError: ", "relative_absolute_error", None),
        ("rootMeanSquaredError: ", "root_mean_squared_error", None),
        ("rootMeanPriorSquaredError: ", "root_mean_prior_squared_error", None),
        ("rootRelativeSquaredError: ", "root_relative_squared_error", None),
    ]
    for prefix, attr_name, class_idx in diabetes_stats:
        stat = getattr(diabetes_eval, attr_name)
        if class_idx is not None:
            stat = stat(class_idx)
        print(prefix + str(stat))
    print("prediction output:\n" + str(pred_output))

    # ROC and PRC curves over every class label; neither call waits
    plot_cls.plot_roc(
        diabetes_eval, title="ROC diabetes",
        class_index=range(0, diabetes_data.class_attribute.num_values), wait=False)
    plot_cls.plot_prc(
        diabetes_eval, title="PRC diabetes",
        class_index=range(0, diabetes_data.class_attribute.num_values), wait=False)

    # numeric-class dataset
    bolts_path = helper.get_data_dir() + os.sep + "bolts.arff"
    _, bolts_data = load_arff(bolts_path)

    # train LinearRegression and output the model
    helper.print_title("Training LinearRegression on bolts")
    regression = Classifier(classname="weka.classifiers.functions.LinearRegression", options=["-S", "1", "-C"])
    regression.build_classifier(bolts_data)
    print(regression)

    # cross-validate a fresh LinearRegression with the same options
    helper.print_title("Cross-validating LinearRegression on bolts")
    cv_regression = Classifier(classname="weka.classifiers.functions.LinearRegression", options=["-S", "1", "-C"])
    bolts_eval = Evaluation(bolts_data)
    bolts_eval.crossvalidate_model(cv_regression, bolts_data, 10, Random(42))
    print(bolts_eval.summary())
    print("correlationCoefficient: " + str(bolts_eval.correlation_coefficient))
    print("errorRate: " + str(bolts_eval.error_rate))
    helper.print_title("Header - bolts")
    print(str(bolts_eval.header))
    helper.print_title("Predictions on bolts")
    # predictions are numbered from 1 in the output
    for number, pred in enumerate(bolts_eval.predictions, 1):
        print(str(number) + ": " + str(pred) + " -> error=" + str(pred.error))
    plot_cls.plot_classifier_errors(bolts_eval.predictions, wait=False)

    # learning curve on diabetes; this is the one plot called with wait=True
    curve_classifiers = [
        Classifier(classname="weka.classifiers.trees.J48"),
        Classifier(classname="weka.classifiers.bayes.NaiveBayesUpdateable")]
    plot_cls.plot_learning_curve(
        curve_classifiers, diabetes_data, increments=0.05, label_template="[#] !", metric="percent_correct", wait=True)

    # reach into a classifier's Java API
    labor_path = helper.get_data_dir() + os.sep + "labor.arff"
    _, labor_data = load_arff(labor_path)

    helper.print_title("Using JRip's Java API to access rules")
    jrip = Classifier(classname="weka.classifiers.rules.JRip")
    jrip.build_classifier(labor_data)
    ruleset = jrip.jwrapper.getRuleset()
    # the ruleset comes from the Java side, so it is walked by index via size()/get()
    for rule_idx in range(ruleset.size()):
        rule = ruleset.get(rule_idx)
        print(str(rule.toString(labor_data.class_attribute.jobject)))
コード例 #48
0
# Grid-search setup: score candidates with "ACC" over the gamma and cost parameters.
grid.evaluation = "ACC"
grid.parameters = [gamma, cost]

# The base classifier (LibSVM, per the original comment) goes into the grid configuration.
grid.classifier = classifier
# Run the search on the training data, then build the best setup found.
grid.build_classifier(trainData)
best = grid.best
best.build_classifier(trainData)

# Print the full option list, then the values following the -C and -G flags.
print(best.options)
for label, flag in (("C", "-C"), ("gamma", "-G")):
    print(label, best.options[best.options.index(flag) + 1])

# Report on the training data.
print("\n\n=========== Train results ================\n\n")
print(grid)
train_evaluation = Evaluation(trainData)
train_evaluation.test_model(best, trainData)
print(best.to_commandline())
print(train_evaluation.matrix())
print("Train recognition: %0.2f%%" % train_evaluation.percent_correct)

# Report on the held-out data.
# NOTE(review): this Evaluation is seeded with testData; Weka documents the
# constructor argument as the training instances -- confirm this is intended.
print("\n\n=========== Test results ================\n\n")
test_evaluation = Evaluation(testData)
test_evaluation.test_model(best, testData)
print(best.to_commandline())
print(test_evaluation.matrix())
print("Test recognition: %0.2f%%" % test_evaluation.percent_correct)

jvm.stop()