Exemple #1
0
def naiveBayes(data):
	
	classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayes", options=["-D"])
	nfolds=13
	rnd = Random(0)
	evaluation = Evaluation(data)
	evaluation.crossvalidate_model(classifier, data,
	nfolds, rnd)
	print(" Naive Bayes Cross-validation information")
	print(evaluation.summary())
	print("precision: " + str(evaluation.precision(1)))
	print("recall: " + str(evaluation.recall(1)))
	print("F-measure: " + str(evaluation.f_measure(1)))
	print("==confusion matrix==")
	print("     a     b")
	print(evaluation.confusion_matrix)
	print
	#write to file
	f = open("naiveeval.txt", "w")
	f.write(evaluation.summary()) 
	f.write("\n")
	f.write("==confusion matrix==\n")
	f.write("     a       b\n")
	for item in evaluation.confusion_matrix:
		f.write("%s\n" % item)
	f.close() 
	#plot roc graph
	plcls.plot_roc(evaluation, title="Naive Bayes ROC", outfile="NBROC", wait=True)
	
	return evaluation.percent_correct
def f_smote():
    jvm.start()

    train_data, test_data = b_i_impute_data()

    train_data = train_data[:10000]
    y_train = train_data["class"]
    x_train = train_data.drop("class", axis=1)

    sm = SMOTE(ratio="minority")
    x_train_sm, y_train_sm = sm.fit_sample(x_train, y_train)

    x_train_sm_df = pd.DataFrame(x_train_sm, columns=x_train.columns)
    y_train_sm_df = pd.DataFrame(y_train_sm, columns=["class"])
    train_data_sm_df = pd.concat([y_train_sm_df, x_train_sm_df], axis=1)
    print_f("smote train data shape", train_data_sm_df.shape)
    train_data_sm_df.to_csv("./train_data_sm.csv", index=False)

    train_data_sm = converters.load_any_file("train_data_sm.csv")
    train_data_sm.class_is_first()

    test_data = converters.load_any_file("test_data.csv")
    test_data.class_is_first()

    print_f("1")
    cls = Classifier(classname="weka.classifiers.trees.LMT")
    print_f("bulding classifier")
    cls.build_classifier(train_data_sm)
    print_f("Evaluating")
    evl = Evaluation(train_data_sm)

    evl.crossvalidate_model(cls, train_data_sm, 5, Random(1))
    print_f("Train Accuracy:", evl.percent_correct)
    print_f("Train summary")
    print_f(evl.summary())
    print_f("Train class details")
    print_f(evl.class_details())
    print_f("Train confusion matrix")
    print_f(evl.confusion_matrix)
    plcls.plot_roc(evl,
                   class_index=[0, 1],
                   wait=True,
                   outfile="./plots/2_f_smote_10k.png")
    plt.suptitle("Train ROC Curve", fontsize=20, y=0.95)

    evl = Evaluation(test_data)
    print_f("testing model")
    evl.test_model(cls, test_data)
    print_f("Test Accuracy:", evl.percent_correct)
    print_f("Test summary")
    print_f(evl.summary())
    print_f(" Testclass details")
    print_f(evl.class_details())
    print_f("Testconfusion matrix")
    print_f(evl.confusion_matrix)
    plcls.plot_roc(evl, class_index=[0, 1], wait=True)
    plt.suptitle("Test ROC Curve", fontsize=20, y=0.95)
    savefig("./plots/f_test_roc_curve.png")
def ClassifyParam(mode, binWidths):
	if not os.path.exists("classificationResults"):
		os.makedirs("classificationResults")

	if("normal" in mode):
		file = open("classificationResults/AllVsAll.csv","w") 

		file.write("BinWidth, Accuracy\n")

		for binWidth in binWidths:

			train_set = "Data/arff/TrainSet_%s.arff"%(binWidth)
			test_set = "Data/arff/TestSet_%s.arff"%(binWidth)
			print "Loading Datasets..."

			train_data = converters.load_any_file(train_set)
			test_data = converters.load_any_file(test_set)
			#Set class attribute
			train_data.class_is_last()
			test_data.class_is_last()
			print "Dataset Loaded!"


			classifier_name = "weka.classifiers.meta.FilteredClassifier"

			classifier = Classifier(classname=classifier_name, options=[
				"-F", "weka.filters.unsupervised.attribute.StringToWordVector -R first-last -W 1000 -C -T -N 1 -stemmer weka.core.stemmers.NullStemmer -M 1 -tokenizer \"weka.core.tokenizers.WordTokenizer -delimiters \\\" \\\\r\\\\n\\\\t.,;:\\\\\\\'\\\\\\\"()?!\\\"\"",
				"-W", "weka.classifiers.bayes.NaiveBayesMultinomial"])


			start_train = time.time()
			classifier.build_classifier(train_data)
			end_train = time.time()
			print "Train\t%s\t%s"%(binWidth, end_train-start_train)

			for index, inst in enumerate(test_data):
				if(index == 0):
					start_sample = time.time()
					classifier.classify_instance(inst)
					end_sample = time.time()
					print "Sample\t%s\t%s"%(binWidth, end_sample-start_sample)

			print "Evaluating w/ Multinomial Naive Bayes classifier. BinWidth = %s"%(binWidth)
			evaluation = Evaluation(test_data)
			start_batch = time.time()
			evaluation.test_model(classifier, test_data)
			end_batch = time.time()
			print "Batch\t%s\t%s"%(binWidth,end_batch-start_batch)

			
			print evaluation.summary()
			acc = evaluation.percent_correct/100.0
			print "Percent correct: " + str(acc)

			file.write("%s, %s\n"%(binWidth, acc))
		file.close()
Exemple #4
0
def create_model(input_file, output_file):
    # Load data
    data = converters.load_any_file(input_file)
    data.class_is_last()  # set class attribute

    # filter data
    print_title("Filtering Data")
    discretize = Filter(
        classname="weka.filters.unsupervised.attribute.Discretize",
        options=["-B", "10", "-M", "-1.0", "-R", "first-last"])
    discretize.inputformat(
        data)  # let the filter know about the type of data to filter
    filtered_data = discretize.filter(data)
    print("Done! (believe it or not)")

    print_title("Build Classifier")
    classifier = Classifier(classname="weka.classifiers.trees.RandomForest",
                            options=["-I", "100", "-K", "0", "-S", "1"])
    classifier.build_classifier(filtered_data)
    print("Done! (believe it or not)")
    serialization.write_all(output_file, [classifier, discretize])
    print("Model and filter saved to ", output_file)

    evaluation = Evaluation(data)  # initialize with priors
    evaluation.crossvalidate_model(classifier, filtered_data, 10,
                                   Random(42))  # 10-fold CV
    print(evaluation.summary())
    print("pctCorrect: " + str(evaluation.percent_correct))
    print("incorrect: " + str(evaluation.incorrect))
Exemple #5
0
    def run_naive_bayes_crossval(self, output_directory):
        # build classifier
        print("\nBuilding Classifier on training data.")
        buildTimeStart = time.time()
        cls = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
        cls.build_classifier(self.training_data)

        resultsString = ""
        resultsString = self.print_both(str(cls), resultsString)

        buildTimeString = "NB Cross Eval Classifier Built in " + str(
            time.time() - buildTimeStart) + " secs.\n"
        resultsString = self.print_both(buildTimeString, resultsString)

        #Evaluate Classifier
        resultsString = self.print_both("\nCross Evaluating on test data.",
                                        resultsString)

        buildTimeStart = time.time()
        evl = Evaluation(self.training_data)
        evl.crossvalidate_model(cls, self.training_data, 10, Random(1))

        resultsString = self.print_both(str(evl.summary()), resultsString)
        resultsString = self.print_both(str(evl.class_details()),
                                        resultsString)
        resultsString = self.print_both(str(evl.confusion_matrix),
                                        resultsString)
        buildTimeString = "\nNB Cross Eval Classifier Evaluated in " + str(
            time.time() - buildTimeStart) + " secs.\n"
        resultsString = self.print_both(buildTimeString, resultsString)

        #Save Results and Cleanup
        self.save_results("Naive_Bayes_Crossval", resultsString,
                          output_directory)
def testNB(training_data, testing_data):

    train_data = Instances.copy_instances(training_data)
    test_data = Instances.copy_instances(testing_data)

    evaluation = Evaluation(train_data)
    classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
    classifier.build_classifier(
        train_data)  # build classifier on the training data
    evaluation.test_model(classifier,
                          test_data)  # test and evaluate model on the test set
    print("")
    print("")
    print(
        evaluation.summary(
            "--------------Naive Bayes Evaluation--------------"))
    print("Accuracy: " + str(evaluation.percent_correct))
    print("")
    print("Label\tPrecision\t\tRecall\t\t\tF-Measure")
    print("<=50K\t" + str(evaluation.precision(0)) + "\t" +
          str(evaluation.recall(0)) + "\t" + str(evaluation.f_measure(0)))
    print(">50K\t" + str(evaluation.precision(1)) + "\t" +
          str(evaluation.recall(1)) + "\t" + str(evaluation.f_measure(1)))
    print("Mean\t" + str(((evaluation.precision(1)) +
                          (evaluation.precision(0))) / 2) + "\t" +
          str(((evaluation.recall(1)) + (evaluation.recall(0))) / 2) + "\t" +
          str(((evaluation.f_measure(1)) + (evaluation.f_measure(0))) / 2))
def naivebay_classifier_weka(data):
    classifier = Classifier("weka.classifiers.bayes.NaiveBayes")
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(classifier, data, 10, Random(42))
    print(evaluation.summary())
    print(evaluation.confusion_matrix)
    return classifier
def RandomTree(data, rnm):
    data.class_is_last()
    fc = FilteredClassifier()
    fc.classifier = Classifier(classname="weka.classifiers.trees.RandomTree", options=["-K", "0", "-M", "1.0", "-V", "0.001", "-S", "1"])
    fc.filter = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "first"])
    pred_output = PredictionOutput(classname="weka.classifiers.evaluation.output.prediction.CSV", options=["-p", "1"])
    folds = 10
    evl = Evaluation(data)
    evl.crossvalidate_model(fc, data, folds, Random(1), pred_output)
    fc.build_classifier(data)
    f0 = open(rnm + '_RT_Tree.txt', 'w')
    print >> f0, "Filename: ", rnm
    print >> f0, '\n\n'
    print >> f0, str(fc)
    f0.close()
    f1 = open(rnm + '_RT_Prediction.txt', 'w')
    print >> f1, 'Filename:', rnm
    print >> f1, 'Prediction Summary:', (pred_output.buffer_content())
    f1.close()
    f2 = open(rnm + '_RT_Evaluation.txt', 'w')
    print >> f2, 'Filename:', rnm
    print >> f2, 'Evaluation Summary:', (evl.summary())
    print >> f2, '\n\n\n'
    print >> f2, (evl.class_details())
    f2.close()
    plot_roc(evl, class_index=[0,1], title=rnm, key_loc='best', outfile=rnm+'_RT_ROC.png', wait=False)
    value_RT = str(evl.percent_correct)
    return value_RT
def Boost_J48(data, rnm):
    data.class_is_last()
    fc1 = FilteredClassifier()
    fc1.classifier = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.25", "-M", "2"])
    fc1.filter = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "first"])
    fc2 = SingleClassifierEnhancer(classname="weka.classifiers.meta.AdaBoostM1", options=["-P", "100", "-S", "1", "-I", "10"])
    fc2.classifier = fc1
    pred_output = PredictionOutput(classname="weka.classifiers.evaluation.output.prediction.CSV", options=["-p", "1"])
    folds = 10
    fc2.build_classifier(data)
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(fc2, data, folds, Random(1), pred_output)
    f0 = open(rnm + '_Boost_J48_Tree.txt', 'w')
    print >> f0, "Filename: ", rnm
    print >> f0, '\n\n'
    print >> f0, str(fc2)
    f0.close()
    f1 = open(rnm + '_Boost_J48_Prediction.txt', 'w')
    print >> f1, 'Filename:', rnm
    print >> f1, 'Prediction Summary:', (pred_output.buffer_content())
    f1.close()
    f2 = open(rnm + '_Boost_j48_Evaluation.txt', 'w')
    print >> f2, 'Filename:', rnm
    print >> f2, 'Evaluation Summary:', (evaluation.summary())
    print >> f2, '\n\n\n'
    print >> f2, (evaluation.class_details())
    f2.close()
    plot_roc(evaluation, class_index=[0,1], title=rnm, key_loc='best', outfile=rnm + '_Boost_J48_ROC.png', wait=False)
    value_Boost_J48 = str(evaluation.percent_correct)
    return value_Boost_J48
def main(args):
    """
    Loads a dataset, shuffles it, splits it into train/test set. Trains J48 with training set and
    evaluates the built model on the test set.
    :param args: the commandline arguments (optional, can be dataset filename)
    :type args: list
    """

    # load a dataset
    if len(args) <= 1:
        data_file = helper.get_data_dir() + os.sep + "vote.arff"
    else:
        data_file = args[1]
    helper.print_info("Loading dataset: " + data_file)
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(data_file)
    data.class_is_last()

    # generate train/test split of randomized data
    train, test = data.train_test_split(66.0, Random(1))

    # build classifier
    cls = Classifier(classname="weka.classifiers.trees.J48")
    cls.build_classifier(train)
    print(cls)

    # evaluate
    evl = Evaluation(train)
    evl.test_model(cls, test)
    print(evl.summary())
def main():
    """
    Shows how to use the CostSensitiveClassifier.
    """

    # load a dataset
    data_file = helper.get_data_dir() + os.sep + "diabetes.arff"
    helper.print_info("Loading dataset: " + data_file)
    loader = Loader("weka.core.converters.ArffLoader")
    data = loader.load_file(data_file)
    data.class_is_last()

    # classifier
    classifier = SingleClassifierEnhancer(
        classname="weka.classifiers.meta.CostSensitiveClassifier",
        options=["-cost-matrix", "[0 1; 2 0]", "-S", "2"])
    base = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.3"])
    classifier.classifier = base

    folds = 10
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(classifier, data, folds, Random(1))


    print("")
    print("=== Setup ===")
    print("Classifier: " + classifier.to_commandline())
    print("Dataset: " + data.relationname)
    print("")
    print(evaluation.summary("=== " + str(folds) + " -fold Cross-Validation ==="))
Exemple #12
0
def main(args):
    """
    Loads a dataset, shuffles it, splits it into train/test set. Trains J48 with training set and
    evaluates the built model on the test set.
    :param args: the commandline arguments (optional, can be dataset filename)
    :type args: list
    """

    # load a dataset
    if len(args) <= 1:
        data_file = helper.get_data_dir() + os.sep + "vote.arff"
    else:
        data_file = args[1]
    helper.print_info("Loading dataset: " + data_file)
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(data_file)
    data.class_is_last()

    # generate train/test split of randomized data
    train, test = data.train_test_split(66.0, Random(1))

    # build classifier
    cls = Classifier(classname="weka.classifiers.trees.J48")
    cls.build_classifier(train)
    print(cls)

    # evaluate
    evl = Evaluation(train)
    evl.test_model(cls, test)
    print(evl.summary())
def main():
    """
    Shows how to use the CostSensitiveClassifier.
    """

    # load a dataset
    data_file = helper.get_data_dir() + os.sep + "diabetes.arff"
    helper.print_info("Loading dataset: " + data_file)
    loader = Loader("weka.core.converters.ArffLoader")
    data = loader.load_file(data_file)
    data.class_is_last()

    # classifier
    classifier = SingleClassifierEnhancer(
        classname="weka.classifiers.meta.CostSensitiveClassifier",
        options=["-cost-matrix", "[0 1; 2 0]", "-S", "2"])
    base = Classifier(classname="weka.classifiers.trees.J48",
                      options=["-C", "0.3"])
    classifier.classifier = base

    folds = 10
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(classifier, data, folds, Random(1))

    print("")
    print("=== Setup ===")
    print("Classifier: " + classifier.to_commandline())
    print("Dataset: " + data.relationname)
    print("")
    print(
        evaluation.summary("=== " + str(folds) +
                           " -fold Cross-Validation ==="))
Exemple #14
0
    def train_weka_model(self,
                         training_data_dir,
                         save_model_dir,
                         log_file,
                         mimic_env=None):
        """
        Just runs some example code.
        """
        loader = Loader(classname="weka.core.converters.CSVLoader")
        training_data = loader.load_file(training_data_dir)
        training_data.class_is_last()

        self.classifier = Classifier(classname="weka.classifiers.trees.M5P",
                                     options=self.options)
        # classifier help, check https://weka.sourceforge.io/doc.dev/weka/classifiers/trees/M5P.html
        self.classifier.build_classifier(training_data)
        # print(classifier)
        graph = self.classifier.graph
        node_number = float(graph.split('\n')[-3].split()[0].replace('N', ''))
        leaves_number = node_number / 2
        serialization.write(save_model_dir, self.classifier)
        # print('Leaves number is {0}'.format(leave_number), file=log_file)

        evaluation = Evaluation(training_data)
        predicts = evaluation.test_model(self.classifier, training_data)
        # return_value = None
        # if mimic_env is not None:
        predict_dictionary = {}
        for predict_index in range(len(predicts)):
            predict_value = predicts[predict_index]
            if predict_value in predict_dictionary.keys():
                predict_dictionary[predict_value].append(predict_index)
            else:
                predict_dictionary.update({predict_value: [predict_index]})

        # return_value = mimic_env.get_return(state=list(predict_dictionary.values()))
        return_value_log = mimic_env.get_return(
            state=list(predict_dictionary.values()))
        return_value_log_struct = mimic_env.get_return(
            state=list(predict_dictionary.values()), apply_structure_cost=True)
        return_value_var_reduction = mimic_env.get_return(
            state=list(predict_dictionary.values()),
            apply_variance_reduction=True)
        # print("Training return is {0}".format(return_value), file=log_file)

        summary = evaluation.summary()
        numbers = summary.split('\n')
        corr = float(numbers[1].split()[-1])
        mae = float(numbers[2].split()[-1])
        rmse = float(numbers[3].split()[-1])
        rae = float(numbers[4].split()[-2]) / 100
        rrse = float(numbers[5].split()[-2]) / 100
        # print(evl)
        # print("Training summary is "+summary, file=log_file)

        return return_value_log, return_value_log_struct, \
               return_value_var_reduction, mae, rmse, leaves_number
Exemple #15
0
def run_bayesNet(file):
    # Get filename from Pathlib object
    filename = file.parts[-1]
    dir = file.parents[0]

    print("Running BayesNet on %s" % filename)

    if not filename.endswith(".arff"):
        print("%s not ARFF file." % filename)
        return

    # Removes '.arff' from filename
    filename_base = filename[:-5]

    # Load data with class as first attr
    data = load_Arff_file(file)
    data.class_is_first()

    # Use BayesNet and set options
    cls = Classifier(classname="weka.classifiers.bayes.BayesNet",
                     options=[
                         "-D", "-Q",
                         "weka.classifiers.bayes.net.search.local.TAN", "--",
                         "-P", "1", "-S", "BAYES", "-E",
                         "weka.classifiers.bayes.net.estimate.SimpleEstimator",
                         "--", "-A", "0.5"
                     ])

    # Predictions stored in pout
    pout = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText")

    # Evaluate data
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(cls, data, 10, Random(1), output=pout)

    print(evaluation.summary())
    print(evaluation.class_details())
    print(evaluation.confusion_matrix)

    # Generate grid for ROC
    # plcls.plot_roc(evaluation, class_index=[0,1], wait=True)

    # mk dirs for output
    dir = dir / "bayesNet_results"
    dir.mkdir(parents=True, exist_ok=True)

    # Save summary, class details and confusion matrix to file
    result_output = filename_base + "_bayesNet_eval_results_TAN.txt"
    output_eval(evaluation, dir / result_output)

    # Save the predicited results to file
    prediction_output = filename_base + "_bayesNet_pred_results_TAN.txt"
    output_pred(pout, dir / prediction_output)

    print("BayesNet complete")
def e_model_tree():
    # train_data, test_data = b_i_impute_data()
    # train_data.to_csv("./train_data.csv", index=False)
    # test_data.to_csv("./test_data.csv",index=False)

    jvm.start()
    train_data = converters.load_any_file("train_data.csv")
    train_data.class_is_first()

    test_data = converters.load_any_file("test_data.csv")
    test_data.class_is_first()

    print("1")
    cls = Classifier(classname="weka.classifiers.trees.LMT")
    print("2")
    cls.build_classifier(train_data)

    print("3")
    evl = Evaluation(train_data)
    evl.crossvalidate_model(cls, train_data, 5, Random(1))
    print("Train Accuracy:", evl.percent_correct)
    print("Train summary")
    print(evl.summary())
    print("Train class details")
    print(evl.class_details())
    print("Train confusion matrix")
    print(evl.confusion_matrix)
    plcls.plot_roc(evl, class_index=[0, 1], wait=True)
    plt.suptitle("Train ROC Curve", fontsize=20, y=0.95)
    savefig("./plots/e_train_roc_curve.png")

    evl = Evaluation(test_data)
    evl.test_model(cls, test_data)
    print("Test Accuracy:", evl.percent_correct)
    print("Test summary")
    print(evl.summary())
    print(" Testclass details")
    print(evl.class_details())
    print("Testconfusion matrix")
    print(evl.confusion_matrix)
    plcls.plot_roc(evl, class_index=[0, 1], wait=True)
    plt.suptitle("Test ROC Curve", fontsize=20, y=0.95)
    savefig("./plots/e_test_roc_curve.png")
Exemple #17
0
def main(args):
    """
    Loads a dataset, shuffles it, splits it into train/test set. Trains J48 with training set and
    evaluates the built model on the test set.
    The predictions get recorded in two different ways:
    1. in-memory via the test_model method
    2. directly to file (more memory efficient), but a separate run of making predictions

    :param args: the commandline arguments (optional, can be dataset filename)
    :type args: list
    """

    # load a dataset
    if len(args) <= 1:
        data_file = helper.get_data_dir() + os.sep + "vote.arff"
    else:
        data_file = args[1]
    helper.print_info("Loading dataset: " + data_file)
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(data_file)
    data.class_is_last()

    # generate train/test split of randomized data
    train, test = data.train_test_split(66.0, Random(1))

    # build classifier
    cls = Classifier(classname="weka.classifiers.trees.J48")
    cls.build_classifier(train)
    print(cls)

    # evaluate and record predictions in memory
    helper.print_title("recording predictions in-memory")
    output = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.CSV",
        options=["-distribution"])
    evl = Evaluation(train)
    evl.test_model(cls, test, output=output)
    print(evl.summary())
    helper.print_info("Predictions:")
    print(output.buffer_content())

    # record/output predictions separately
    helper.print_title("recording/outputting predictions separately")
    outputfile = helper.get_tmp_dir() + "/j48_vote.csv"
    output = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.CSV",
        options=["-distribution", "-suppress", "-file", outputfile])
    output.header = test
    output.print_all(cls, test)
    helper.print_info("Predictions stored in:" + outputfile)
    # by using "-suppress" we don't store the output in memory, the following statement won't output anything
    print(output.buffer_content())
Exemple #18
0
    def test_weka_model(self,
                        testing_data_dir,
                        save_model_dir,
                        log_file,
                        mimic_env=None):
        self.classifier = Classifier(
            jobject=serialization.read(save_model_dir))

        graph = self.classifier.graph
        node_number = float(graph.split('\n')[-3].split()[0].replace('N', ''))
        leaves_number = node_number / 2
        serialization.write(save_model_dir, self.classifier)
        # print('Leaves number is {0}'.format(leave_number), file=log_file)

        loader = Loader(classname="weka.core.converters.CSVLoader")
        testing_data = loader.load_file(testing_data_dir)
        testing_data.class_is_last()

        evaluation = Evaluation(testing_data)
        predicts = evaluation.test_model(self.classifier, testing_data)

        predict_dictionary = {}
        for predict_index in range(len(predicts)):
            predict_value = predicts[predict_index]
            if predict_value in predict_dictionary.keys():
                predict_dictionary[predict_value].append(predict_index)
            else:
                predict_dictionary.update({predict_value: [predict_index]})

        return_value_log = mimic_env.get_return(
            state=list(predict_dictionary.values()))
        return_value_log_struct = mimic_env.get_return(
            state=list(predict_dictionary.values()), apply_structure_cost=True)
        return_value_var_reduction = mimic_env.get_return(
            state=list(predict_dictionary.values()),
            apply_variance_reduction=True)

        summary = evaluation.summary()
        numbers = summary.split('\n')
        corr = float(numbers[1].split()[-1])
        mae = float(numbers[2].split()[-1])
        rmse = float(numbers[3].split()[-1])
        rae = float(numbers[4].split()[-2]) / 100
        rrse = float(numbers[5].split()[-2]) / 100
        # print(evl)
        # print("Testing summary is "+summary, file=log_file)

        return return_value_log, return_value_log_struct, \
               return_value_var_reduction, mae, rmse, leaves_number
Exemple #19
0
def trainAndMakePred(train, test):
	#IBK test and prediction 
	classifierIBK = Classifier(classname="weka.classifiers.lazy.IBk", options=["-K", "5"])
	classifierIBK.build_classifier(train)
	evaluationIBK = Evaluation(train)
	predicted_labelsIBK = evaluationIBK.test_model(classifierIBK, train)
	print(" IBKTraining information ")
	print(evaluationIBK.summary())
	pred_outputIBK = PredictionOutput(classname="weka.classifiers.evaluation.output.prediction.CSV")
	evaluationIBK = Evaluation(test)
	predicted_indicesIBK = evaluationIBK.test_model(classifierIBK, test, pred_outputIBK)
	print(" IBK Prediction information ")
	print(pred_outputIBK)
	
	#Naive bayes and prediction
	classifierNB = Classifier(classname="weka.classifiers.bayes.NaiveBayes", options=["-D"])
	classifierNB.build_classifier(train)
	evaluationNB = Evaluation(train)
	predicted_labelsNB = evaluationNB.test_model(classifierNB, train)
	print(" Naive Bayes Training information ")
	print(evaluationNB.summary())
	pred_outputNB = PredictionOutput(classname="weka.classifiers.evaluation.output.prediction.CSV")
	evaluationNB = Evaluation(test)
	predicted_indicesNB = evaluationNB.test_model(classifierNB, test, pred_outputNB)
	print(" Naive Bayes Prediction information ")
	print(pred_outputNB)
	
	#out put predictions to file
	a = 1
	ID = 901
	f = open("predict.csv", "w")
	f.write("ID,Predict 1,Predict 2\n")
	for pred1, pred2 in zip(predicted_indicesIBK, predicted_indicesNB):
		f.write("%s,%s,%s\n" % (ID,pred1,pred2))
		ID += 1
	f.close() 
Exemple #20
0
def CV5x2(dataset,  algo, num_datasets):

	loader = Loader(classname="weka.core.converters.ArffLoader")
	data = loader.load_file(dataset)
	data.class_is_last()

	cls = Classifier(classname=algo)

	evl = Evaluation(data)
	evl.crossvalidate_model(cls, data, 2, Random(5))

	print(evl.summary("=== " +str(algo)+ " on" + str(dataset) + " ===",False))
        print(evl.matrix("=== on click prediction(confusion matrix) ==="))
	print("For Algo"+ str(algo)+"areaUnderROC/1: for CV5x2 " + str(evl.area_under_roc(1)))

	return evl.area_under_roc(1)
    def crossValidate(self, arrfFile = None, classname="weka.classifiers.trees.J48", options=["-C", "0.3"]):
        
        if arrfFile is not None:
            self.initData( arrfFile )
            
        if self.data is None:
            return 

        print 'Classificador ' + str(classname) + ' ' + ' '.join(options)
        cls = Classifier(classname=classname, options=options)
        
        evl = Evaluation(self.data)
        evl.crossvalidate_model(cls, self.data, 10, Random(1))

        print(evl.percent_correct)
        print(evl.summary())
        print(evl.class_details())
Exemple #22
0
def SimpleLogistic():
    # load a dataset
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file("First_trial_classification.arff")
    data.class_is_last()  # set class attribute

    cls = Classifier(classname="weka.classifiers.functions.SimpleLogistic")
    pout = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText")
    evl = Evaluation(data)
    evl.crossvalidate_model(cls, data, 10, Random(486), pout)

    print(evl.summary())
    print(pout.buffer_content())

    # save model
    serialization.write_all("SimpleLogistic2.model", cls)
    def cross_validate(self, detail = True):
        """Perform cross validation using trained data. 
        
        Parameters
        ----------
        detail : boolean, optional, default = True
            If true return a detailed information of cross validation.
            
        Returns
        -------
        info : string
            Info with results of cross validation.
        """
        
        #print 'cross_validation'
        
        start_time = TimeUtils.get_time()
        
        info =  "Scheme:\t%s %s\n" % (str(self.classifier.classname) , " ".join([str(option) for option in self.classifier.options]))
        
        if detail == True:
            info += "Relation:\t%s\n" % (self.data.relationname)
            info += "Instances:\t%d\n" % (self.data.num_instances)
            info += "Attributes:\t%d\n\n" % (self.data.num_attributes)
        
        evl = WEvaluation(self.data)
        evl.crossvalidate_model(self.classifier, self.data, 10, WRandom(1))
        
        if detail == False:
            info += "Correctly Classified Instances: %0.4f%%\n" % (evl.percent_correct)

        info += "Time taken to build model: %0.5f seconds\n\n" % (TimeUtils.get_time() - start_time)
        #info += str(evl.percent_correct) + "\n\n"
        
        if detail == True:
            info += "=== Stratified cross-validation ===\n"
            info += evl.summary() + "\n\n"
            
            info += str(evl.class_details()) + "\n\n"
            
            classes = [str(self.data.class_attribute.value(i)) for i in range(0, self.data.class_attribute.num_values)]
            cm = evl.confusion_matrix
            info += Classifier.confusion_matrix(classes, cm)

        return info
Exemple #24
0
    def run_bayes_hill_split(self, output_directory, parents=1):
        # build classifier
        print("\nBuilding Bayes Classifier on training data. Parents = " +
              str(parents) + "\n")
        buildTimeStart = time.time()
        cls = Classifier(
            classname="weka.classifiers.bayes.BayesNet",
            options=[
                "-D", "-Q",
                "weka.classifiers.bayes.net.search.local.HillClimber", "--",
                "-P", "" + str(parents), "-S", "BAYES", "-E",
                "weka.classifiers.bayes.net.estimate.SimpleEstimator", "--",
                "-A", "0.5"
            ])
        cls.build_classifier(self.training_data)

        resultsString = ""
        resultsString = self.print_both(str(cls), resultsString)

        buildTimeString = "Bayes Split Classifier Built in " + str(
            time.time() - buildTimeStart) + " secs.\n"
        resultsString = self.print_both(buildTimeString, resultsString)

        #Evaluate Classifier
        resultsString = self.print_both("\nEvaluating on test data.",
                                        resultsString)

        buildTimeStart = time.time()
        evl = Evaluation(self.training_data)
        evl.test_model(cls, self.testing_data)

        resultsString = self.print_both(str(evl.summary()), resultsString)
        resultsString = self.print_both(str(evl.class_details()),
                                        resultsString)
        resultsString = self.print_both(str(evl.confusion_matrix),
                                        resultsString)
        buildTimeString = "\nBayes Split Classifier Evaluated in " + str(
            time.time() - buildTimeStart) + " secs.\n"
        resultsString = self.print_both(buildTimeString, resultsString)

        #Save Results and Cleanup
        self.save_results("Bayes_Hill_P" + str(parents) + "_", resultsString,
                          output_directory)
        self.save_results("Bayes_Hill_P" + str(parents) + "_Graph", cls.graph,
                          output_directory, True)
Exemple #25
0
    def run_crossval(self, output_directory, classifier_name,
                     classifier_weka_spec, options_list):
        # build classifier
        print("\nBuilding " + classifier_name +
              " Classifier on training data.")
        buildTimeStart = time.time()
        cls = Classifier(classname=classifier_weka_spec, options=options_list)
        cls.build_classifier(self.training_data)

        resultsString = ""
        resultsString = self.print_both(str(cls), resultsString)

        buildTimeString = classifier_name + " Cross Eval Classifier Built in " + str(
            time.time() - buildTimeStart) + " secs.\n"
        resultsString = self.print_both(buildTimeString, resultsString)

        #Evaluate Classifier
        resultsString = self.print_both("\nCross Evaluating on test data.",
                                        resultsString)

        buildTimeStart = time.time()
        evl = Evaluation(self.training_data)
        evl.crossvalidate_model(cls, self.training_data, 10, Random(1))

        resultsString = self.print_both(str(evl.summary()), resultsString)
        resultsString += "\n"
        resultsString = self.print_both(str(evl.class_details()),
                                        resultsString)
        resultsString += "\n"
        resultsString = self.print_both(str(evl.confusion_matrix),
                                        resultsString)
        buildTimeString = "\n\n" + classifier_name + " Cross Eval Classifier Evaluated in " + str(
            time.time() - buildTimeStart) + " secs.\n"
        resultsString = self.print_both(buildTimeString, resultsString)

        options_string = ""
        for option in options_list:
            options_string = options_string + str(option)

        options_string = options_string.replace(".", "-")
        options_string = options_string.replace("-", "_")
        #Save Results and Cleanup
        self.save_results(classifier_name + options_string + "_Crossval",
                          resultsString, output_directory)
def use_classifier(data):
    """
    Uses the meta-classifier AttributeSelectedClassifier for attribute selection.
    :param data: the dataset to use
    :type data: Instances
    """
    print("\n1. Meta-classifier")
    classifier = Classifier(classname="weka.classifiers.meta.AttributeSelectedClassifier")
    aseval = ASEvaluation(classname="weka.attributeSelection.CfsSubsetEval")
    assearch = ASSearch(classname="weka.attributeSelection.GreedyStepwise", options=["-B"])
    base = Classifier(classname="weka.classifiers.trees.J48")
    # setting nested options is always a bit tricky, getting all the escaped double quotes right
    # simply using the bean property for setting Java objects is often easier and less error prone
    classifier.set_property("classifier", base.jobject)
    classifier.set_property("evaluator", aseval.jobject)
    classifier.set_property("search", assearch.jobject)
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(classifier, data, 10, Random(1))
    print(evaluation.summary())
def use_classifier(data):
    """
    Uses the meta-classifier AttributeSelectedClassifier for attribute selection.
    :param data: the dataset to use
    :type data: Instances
    """
    print("\n1. Meta-classifier")
    classifier = Classifier(classname="weka.classifiers.meta.AttributeSelectedClassifier")
    aseval = ASEvaluation(classname="weka.attributeSelection.CfsSubsetEval")
    assearch = ASSearch(classname="weka.attributeSelection.GreedyStepwise", options=["-B"])
    base = Classifier(classname="weka.classifiers.trees.J48")
    # setting nested options is always a bit tricky, getting all the escaped double quotes right
    # simply using the bean property for setting Java objects is often easier and less error prone
    classifier.set_property("classifier", base.jobject)
    classifier.set_property("evaluator", aseval.jobject)
    classifier.set_property("search", assearch.jobject)
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(classifier, data, 10, Random(1))
    print(evaluation.summary())
Exemple #28
0
def HOV(dataset,  algo, num_datasets):
	#Executing HOV \_*-*_/

	loader = Loader(classname="weka.core.converters.ArffLoader")
	data = loader.load_file(dataset)
	data.class_is_last()

	train, test = data.train_test_split(70.0, Random(10))

	cls = Classifier(classname=algo)
	cls.build_classifier(train)

	evl = Evaluation(train)
	evl.test_model(cls, test)

	print(evl.summary("=== " +str(algo)+ " on" + str(dataset) + " ===",False))
        print(evl.matrix("=== on click prediction(confusion matrix) ==="))
	print("For Algo"+ str(algo)+"areaUnderROC/1: for HOV " + str(evl.area_under_roc(1)))

	return evl.area_under_roc(1)
Exemple #29
0
    def crossValidate(self,
                      arrfFile=None,
                      classname="weka.classifiers.trees.J48",
                      options=["-C", "0.3"]):

        if arrfFile is not None:
            self.initData(arrfFile)

        if self.data is None:
            return

        print 'Classificador ' + str(classname) + ' ' + ' '.join(options)
        cls = Classifier(classname=classname, options=options)

        evl = Evaluation(self.data)
        evl.crossvalidate_model(cls, self.data, 10, Random(1))

        print(evl.percent_correct)
        print(evl.summary())
        print(evl.class_details())
def TrainingModel(arff, modelOutput, clsfier):
    # 启动java虚拟机
    jvm.start()
    # 导入训练集
    loader = Loader(classname="weka.core.converters.ArffLoader")
    train = loader.load_file(arff)
    train.class_is_first()
    # 使用RandomForest算法进行训练,因为在GUI版本weka中使用多种方式训练后发现此方式TPR与TNR较高
    cls_name = "weka.classifiers." + clsfier
    clsf = Classifier(classname=cls_name)
    clsf.build_classifier(train)
    print(clsf)
    # 建立模型
    fc = FilteredClassifier()
    fc.classifier = clsf
    evl = Evaluation(train)
    evl.crossvalidate_model(fc, train, 10, Random(1))
    print(evl.percent_correct)
    print(evl.summary())
    print(evl.class_details())
    print(evl.matrix())
    # 结果统计
    matrixResults = evl.confusion_matrix
    TN = float(matrixResults[0][0])
    FP = float(matrixResults[0][1])
    FN = float(matrixResults[1][0])
    TP = float(matrixResults[1][1])
    TPR = TP / (TP + FN)
    TNR = TN / (FP + TN)
    PPV = TP / (TP + FP)
    NPV = TN / (TN + FN)
    print("算法: " + clsfier)
    print("敏感度 TPR: " + str(TPR))
    print("特异度 TNR: " + str(TNR))
    print("PPV: " + str(PPV))
    print("NPV: " + str(NPV))
    # 保存模型
    clsf.serialize(modelOutput, header=train)
    # 退出虚拟机
    jvm.stop()
    print("分析模型建立完成")
Exemple #31
0
def SMOreg():
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file("First_trial_regression.arff")
    data.class_is_last()

    cls = KernelClassifier(classname="weka.classifiers.functions.SMOreg",
                           options=["-N", "0"])
    kernel = Kernel(
        classname="weka.classifiers.functions.supportVector.RBFKernel",
        options=["-G", "0.2"])
    cls.kernel = kernel
    pout = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText")
    evl = Evaluation(data)
    evl.crossvalidate_model(cls, data, 10, Random(486), pout)

    print(evl.summary())
    print(pout.buffer_content())

    # save model
    serialization.write_all("SMOreg.model2", cls)
def DecisionTree(rnd_data, folds, seed, data):

    data_size = rnd_data.num_instances
    fold_size = math.floor(data_size / folds)

    # cross-validation
    evaluation = Evaluation(rnd_data)
    for i in range(folds):
        this_fold = fold_size
        test_start = i * fold_size
        test_end = (test_start + fold_size)
        if ((data_size - test_end) / fold_size < 1):
            this_fold = data_size - test_start
        test = Instances.copy_instances(rnd_data, test_start,
                                        this_fold)  # generate validation fold
        if i == 0:
            train = Instances.copy_instances(rnd_data, test_end,
                                             data_size - test_end)
        else:
            train_1 = Instances.copy_instances(rnd_data, 0, test_start)
            train_2 = Instances.copy_instances(rnd_data, test_end,
                                               data_size - test_end)
            train = Instances.append_instances(
                train_1, train_2)  # generate training fold

        # build and evaluate classifier
        cls = Classifier(classname="weka.classifiers.trees.J48")
        cls.build_classifier(train)  # build classifier on training set
        evaluation.test_model(cls,
                              test)  # test classifier on validation/test set

    print("")
    print("=== Decision Tree ===")
    print("Classifier: " + cls.to_commandline())
    print("Dataset: " + data.relationname)
    print("Folds: " + str(folds))
    print("Seed: " + str(seed))
    print("")
    print(
        evaluation.summary("=== " + str(folds) + "-fold Cross-Validation ==="))
Exemple #33
0
    def run_ibk_crossval(self, output_directory):
        # build classifier
        print("\nBuilding Classifier on training data.")
        buildTimeStart = time.time()
        cls = Classifier(
            classname="weka.classifiers.lazy.IBk",
            options=[
                "-K", "3", "-W", "0", "-A",
                "weka.core.neighboursearch.LinearNNSearch -A \"weka.core.EuclideanDistance -R first-last\""
            ])
        cls.build_classifier(self.training_data)

        resultsString = ""
        resultsString = self.print_both(str(cls), resultsString)

        buildTimeString = "IBK Cross Eval Classifier Built in " + str(
            time.time() - buildTimeStart) + " secs.\n"
        resultsString = self.print_both(buildTimeString, resultsString)

        #Evaluate Classifier
        resultsString = self.print_both("\nCross Evaluating on test data.",
                                        resultsString)

        buildTimeStart = time.time()
        evl = Evaluation(self.training_data)
        evl.crossvalidate_model(cls, self.training_data, 10, Random(1))

        resultsString = self.print_both(str(evl.summary()), resultsString)
        resultsString = self.print_both(str(evl.class_details()),
                                        resultsString)
        resultsString = self.print_both(str(evl.confusion_matrix),
                                        resultsString)
        buildTimeString = "\nIBK Cross Eval Classifier Evaluated in " + str(
            time.time() - buildTimeStart) + " secs.\n"
        resultsString = self.print_both(buildTimeString, resultsString)

        #Save Results and Cleanup
        self.save_results("IBK_Crossval", resultsString, output_directory)
Exemple #34
0
from weka.classifiers import Classifier, Evaluation

jvm.start()

# load weather.nominal
loader = Loader(classname="weka.core.converters.ArffLoader")
fname = data_dir + os.sep + "weather.nominal.arff"
print("\nLoading dataset: " + fname + "\n")
data = loader.load_file(fname)
data.class_is_last()

# perform 10-fold cross-validation
cls = Classifier(classname="weka.classifiers.rules.OneR")
evl = Evaluation(data)
evl.crossvalidate_model(cls, data, 10, Random(1))
print("10-fold cross-validation (full):\n" + evl.summary())
cls.build_classifier(data)
print("Model:\n\n" + str(cls))

# remove attribute "outlook"
print("Removing attribute 'outlook'")
data.delete_attribute(data.attribute_by_name("outlook").index)

# perform 10-fold cross-validation (reduced dataset)
cls = Classifier(classname="weka.classifiers.rules.OneR")
evl = Evaluation(data)
evl.crossvalidate_model(cls, data, 10, Random(1))
print("10-fold cross-validation (without 'outlook'):\n" + evl.summary())
cls.build_classifier(data)
print("Model:\n\n" + str(cls))
Exemple #35
0
    def runner(self, cdat, heap_size = 16384, seed = None, verbose = True):
        self.set_status(Pipeline.RUNNING)

        self.logs.append('Initializing Pipeline')

        para = self.config

        self.logs.append('Reading Pipeline Configuration')

        head = ''
        name = get_rand_uuid_str()

        self.logs.append('Reading Input File')

        for i, stage in enumerate(self.stages):
            if stage.code in ('dat.fle', 'prp.bgc', 'prp.nrm', 'prp.pmc', 'prp.sum'):
                self.stages[i].status = Pipeline.RUNNING
            if stage.code ==  'dat.fle':
                head    = os.path.abspath(stage.value.path)
                name, _ = os.path.splitext(stage.value.name)

        self.logs.append('Parsing to ARFF')

        path = os.path.join(head, '{name}.arff'.format(name = name))
        # This bug, I don't know why, using Config.schema instead.
        # cdat.toARFF(path, express_config = para.Preprocess.schema, verbose = verbose)

        for i, stage in enumerate(self.stages):
            if stage.code in ('dat.fle', 'prp.bgc', 'prp.nrm', 'prp.pmc', 'prp.sum'):
                self.stages[i].status = Pipeline.COMPLETE

        self.logs.append('Saved ARFF at {path}'.format(path = path))
        self.logs.append('Splitting to Training and Testing Sets')

        JVM.start(max_heap_size = '{size}m'.format(size = heap_size))

        load = Loader(classname = 'weka.core.converters.ArffLoader')
        # data = load.load_file(path)
        # save =  Saver(classname = 'weka.core.converters.ArffSaver')
        data = load.load_file(os.path.join(head, 'iris.arff')) # For Debugging Purposes Only
        data.class_is_last() # For Debugging Purposes Only
        # data.class_index = cdat.iclss

        for i, stage in enumerate(self.stages):
            if stage.code == 'prp.kcv':
                self.stages[i].status = Pipeline.RUNNING

        self.logs.append('Splitting Training Set')

        # TODO - Check if this seed is worth it.
        seed = assign_if_none(seed, random.randint(0, 1000))
        opts = ['-S', str(seed), '-N', str(para.Preprocess.FOLDS)]
        wobj = Filter(classname = 'weka.filters.supervised.instance.StratifiedRemoveFolds', options = opts + ['-V'])
        wobj.inputformat(data)

        tran = wobj.filter(data)

        self.logs.append('Splitting Testing Set')

        wobj.options = opts
        test = wobj.filter(data)

        for i, stage in enumerate(self.stages):
            if stage.code == 'prp.kcv':
                self.stages[i].status = Pipeline.COMPLETE

        self.logs.append('Performing Feature Selection')

        feat = [ ]
        for comb in para.FEATURE_SELECTION:
            if comb.USE:
                for i, stage in enumerate(self.stages):
                    if stage.code == 'ats':
                        search    = stage.value.search.name
                        evaluator = stage.value.evaluator.name

                        if search == comb.Search.NAME and evaluator == comb.Evaluator.NAME:
                            self.stages[i].status = Pipeline.RUNNING

                srch = ASSearch(classname = 'weka.attributeSelection.{classname}'.format(
                    classname = comb.Search.NAME,
                    options   = assign_if_none(comb.Search.OPTIONS, [ ])
                ))
                ewal = ASEvaluation(classname = 'weka.attributeSelection.{classname}'.format(
                    classname = comb.Evaluator.NAME,
                    options   = assign_if_none(comb.Evaluator.OPTIONS, [ ])
                ))

                attr = AttributeSelection()
                attr.search(srch)
                attr.evaluator(ewal)
                attr.select_attributes(tran)

                meta = addict.Dict()
                meta.search    = comb.Search.NAME
                meta.evaluator = comb.Evaluator.NAME
                meta.features  = [tran.attribute(index).name for index in attr.selected_attributes]

                feat.append(meta)

                for i, stage in enumerate(self.stages):
                    if stage.code == 'ats':
                        search    = stage.value.search.name
                        evaluator = stage.value.evaluator.name

                        if search == comb.Search.NAME and evaluator == comb.Evaluator.NAME:
                            self.stages[i].status = Pipeline.COMPLETE

        models = [ ]
        for model in para.MODEL:
            if model.USE:
                summary         = addict.Dict()

                self.logs.append('Modelling {model}'.format(model = model.LABEL))

                summary.label   = model.LABEL
                summary.name    = model.NAME
                summary.options = assign_if_none(model.OPTIONS, [ ])

                for i, stage in enumerate(self.stages):
                    if stage.code == 'lrn' and stage.value.name == model.NAME:
                        self.stages[i].status = Pipeline.RUNNING

                for i, instance in enumerate(data):
                    iclass = list(range(instance.num_classes))
                
                options    = assign_if_none(model.OPTIONS, [ ])
                classifier = Classifier(classname = 'weka.classifiers.{classname}'.format(classname = model.NAME), options = options)
                classifier.build_classifier(tran)
        
                serializer.write(os.path.join(head, '{name}.{classname}.model'.format(
                        name = name,
                    classname = model.NAME
                )), classifier)

                self.logs.append('Testing model {model}'.format(model = model.LABEL))

                evaluation       = Evaluation(tran)
                evaluation.test_model(classifier, test)

                summary.summary  = evaluation.summary()

                frame  = pd.DataFrame(data = evaluation.confusion_matrix)
                axes   = sns.heatmap(frame, cbar = False, annot = True)
                b64str = get_b64_plot(axes)
                
                summary.confusion_matrix = addict.Dict({
                    'value': evaluation.confusion_matrix.tolist(),
                     'plot': b64str
                })

                self.logs.append('Plotting Learning Curve for {model}'.format(model = model.LABEL))

                buffer = io.BytesIO()
                plot_classifier_errors(evaluation.predictions, tran, test, outfile = buffer, wait = False)
                b64str = buffer_to_b64(buffer)

                summary.learning_curve   = b64str

                buffer = io.BytesIO()
                plot_roc(evaluation, class_index = iclass, outfile = buffer, wait = False)
                b64str = buffer_to_b64(buffer)

                summary.roc_curve        = b64str

                buffer = io.BytesIO()
                plot_prc(evaluation, class_index = iclass, outfile = buffer, wait = False)
                b64str = buffer_to_b64(buffer)

                summary.prc_curve        = b64str

                if classifier.graph:
                    summary.graph = classifier.graph

                for i, instance in enumerate(test):
                    prediction = classifier.classify_instance(instance)

                for i, stage in enumerate(self.stages):
                    if stage.code == 'lrn' and stage.value.name == model.NAME:
                        self.stages[i].status = Pipeline.COMPLETE

                models.append(summary)

        self.gist.models = models

        JVM.stop()

        JSON.write(os.path.join(head, '{name}.cgist'.format(name = name)), self.gist)

        self.logs.append('Pipeline Complete')

        self.set_status(Pipeline.COMPLETE)
Exemple #36
0
data_dir = os.environ.get("WEKAMOOC_DATA")
if data_dir is None:
    data_dir = "." + os.sep + "data"

import weka.core.jvm as jvm
from weka.core.converters import Loader
from weka.classifiers import Classifier, Evaluation, PredictionOutput
from weka.core.classes import Random
import weka.plot.classifiers as plc

jvm.start()

# load weather.nominal
fname = data_dir + os.sep + "weather.nominal.arff"
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file(fname)
data.class_is_last()

# cross-validate NaiveBayes
cls = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
pout = PredictionOutput(classname="weka.classifiers.evaluation.output.prediction.PlainText", options=["-distribution"])
evl = Evaluation(data)
evl.crossvalidate_model(cls, data, 10, Random(1), pout)
print(evl.summary())
print(evl.matrix())
print(pout)
plc.plot_roc(evl, wait=True)

jvm.stop()
Exemple #37
0
from weka.core.classes import Random
from weka.classifiers import Classifier, Evaluation

jvm.start()

for dataset in ["diabetes.arff", "breast-cancer.arff"]:
    # load dataset
    loader = Loader(classname="weka.core.converters.ArffLoader")
    fname = data_dir + os.sep + dataset
    print("\nLoading dataset: " + fname + "\n")
    data = loader.load_file(fname)
    data.class_is_last()

    # cross-validate default J48, display model
    cls = Classifier(classname="weka.classifiers.trees.J48")
    evl = Evaluation(data)
    evl.crossvalidate_model(cls, data, 10, Random(1))
    print("10-fold cross-validation (default):\n" + evl.summary())
    cls.build_classifier(data)
    print("Model (default):\n\n" + str(cls))

    # cross-validate unpruned J48, display model
    cls = Classifier(classname="weka.classifiers.trees.J48", options=["-U"])
    evl = Evaluation(data)
    evl.crossvalidate_model(cls, data, 10, Random(1))
    print("10-fold cross-validation (unpruned):\n" + evl.summary())
    cls.build_classifier(data)
    print("Model (unpruned):\n\n" + str(cls))

jvm.stop()
Exemple #38
0
def process_classifier(runType, cls, occ, devList, fewCats, label, subtract):
	global devCount
	global save_orig
	global save_subtract
	conf_matrix = {}

	if occ:
		table = 'temp_dat_occ_vector_occ'
	else:
		table = 'temp_dat_occ_vector_2'

	writeStr = '=========================================================================================\n' + \
		'Running ' + runType + ' classifier for \'' + label + '\''
	sys.stdout.write(writeStr + '\r')
	total_conf.write(writeStr + '\n')
	sys.stdout.flush()

	if runType == 'unseen':
		i = 0
		indiv_results = {}
		for dev in devList:
			devCount += 1
			remaining = chop_microseconds(((datetime.utcnow() - item_start)*totalDevs/devCount)-(datetime.utcnow() - item_start))
			sys.stdout.write('Running ' + runType + ' classifier for \'' + label + '\' - ' + \
				str(round(100*float(devCount)/totalDevs,2)) + ' pct complete (' + str(remaining) + ' remaining)                 \r')
			sys.stdout.flush()

			if fewCats:
				aws_c.execute('select * from ' + table + ' ' \
					'where duty!=0 and deviceMAC not in (select * from vector_reject) ' \
					'and deviceMAC in (select * from id_fewcats_mac) '
					'and deviceMAC!=\'' + dev + '\';')
			else:
				aws_c.execute('select * from ' + table + ' ' \
					'where duty!=0 and deviceMAC not in (select * from vector_reject) ' \
					'and deviceMAC!=\'' + dev + '\';')
			results = aws_c.fetchall()

			# Generate type list
			total_types = ['{']
			for data in results:
				if(data[-1] not in total_types):
					total_types.append('\"')
					total_types.append(data[-1])
					total_types.append('\"')
					total_types.append(',')
			total_types[-1] = '}'
			typeStr = ''.join(total_types)

			arff_train = label + '_' + dev + '_train'
			arff_test = label + '_' + dev + '_test'

			gen_arff(arff_train, typeStr, results, occ, arff_idcol)

			if fewCats:
				aws_c.execute('select * from ' + table + ' ' \
					'where duty!=0 and deviceMAC not in (select * from vector_reject) ' \
					'and deviceMAC in (select * from id_fewcats_mac) '
					'and deviceMAC=\'' + dev + '\';')
			else:
				aws_c.execute('select * from ' + table + ' ' \
					'where duty!=0 and deviceMAC not in (select * from vector_reject) ' \
					'and deviceMAC=\'' + dev + '\';')
			gen_arff(arff_test, typeStr, aws_c.fetchall(), occ, arff_idcol)

			train = loader.load_file(arff_train + '.arff')
			train.class_is_last()
			mv(arff_train + '.arff', master_saveDir)
			test = loader.load_file(arff_test + '.arff')
			test.class_is_last()
			mv(arff_test + '.arff', master_saveDir)

			cls.build_classifier(train)

			# output predictions
			testName = ''
			predictions = []
			for index, inst in enumerate(test):
				if testName != '':
					if testName != inst.get_string_value(inst.class_index):
						print(str(testName) + ' ' + str(inst.get_string_value(inst.class_index)))
						exit()
					else:
						testName = inst.get_string_value(inst.class_index)	
				else:
					testName = inst.get_string_value(inst.class_index)

				if testName not in conf_matrix:
					conf_matrix[testName] = {}

				pred = cls.classify_instance(inst)
				# dist = cls.distribution_for_instance(inst)
				# if(pred == inst.get_value(inst.class_index)):
				predName = inst.class_attribute.value(int(pred))
				if predName not in conf_matrix[testName]:
					conf_matrix[testName][predName] = 0
				conf_matrix[testName][predName] += 1
				predictions.append(predName)

			total = 0
			if testName != '':
				for predName in conf_matrix[testName]:
					if predName == testName:
						correct = conf_matrix[testName][predName]
						total += correct
					else:
						total += conf_matrix[testName][predName]


			# while (len(predictions) * 2) <= 100:
			# 	predictions += pyrandom.sample(predictions, len(predictions))
			# if len(predictions) < 100:
			# 	predictions += pyrandom.sample(predictions, 100 - len(predictions))

			lots_predictions = []
			while len(lots_predictions) < 10000:
				lots_predictions += pyrandom.sample(predictions, 1)

			#indiv_results[dev] = [testName, pyrandom.sample(predictions, 100)]

			indiv_results[dev] = [testName, lots_predictions]

			# while len(predictions) < 100:
			# 	predictions += pyrandom.sample(predictions, 1)

			# indiv_results[dev] = [testName, predictions]

			# indiv_results[dev] = [testName, predictions]

			# Prep to print the how-many-days graph
			# days_output.write('\n\n\"' + dev + '\"\n')


			
			#print(str(testName) + ' ' + str(correct) + ' ' + str(total) + ' ' + str(float(correct)/total))

			# i += 1
			# if i == 10:
			# 	break


		correct, total = print_conf_matrix(conf_matrix, sys.stdout, False, False, False)
		correct, total = print_conf_matrix(conf_matrix, total_conf, False, False, False)

		if subtract == 'orig':
			save_orig = copy.deepcopy(conf_matrix)
		elif subtract == 'subtract':
			save_subtract = copy.deepcopy(conf_matrix)

		final_result = round(100*float(correct)/total,2)

		writeStr = '\nCorrectly Classified Instances\t\t' + str(correct) + '\t\t' + str(final_result) + '\n' + \
			'Incorrectly Classified Instances\t' + str(total-correct) + '\t\t' + str(round(100*float(total-correct)/total,2)) + '\n' + \
			'Total Number of Instances\t\t' + str(total) + '\n'
		print(writeStr)
		total_conf.write(writeStr + '\n')

		conf_interval = 10
		total_instances = float(sum([sum([conf_matrix[test][pred] for pred in conf_matrix[test]]) for test in conf_matrix]))

		p_d = {}
		p_e = {}
		p_e_given_d = {}
		for testName in conf_matrix:
			count_d = float(sum([conf_matrix[testName][label] for label in conf_matrix[testName]]))
			p_d[testName] = count_d / total_instances
			p_e[testName] = float(sum([conf_matrix[label][testName] for label in conf_matrix if testName in conf_matrix[label]]) / total_instances)
			p_e_given_d[testName] = {}

			for predName in conf_matrix:
				if predName in conf_matrix[testName]:
					p_e_given_d[testName][predName] = conf_matrix[testName][predName] / count_d
				else:
					p_e_given_d[testName][predName] = 0

		confidence = open('confidence.dat', 'w')
		for testName in conf_matrix:
			confidence.write('\n\n\"' + testName + '\"\n')
			print(testName)

			for classEvents in range(1, (conf_interval+1)):
				numerator = math.pow(p_e_given_d[testName][testName], classEvents) * p_d[testName]
				demoninator = 0
				for otherName in conf_matrix:
					demoninator += math.pow(p_e_given_d[otherName][testName], classEvents) * p_d[otherName]
				confidence.write(str(classEvents) + '\t' + str(numerator/demoninator) + '\n')
				print(str(classEvents) + '\t' + str(numerator/demoninator)) 
			print('')

		for predName in p_e_given_d['Router/Modem']:
			print('P( ' + predName + ' | Router/Modem ):\t' + str(p_e_given_d['Router/Modem'][predName]))

		for predName in p_e_given_d['Cable Box']:
			print('P( ' + predName + ' | Cable Box ):\t' + str(p_e_given_d['Cable Box'][predName]))

		#router = open('router', 'w')
		print('Router Stuff:')
		routerDev = 'Router/Modem'
		lampDev = 'Lamp'
		cableDev = 'Cable Box'
		origClassList = ['Router/Modem', 'Cable Box', 'Lamp', 'Router/Modem', 'Router/Modem', 'Cable Box', 'Router/Modem', 'Lamp', 'Router/Modem']

		classListList =  [['Router/Modem'] + list(listItem) for listItem in set(itertools.permutations(origClassList))]

		classListList = [
			['Router/Modem', 'Router/Modem', 'Lamp', 'Lamp', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Cable Box', 'Router/Modem'],
			['Router/Modem', 'Cable Box', 'Lamp', 'Lamp', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Cable Box', 'Router/Modem'],
			['Router/Modem', 'Router/Modem', 'Lamp', 'Lamp', 'Cable Box', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Cable Box', 'Router/Modem'],
			['Router/Modem', 'Cable Box', 'Lamp', 'Lamp', 'Cable Box', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Cable Box', 'Router/Modem'],
			['Router/Modem', 'Cable Box', 'Router/Modem', 'Lamp', 'Cable Box', 'Router/Modem', 'Router/Modem', 'Lamp', 'Router/Modem', 'Router/Modem'],
			['Router/Modem', 'Router/Modem', 'Router/Modem', 'Lamp', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Lamp', 'Router/Modem', 'Router/Modem'],
			['Router/Modem', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Lamp']
		]

		for idClass, classList in enumerate(classListList):
			print(idClass)
			for classEvents in range(1, (conf_interval+1)):
				numerator_router = p_d[routerDev]
				numerator_lamp = p_d[lampDev]
				numerator_cable = p_d[cableDev]
				for idx, classInst in enumerate(classList):
					if idx < classEvents:
						numerator_router *= p_e_given_d[routerDev][classInst]
						numerator_lamp *= p_e_given_d[lampDev][classInst]
						numerator_cable *= p_e_given_d[cableDev][classInst]
				demoninator = 0
				for otherName in conf_matrix:
					obsValue = p_d[otherName]
					for idx, classInst in enumerate(classList):
						if idx < classEvents:
							obsValue *= p_e_given_d[otherName][classInst]
					demoninator += obsValue
				print(str(classEvents) + '\t' + str(numerator_router/demoninator) + '\t' + str(numerator_lamp/demoninator) + '\t' + str(numerator_cable/demoninator) + '\t\"' + classList[classEvents-1]) + '\"'
			print('')

		numberDevList(indiv_results)

		eachDev = open('indiv_results.dat', 'w')
		newIDStream = open('new_id.dat', 'w')
		for devItem in indiv_results:
			print_obsResults(conf_matrix, conf_interval, p_d, p_e, p_e_given_d, indiv_results[devItem], eachDev, devItem, newIDStream)
		print('')
		print('total devices: ' + str(len(indiv_results)))
		# print('total devices: ' + str(total_devices))
		# print('total correct: ' + str(total_correct))
		# print('  pct correct: ' + str(round(100*float(total_correct)/total_devices,2)) + '\n')

		print('initial confidence: ' + str(round(100*float(sum(initial_confidence))/len(initial_confidence),2)))
		print('initial accuracy: ' + str(round(100*float(sum(initial_accuracy))/len(initial_accuracy),2)) + '\n')

		# print('final confidence (correct): ' + str(round(100*float(sum(final_confidence_correct))/len(final_confidence_correct),2)))
		# print('final confidence (correct): ' + str(round(100*float(sum(final_confidence_incorrect))/len(final_confidence_incorrect),2)))
		# print('final accuracy: ' + str(round(100*float(total_correct)/total_devices,2)))

		for devType in final_accuracy:
			print('final accuracy ' + devType + ' : ' + str(round(float(sum(final_accuracy[devType]))/len(final_accuracy[devType]),6)))
			print('final confidence (correct) ' + devType + ' : ' + str(round(float(sum(final_confidence_correct[devType]))/len(final_confidence_correct[devType]),6)))
			if len(final_confidence_incorrect[devType]) > 0:
				print('final confidence (incorrect) ' + devType + ' : ' + str(round(float(sum(final_confidence_incorrect[devType]))/len(final_confidence_incorrect[devType]),6)))
			else:
				print('final confidence (incorrect) ' + devType + ' : ' + str(0))
			print('final confidence ' + devType + ' : ' + str(round(float(sum(final_confidence_correct[devType])+sum(final_confidence_incorrect[devType]))/(len(final_confidence_correct[devType])+len(final_confidence_incorrect[devType])),2)))

		print_conf_matrix(new_conf_matrix, sys.stdout, False, False, False)

		for topType in actual_confidence_matrix:
			for botType in actual_confidence_matrix[topType]:
				storeArray = actual_confidence_matrix[topType][botType]
				if len(storeArray) > 0:
					actual_confidence_matrix[topType][botType] = round(sum(storeArray)/len(storeArray),2)
				else:
					actual_confidence_matrix[topType][botType] = 0

		print_conf_matrix(conf_matrix, sys.stdout, False, False, False)
		print_conf_matrix(actual_confidence_matrix, sys.stdout, False, False, False)
		print_conf_matrix(actual_confidence_matrix, sys.stdout, True, False, True)

		for devType in acc_over_time_dev:
			printOverTime(devType, acc_over_time_dev[devType], conf_over_time_dev[devType])
		printOverTime('total', acc_over_time, conf_over_time)

	elif runType == 'seen':
		if fewCats:
			aws_c.execute('select * from ' + table + ' ' \
				'where duty!=0 and deviceMAC not in (select * from vector_reject) ' \
				'and deviceMAC in (select * from id_fewcats_mac);')
		else:
			aws_c.execute('select * from ' + table + ' ' \
				'where duty!=0 and deviceMAC not in (select * from vector_reject);')
		results = aws_c.fetchall()

		devCount += 1
		remaining = chop_microseconds(((datetime.utcnow() - item_start)*totalDevs/devCount)-(datetime.utcnow() - item_start))
		sys.stdout.write('Running ' + runType + ' classifier for \'' + label + '\' - ' + \
			str(round(100*float(devCount)/totalDevs,2)) + ' pct complete (' + str(remaining) + ' remaining)                 \r')
		sys.stdout.flush()

		# Generate type list
		total_types = ['{']
		for data in results:
			if(data[-1] not in total_types):
				total_types.append('\"')
				total_types.append(data[-1])
				total_types.append('\"')
				total_types.append(',')
		total_types[-1] = '}'
		typeStr = ''.join(total_types)

		arff_file = label + '_train'

		gen_arff(arff_file, typeStr, results, occ, arff_idcol)

		train = loader.load_file(arff_file + '.arff')
		train.class_is_last()
		mv(arff_file + '.arff', master_saveDir)

		cls.build_classifier(train)

		evl = Evaluation(train)
		evl.crossvalidate_model(cls, train, 10, Random(1))

		print('\n')
		#print(evl.percent_correct)
		#print(evl.class_details())
		print(evl.matrix())
		total_conf.write('\n' + evl.matrix())
		print(evl.summary())
		total_conf.write(evl.summary() + '\n')

		final_result = round(evl.percent_correct, 2)

	else:
		success = []
		for startDev in devList:
			for changeToDev in devList:
				if startDev != changeToDev:
					devCount += 1
					remaining = chop_microseconds(((datetime.utcnow() - item_start)*totalDevs/devCount)-(datetime.utcnow() - item_start))
					sys.stdout.write('Running ' + runType + ' classifier for \'' + label + '\' - ' + \
						str(round(100*float(devCount)/totalDevs,2)) + ' pct complete (' + str(remaining) + ' remaining)                 \r')
					sys.stdout.flush()
					
					aws_c.execute('select * from temp_dat_occ_vector_2 ' \
						'where duty!=0 and deviceMAC in (\'' + startDev + '\',\'' + changeToDev + '\');')
					results = [x[:-1] + (x[1],) for x in aws_c.fetchall()]	# Class label is just the deviceMAC

					if len(results) > 10:

						# Generate type list
						typeStr = '{' + startDev + ',' + changeToDev + '}'

						arff_file = label + '_' + startDev + '_' + changeToDev + '_train'

						gen_arff(arff_file, typeStr, results, occ, arff_idcol)

						train = loader.load_file(arff_file + '.arff')
						train.class_is_last()
						mv(arff_file + '.arff', master_saveDir)

						cls.build_classifier(train)

						evl = Evaluation(train)
						evl.crossvalidate_model(cls, train, 10, Random(1))

						print('\n')
						#print(evl.percent_correct)
						#print(evl.class_details())
						print(evl.matrix())
						total_conf.write('\n' + evl.matrix())
						print(evl.summary())
						total_conf.write(evl.summary() + '\n')

						success.append(evl.percent_correct)

		if len(success) > 0:
			final_result = [sum(success)/len(success), percentile(success, 5), percentile(success, 10), percentile(success, 95)]
		else:
			final_result = False

	if label in total_results:
		print('Warning label ' + label + ' exists twice, overwriting...')
	if final_result != False:
		total_results[label] = final_result
Exemple #39
0
print("\nLoading dataset: " + fname + "\n")
data = loader.load_file(fname)
# we'll set the class attribute after filtering

# apply NominalToBinary filter and set class attribute
fltr = Filter("weka.filters.unsupervised.attribute.NominalToBinary")
fltr.inputformat(data)
filtered = fltr.filter(data)
filtered.class_is_last()

# cross-validate LinearRegression on filtered data, display model
cls = Classifier(classname="weka.classifiers.functions.LinearRegression")
pout = PredictionOutput(classname="weka.classifiers.evaluation.output.prediction.PlainText")
evl = Evaluation(filtered)
evl.crossvalidate_model(cls, filtered, 10, Random(1), pout)
print("10-fold cross-validation:\n" + evl.summary())
print("Predictions:\n\n" + str(pout))
cls.build_classifier(filtered)
print("Model:\n\n" + str(cls))

# use AddClassification filter with LinearRegression on filtered data
print("Applying AddClassification to filtered data:\n")
fltr = Filter(
    classname="weka.filters.supervised.attribute.AddClassification",
    options=["-W", "weka.classifiers.functions.LinearRegression", "-classification"])
fltr.inputformat(filtered)
classified = fltr.filter(filtered)
print(classified)

# convert class back to nominal
fltr = Filter(classname="weka.filters.unsupervised.attribute.NumericToNominal", options=["-R", "9"])
Exemple #40
0
jvm.start()

# load diabetes
loader = Loader(classname="weka.core.converters.ArffLoader")
fname = data_dir + os.sep + "diabetes.arff"
print("\nLoading dataset: " + fname + "\n")
data = loader.load_file(fname)
data.class_is_last()

for classifier in ["weka.classifiers.bayes.NaiveBayes", "weka.classifiers.rules.ZeroR", "weka.classifiers.trees.J48"]:
    # train/test split 90% using classifier
    cls = Classifier(classname=classifier)
    evl = Evaluation(data)
    evl.evaluate_train_test_split(cls, data, 90.0, Random(1))
    print("\n" + classifier + " train/test split (90%):\n" + evl.summary())
    cls.build_classifier(data)
    print(classifier + " model:\n\n" + str(cls))

# calculate mean/stdev over 10 cross-validations
for classifier in [
    "weka.classifiers.meta.ClassificationViaRegression", "weka.classifiers.bayes.NaiveBayes",
        "weka.classifiers.rules.ZeroR", "weka.classifiers.trees.J48", "weka.classifiers.functions.Logistic"]:
    accuracy = []
    for i in xrange(1,11):
        cls = Classifier(classname=classifier)
        evl = Evaluation(data)
        evl.crossvalidate_model(cls, data, 10, Random(i))
        accuracy.append(evl.percent_correct)
    nacc = numpy.array(accuracy)
    print("%s: %0.2f +/-%0.2f" % (classifier, numpy.mean(nacc), numpy.std(nacc)))
Exemple #41
0
from utilities import *
import weka.core.jvm as jvm

from weka.core.converters import Loader, Saver

from weka.classifiers import Classifier, Evaluation
from weka.core.classes import Random

jvm.start(max_heap_size="3072m")

loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file("./Dataset/trainGrid.arff")
data.class_is_last()

#classifier = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.25", "-M", "2"])
classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayes")

evaluation = Evaluation(data)
#evaluation.crossvalidate_model(classifier, data, 10, Random(42))
evaluation.evaluate_train_test_split(classifier, data, 66, Random(42))
res = evaluation.summary()
res += "\n" + evaluation.matrix()
#f = open('./Dataset/resultsGrid.txt', 'w')
#f.write(res)

print res

jvm.stop()
# Build classifier on training data
cls.build_classifier(train)
#       print(cls)

#import weka.plot.graph as graph  
#graph.plot_dot_graph(cls.graph)

from weka.classifiers import Evaluation
from weka.core.classes import Random
evl = Evaluation(train)
evl.crossvalidate_model(cls, train, 10, Random(1))

print ("Kappa Score")
print (evl.kappa) # 0.50 - Not bad
print ("Evaluation Summary")
print (evl.summary()) # Accuracy: 83%

##  Test model on new data ##

evl = Evaluation(test)

from weka.classifiers import PredictionOutput
pred_output = PredictionOutput(
classname="weka.classifiers.evaluation.output.prediction.PlainText", options=["-distribution"])

evl.crossvalidate_model(cls, test, 10, Random(1), pred_output)

# View complete summary of the selected model on test data
print(evl.summary())
# The kappa statistic is 45% in this case. Not surprising given the low number of instances.
# The accuracy is 84.3%, which is fair. 
Exemple #43
0
jvm.start()

# load cpu
loader = Loader(classname="weka.core.converters.ArffLoader")
fname = data_dir + os.sep + "cpu.arff"
print("\nLoading dataset: " + fname + "\n")
data = loader.load_file(fname)
data.class_is_last()

# cross-validate LinearRegression, display model
print("\n--> LinearRegression\n")
cls = Classifier(classname="weka.classifiers.functions.LinearRegression")
evl = Evaluation(data)
evl.crossvalidate_model(cls, data, 10, Random(1))
print("10-fold cross-validation:\n" + evl.summary())
cls.build_classifier(data)
print("Model:\n\n" + str(cls))

# cross-validate M5P, display model
print("\n--> M5P\n")
cls = Classifier(classname="weka.classifiers.trees.M5P")
evl = Evaluation(data)
evl.crossvalidate_model(cls, data, 10, Random(1))
print("10-fold cross-validation:\n" + evl.summary())
cls.build_classifier(data)
print("Model:\n\n" + str(cls))
plg.plot_dot_graph(cls.graph)

jvm.stop()
def classify_and_save(classifier, name, outfile):
    random.seed("ML349")

    csv_header = [
                    "Game Name",
                    "SteamID",
                    "Algorithm",
                    "Number Players",
                    "%Players of Training Set",
                    "Accuracy",
                    "Precision (0)",
                    "Recall (0)",
                    "F1 (0)",
                    "Precision (1)",
                    "Recall (1)",
                    "F1 (1)"
    ]
    game_results = []

    with open("data/games_by_username_all.csv", "r") as f:
        game_list = f.next().rstrip().split(",")

    loader = Loader(classname="weka.core.converters.ArffLoader")
    train = loader.load_file("data/final_train.arff")
    test = loader.load_file("data/final_test.arff")

    count = 0
    for i in itertools.chain(xrange(0, 50), random.sample(xrange(50, len(game_list)), 450)):
        train.class_index = i
        test.class_index = i
        count += 1

        classifier.build_classifier(train)

        evaluation = Evaluation(train)
        evaluation.test_model(classifier, test)

        confusion = evaluation.confusion_matrix
        num_players = sum(confusion[1])
        steam_id = repr(train.class_attribute).split(" ")[1]
        result = [
                    game_list[i],
                    steam_id,
                    name,
                    int(num_players),
                    num_players/1955,
                    evaluation.percent_correct,
                    evaluation.precision(0),
                    evaluation.recall(0),
                    evaluation.f_measure(0),
                    evaluation.precision(1),
                    evaluation.recall(1),
                    evaluation.f_measure(1)
        ]

        game_results.append(result)
        print "\nResult #{2}/500 for {0} (SteamID {1}):".format(game_list[i], steam_id, count),
        print evaluation.summary()

    with open(outfile, "wb") as f:
        csv_writer = csv.writer(f, delimiter=",")
        csv_writer.writerow(csv_header)
        for r in game_results:
            csv_writer.writerow(r)
Exemple #45
0
from weka.filters import Filter
# convert csv into arff format (weka compatable)
# use convertcsvtoarff.py file

# load arff file

loader = Loader("weka.core.converters.ArffLoader")
iris_data = loader.load_file("reviewsinformation_task2.arff")
iris_data.class_is_last()
loader = Loader("weka.core.converters.ArffLoader")
iris_data = loader.load_file(iris_file)
iris_data.class_is_last()

# kernel classifier
helper.print_title("Creating SMO as KernelClassifier")
kernel = Kernel(classname="weka.classifiers.functions.supportVector.RBFKernel", options=["-G", "0.001"])
classifier = KernelClassifier(classname="weka.classifiers.functions.SMO", options=["-M"])
classifier.kernel = kernel
classifier.build_classifier(iris_data)
print("classifier: " + classifier.to_commandline())
print("model:\n" + str(classifier))

#print("model:\n" + str(classifier))


evaluation = Evaluation('test_data.arff')
evaluation.crossvalidate_model(classifier, diabetes_data, 10, Random(42), output=pred_output)
print(evaluation.summary())
print(evaluation.class_details())
print(evaluation.matrix())
def main():
    """
    Just runs some example code.
    """

    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()

    # classifier help
    helper.print_title("Creating help string")
    classifier = Classifier(classname="weka.classifiers.trees.J48")
    print(classifier.to_help())

    # partial classname
    helper.print_title("Creating classifier from partial classname")
    clsname = ".J48"
    classifier = Classifier(classname=clsname)
    print(clsname + " --> " + classifier.classname)

    # classifier from commandline
    helper.print_title("Creating SMO from command-line string")
    cmdline = 'weka.classifiers.functions.SMO -K "weka.classifiers.functions.supportVector.NormalizedPolyKernel -E 3.0"'
    classifier = from_commandline(cmdline, classname="weka.classifiers.Classifier")
    classifier.build_classifier(iris_data)
    print("input: " + cmdline)
    print("output: " + classifier.to_commandline())
    print("model:\n" + str(classifier))

    # kernel classifier
    helper.print_title("Creating SMO as KernelClassifier")
    kernel = Kernel(classname="weka.classifiers.functions.supportVector.RBFKernel", options=["-G", "0.001"])
    classifier = KernelClassifier(classname="weka.classifiers.functions.SMO", options=["-M"])
    classifier.kernel = kernel
    classifier.build_classifier(iris_data)
    print("classifier: " + classifier.to_commandline())
    print("model:\n" + str(classifier))

    # build a classifier and output model
    helper.print_title("Training J48 classifier on iris")
    classifier = Classifier(classname="weka.classifiers.trees.J48")
    # Instead of using 'options=["-C", "0.3"]' in the constructor, we can also set the "confidenceFactor"
    # property of the J48 classifier itself. However, being of type float rather than double, we need
    # to convert it to the correct type first using the double_to_float function:
    classifier.set_property("confidenceFactor", typeconv.double_to_float(0.3))
    classifier.build_classifier(iris_data)
    print(classifier)
    print(classifier.graph)
    print(classifier.to_source("MyJ48"))
    plot_graph.plot_dot_graph(classifier.graph)

    # evaluate model on test set
    helper.print_title("Evaluating J48 classifier on iris")
    evaluation = Evaluation(iris_data)
    evl = evaluation.test_model(classifier, iris_data)
    print(evl)
    print(evaluation.summary())

    # evaluate model on train/test split
    helper.print_title("Evaluating J48 classifier on iris (random split 66%)")
    classifier = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.3"])
    evaluation = Evaluation(iris_data)
    evaluation.evaluate_train_test_split(classifier, iris_data, 66.0, Random(1))
    print(evaluation.summary())

    # load a dataset incrementally and build classifier incrementally
    helper.print_title("Build classifier incrementally on iris")
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_inc = loader.load_file(iris_file, incremental=True)
    iris_inc.class_is_last()
    classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayesUpdateable")
    classifier.build_classifier(iris_inc)
    for inst in loader:
        classifier.update_classifier(inst)
    print(classifier)

    # construct meta-classifiers
    helper.print_title("Meta classifiers")
    # generic FilteredClassifier instantiation
    print("generic FilteredClassifier instantiation")
    meta = SingleClassifierEnhancer(classname="weka.classifiers.meta.FilteredClassifier")
    meta.classifier = Classifier(classname="weka.classifiers.functions.LinearRegression")
    flter = Filter("weka.filters.unsupervised.attribute.Remove")
    flter.options = ["-R", "first"]
    meta.set_property("filter", flter.jobject)
    print(meta.to_commandline())
    # direct FilteredClassifier instantiation
    print("direct FilteredClassifier instantiation")
    meta = FilteredClassifier()
    meta.classifier = Classifier(classname="weka.classifiers.functions.LinearRegression")
    flter = Filter("weka.filters.unsupervised.attribute.Remove")
    flter.options = ["-R", "first"]
    meta.filter = flter
    print(meta.to_commandline())
    # generic Vote
    print("generic Vote instantiation")
    meta = MultipleClassifiersCombiner(classname="weka.classifiers.meta.Vote")
    classifiers = [
        Classifier(classname="weka.classifiers.functions.SMO"),
        Classifier(classname="weka.classifiers.trees.J48")
    ]
    meta.classifiers = classifiers
    print(meta.to_commandline())

    # cross-validate nominal classifier
    helper.print_title("Cross-validating NaiveBayes on diabetes")
    diabetes_file = helper.get_data_dir() + os.sep + "diabetes.arff"
    helper.print_info("Loading dataset: " + diabetes_file)
    loader = Loader("weka.core.converters.ArffLoader")
    diabetes_data = loader.load_file(diabetes_file)
    diabetes_data.class_is_last()
    classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
    pred_output = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText", options=["-distribution"])
    evaluation = Evaluation(diabetes_data)
    evaluation.crossvalidate_model(classifier, diabetes_data, 10, Random(42), output=pred_output)
    print(evaluation.summary())
    print(evaluation.class_details())
    print(evaluation.matrix())
    print("areaUnderPRC/0: " + str(evaluation.area_under_prc(0)))
    print("weightedAreaUnderPRC: " + str(evaluation.weighted_area_under_prc))
    print("areaUnderROC/1: " + str(evaluation.area_under_roc(1)))
    print("weightedAreaUnderROC: " + str(evaluation.weighted_area_under_roc))
    print("avgCost: " + str(evaluation.avg_cost))
    print("totalCost: " + str(evaluation.total_cost))
    print("confusionMatrix: " + str(evaluation.confusion_matrix))
    print("correct: " + str(evaluation.correct))
    print("pctCorrect: " + str(evaluation.percent_correct))
    print("incorrect: " + str(evaluation.incorrect))
    print("pctIncorrect: " + str(evaluation.percent_incorrect))
    print("unclassified: " + str(evaluation.unclassified))
    print("pctUnclassified: " + str(evaluation.percent_unclassified))
    print("coverageOfTestCasesByPredictedRegions: " + str(evaluation.coverage_of_test_cases_by_predicted_regions))
    print("sizeOfPredictedRegions: " + str(evaluation.size_of_predicted_regions))
    print("falseNegativeRate: " + str(evaluation.false_negative_rate(1)))
    print("weightedFalseNegativeRate: " + str(evaluation.weighted_false_negative_rate))
    print("numFalseNegatives: " + str(evaluation.num_false_negatives(1)))
    print("trueNegativeRate: " + str(evaluation.true_negative_rate(1)))
    print("weightedTrueNegativeRate: " + str(evaluation.weighted_true_negative_rate))
    print("numTrueNegatives: " + str(evaluation.num_true_negatives(1)))
    print("falsePositiveRate: " + str(evaluation.false_positive_rate(1)))
    print("weightedFalsePositiveRate: " + str(evaluation.weighted_false_positive_rate))
    print("numFalsePositives: " + str(evaluation.num_false_positives(1)))
    print("truePositiveRate: " + str(evaluation.true_positive_rate(1)))
    print("weightedTruePositiveRate: " + str(evaluation.weighted_true_positive_rate))
    print("numTruePositives: " + str(evaluation.num_true_positives(1)))
    print("fMeasure: " + str(evaluation.f_measure(1)))
    print("weightedFMeasure: " + str(evaluation.weighted_f_measure))
    print("unweightedMacroFmeasure: " + str(evaluation.unweighted_macro_f_measure))
    print("unweightedMicroFmeasure: " + str(evaluation.unweighted_micro_f_measure))
    print("precision: " + str(evaluation.precision(1)))
    print("weightedPrecision: " + str(evaluation.weighted_precision))
    print("recall: " + str(evaluation.recall(1)))
    print("weightedRecall: " + str(evaluation.weighted_recall))
    print("kappa: " + str(evaluation.kappa))
    print("KBInformation: " + str(evaluation.kb_information))
    print("KBMeanInformation: " + str(evaluation.kb_mean_information))
    print("KBRelativeInformation: " + str(evaluation.kb_relative_information))
    print("SFEntropyGain: " + str(evaluation.sf_entropy_gain))
    print("SFMeanEntropyGain: " + str(evaluation.sf_mean_entropy_gain))
    print("SFMeanPriorEntropy: " + str(evaluation.sf_mean_prior_entropy))
    print("SFMeanSchemeEntropy: " + str(evaluation.sf_mean_scheme_entropy))
    print("matthewsCorrelationCoefficient: " + str(evaluation.matthews_correlation_coefficient(1)))
    print("weightedMatthewsCorrelation: " + str(evaluation.weighted_matthews_correlation))
    print("class priors: " + str(evaluation.class_priors))
    print("numInstances: " + str(evaluation.num_instances))
    print("meanAbsoluteError: " + str(evaluation.mean_absolute_error))
    print("meanPriorAbsoluteError: " + str(evaluation.mean_prior_absolute_error))
    print("relativeAbsoluteError: " + str(evaluation.relative_absolute_error))
    print("rootMeanSquaredError: " + str(evaluation.root_mean_squared_error))
    print("rootMeanPriorSquaredError: " + str(evaluation.root_mean_prior_squared_error))
    print("rootRelativeSquaredError: " + str(evaluation.root_relative_squared_error))
    print("prediction output:\n" + str(pred_output))
    plot_cls.plot_roc(
        evaluation, title="ROC diabetes",
        class_index=range(0, diabetes_data.class_attribute.num_values), wait=False)
    plot_cls.plot_prc(
        evaluation, title="PRC diabetes",
        class_index=range(0, diabetes_data.class_attribute.num_values), wait=False)

    # load a numeric dataset
    bolts_file = helper.get_data_dir() + os.sep + "bolts.arff"
    helper.print_info("Loading dataset: " + bolts_file)
    loader = Loader("weka.core.converters.ArffLoader")
    bolts_data = loader.load_file(bolts_file)
    bolts_data.class_is_last()

    # build a classifier and output model
    helper.print_title("Training LinearRegression on bolts")
    classifier = Classifier(classname="weka.classifiers.functions.LinearRegression", options=["-S", "1", "-C"])
    classifier.build_classifier(bolts_data)
    print(classifier)

    # cross-validate numeric classifier
    helper.print_title("Cross-validating LinearRegression on bolts")
    classifier = Classifier(classname="weka.classifiers.functions.LinearRegression", options=["-S", "1", "-C"])
    evaluation = Evaluation(bolts_data)
    evaluation.crossvalidate_model(classifier, bolts_data, 10, Random(42))
    print(evaluation.summary())
    print("correlationCoefficient: " + str(evaluation.correlation_coefficient))
    print("errorRate: " + str(evaluation.error_rate))
    helper.print_title("Header - bolts")
    print(str(evaluation.header))
    helper.print_title("Predictions on bolts")
    for index, pred in enumerate(evaluation.predictions):
        print(str(index+1) + ": " + str(pred) + " -> error=" + str(pred.error))
    plot_cls.plot_classifier_errors(evaluation.predictions, wait=False)

    # learning curve
    cls = [
        Classifier(classname="weka.classifiers.trees.J48"),
        Classifier(classname="weka.classifiers.bayes.NaiveBayesUpdateable")]
    plot_cls.plot_learning_curve(
        cls, diabetes_data, increments=0.05, label_template="[#] !", metric="percent_correct", wait=True)

    # access classifier's Java API
    labor_file = helper.get_data_dir() + os.sep + "labor.arff"
    helper.print_info("Loading dataset: " + labor_file)
    loader = Loader("weka.core.converters.ArffLoader")
    labor_data = loader.load_file(labor_file)
    labor_data.class_is_last()

    helper.print_title("Using JRip's Java API to access rules")
    jrip = Classifier(classname="weka.classifiers.rules.JRip")
    jrip.build_classifier(labor_data)
    rset = jrip.jwrapper.getRuleset()
    for i in range(rset.size()):
        r = rset.get(i)
        print(str(r.toString(labor_data.class_attribute.jobject)))
dataSet20x50.class_is_last()

dataSet50x20 = loader.load_file("trainingSet/dataSet50x20.arff")
dataSet50x20.class_is_last()

classifier1 = Classifier(classname="weka.classifiers.functions.MultilayerPerceptron", options=["-L", "0.3", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E", "20", "-H", "9"])
classifier2 = Classifier(classname="weka.classifiers.functions.MultilayerPerceptron", options=["-L", "0.3", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E", "20", "-H", "11"])
classifier3 = Classifier(classname="weka.classifiers.functions.MultilayerPerceptron", options=["-L", "0.3", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E", "20", "-H", "9"])

print "\n\nTraining neural network 1"
evaluation1 = Evaluation(dataSet20x20)                   
evaluation1.crossvalidate_model(classifier1, dataSet20x20, 10, Random(42))
classifier1.build_classifier(dataSet20x20)
serialization.write("trainingSet/nn1.model", classifier1)
print "\n\n====================================================== NUERAL NETWORK 1 ======================================================"
print(evaluation1.summary())
print(evaluation1.class_details())

print "Training neural network 2"
evaluation2 = Evaluation(dataSet20x50) 
evaluation2.crossvalidate_model(classifier2, dataSet20x50, 10, Random(42))
classifier2.build_classifier(dataSet20x50)
serialization.write("trainingSet/nn2.model", classifier2)
print "\n\n====================================================== NUERAL NETWORK 2 ======================================================"
print(evaluation2.summary())
print(evaluation2.class_details())

print "Training neural network 3"
evaluation3 = Evaluation(dataSet50x20) 
evaluation3.crossvalidate_model(classifier3, dataSet50x20, 10, Random(42))
classifier3.build_classifier(dataSet50x20)
def main():
    """
    Just runs some example code.
    """

    # load a dataset
    data_file = helper.get_data_dir() + os.sep + "vote.arff"
    helper.print_info("Loading dataset: " + data_file)
    loader = Loader("weka.core.converters.ArffLoader")
    data = loader.load_file(data_file)
    data.class_is_last()

    # classifier
    classifier = Classifier(classname="weka.classifiers.trees.J48")

    # randomize data
    folds = 10
    seed = 1
    rnd = Random(seed)
    rand_data = Instances.copy_instances(data)
    rand_data.randomize(rnd)
    if rand_data.class_attribute.is_nominal:
        rand_data.stratify(folds)

    # perform cross-validation and add predictions
    predicted_data = None
    evaluation = Evaluation(rand_data)
    for i in xrange(folds):
        train = rand_data.train_cv(folds, i)
        # the above code is used by the StratifiedRemoveFolds filter,
        # the following code is used by the Explorer/Experimenter
        # train = rand_data.train_cv(folds, i, rnd)
        test = rand_data.test_cv(folds, i)

        # build and evaluate classifier
        cls = Classifier.make_copy(classifier)
        cls.build_classifier(train)
        evaluation.test_model(cls, test)

        # add predictions
        addcls = Filter(
            classname="weka.filters.supervised.attribute.AddClassification",
            options=["-classification", "-distribution", "-error"])
        # setting the java object directory avoids issues with correct quoting in option array
        addcls.set_property("classifier", Classifier.make_copy(classifier))
        addcls.inputformat(train)
        addcls.filter(train)  # trains the classifier
        pred = addcls.filter(test)
        if predicted_data is None:
            predicted_data = Instances.template_instances(pred, 0)
        for n in xrange(pred.num_instances):
            predicted_data.add_instance(pred.get_instance(n))

    print("")
    print("=== Setup ===")
    print("Classifier: " + classifier.to_commandline())
    print("Dataset: " + data.relationname)
    print("Folds: " + str(folds))
    print("Seed: " + str(seed))
    print("")
    print(evaluation.summary("=== " + str(folds) + " -fold Cross-Validation ==="))
    print("")
    print(predicted_data)