def experiment_file_random(path_features, path_folder_save_results, options,
                           classifier, fold, random, name):
    """Cross-validate one classifier on one feature file and save the outcome.

    Two files are written below ``path_folder_save_results``:

    * ``<name>.csv`` -- a one-row table with the percentage of correctly and
      incorrectly classified instances and the textual confusion matrix.
    * ``prediction/<name>.csv`` -- the per-instance predictions collected by
      the Weka CSV prediction output during cross-validation.

    Args:
        path_features: file handed to ``converters.load_any_file``; its last
            attribute is used as the class.
        path_folder_save_results: folder receiving the two output files.
        options: classifier option string, split with
            ``weka.core.classes.split_options``.
        classifier: Weka class name of the classifier to evaluate.
        fold: number of cross-validation folds.
        random: seed passed to ``Random`` for the fold assignment.
        name: base name (converted with ``str``) of the output files.
    """
    print("start weka")
    learner = Classifier(classname=classifier,
                         options=weka.core.classes.split_options(options))

    dataset = converters.load_any_file(path_features)
    dataset.class_is_last()

    prediction_sink = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.CSV")
    evaluation = Evaluation(dataset)
    evaluation.crossvalidate_model(learner, dataset, fold, Random(random),
                                   prediction_sink)

    # Single-row summary table: one list entry per column.
    summary = pd.DataFrame(data={
        'percent_correct': [evaluation.percent_correct],
        'percent_incorrect': [evaluation.percent_incorrect],
        'confusion_matrix': [evaluation.matrix()],
    })

    output_name = str(name) + '.csv'
    summary.to_csv(path_folder_save_results + '/' + output_name, index=False)

    predictions_text = prediction_sink.buffer_content()
    prediction_path = (path_folder_save_results + '/' + 'prediction/' +
                       output_name)
    with open(prediction_path, 'w') as handle:
        handle.write(predictions_text)
Exemple #2
0
def case2():
    """Evaluate the classifier ``cls`` on an ARFF file chosen by the user.

    The test file name is read interactively; the last attribute of the
    loaded data is used as the class and the resulting confusion matrix is
    printed. ``cls`` is not defined in this function and must already exist
    in the enclosing (module) scope.
    """
    arff_loader = Loader(classname="weka.core.converters.ArffLoader")
    test_path = input("Enter the name of the test file:")
    test_set = arff_loader.load_file(test_path)
    test_set.class_is_last()

    evaluator = Evaluation(test_set)
    evaluator.test_model(cls, test_set)
    print(evaluator.matrix("=== (confusion matrix) ==="))
def case2():
    """Load a serialized model and evaluate it on a user-chosen ARFF file.

    Both the model file name and the test file name are read interactively.
    The last attribute of the test data is used as the class, and the
    confusion matrix of the evaluation is printed.
    """
    arff_loader = Loader(classname="weka.core.converters.ArffLoader")

    model_path = input("Enter the name of the  model file:")
    restored_model = Classifier(jobject=serialization.read(model_path))

    test_path = input("Enter the name of the test file:")
    test_set = arff_loader.load_file(test_path)
    test_set.class_is_last()

    evaluator = Evaluation(test_set)
    evaluator.test_model(restored_model, test_set)
    print(evaluator.matrix("=== (confusion matrix) ==="))
def experiment_more_file(path_files, path_folder_save_results, fold, options,
                         classifier, random, name):
    """Cross-validate one classifier on every CSV file found in a folder.

    For each entry of ``path_files`` whose name contains ``".csv"`` the data
    is loaded, the last attribute is used as the class, and a ``fold``-fold
    cross-validation is run. Per-file predictions are written to
    ``<path_folder_save_results>/prediction/<name><file stem>pred_data.csv``
    and a summary table with one row per file is written to
    ``<path_folder_save_results>/<name>.csv``.

    Args:
        path_files: folder that is scanned for input files.
        path_folder_save_results: folder receiving the output files.
        fold: number of cross-validation folds.
        options: classifier option string, split with
            ``weka.core.classes.split_options``.
        classifier: Weka class name of the classifier to evaluate.
        random: seed passed to ``Random`` for the fold assignment.
        name: prefix / base name (converted with ``str``) of the outputs.
    """
    cls = Classifier(classname=classifier,
                     options=weka.core.classes.split_options(options))

    # Build a filtered list instead of calling remove() on the list being
    # iterated: removing during iteration skips the entry that follows each
    # removed one, so non-CSV names could slip through.
    file_list = [file_name for file_name in os.listdir(path_files)
                 if ".csv" in file_name]

    d_results = {
        'name_file': [],
        'percent_correct': [],
        'percent_incorrect': [],
        'confusion_matrix': []
    }

    print(file_list)

    for file_name in file_list:
        print(str(file_name))
        data = converters.load_any_file(path_files + "/" + file_name)

        data.class_is_last()

        # A fresh prediction buffer per file keeps the outputs separate.
        pout = PredictionOutput(
            classname="weka.classifiers.evaluation.output.prediction.CSV")

        evl = Evaluation(data)

        evl.crossvalidate_model(cls, data, fold, Random(random), pout)

        d_results['name_file'].append(str(file_name))
        d_results['percent_correct'].append(evl.percent_correct)
        d_results['percent_incorrect'].append(evl.percent_incorrect)
        d_results['confusion_matrix'].append(
            evl.matrix())  # Generates the confusion matrix.

        save = pout.buffer_content()

        # [:-4] drops the last four characters of the file name (the
        # ".csv" suffix for names that end with it).
        with open(
                path_folder_save_results + '/' + 'prediction/' + str(name) +
                str(file_name)[:-4] + 'pred_data.csv', 'w') as f:
            f.write(save)

    d_results = pd.DataFrame(data=d_results)

    d_results.to_csv(path_folder_save_results + '/' + str(name) + ".csv",
                     index=False)
Exemple #5
0
def CV5x2(dataset,  algo, num_datasets):
    """Run a 2-fold cross-validation of ``algo`` on an ARFF file.

    The data is loaded with the Weka ARFF loader, the last attribute is used
    as the class, and the classifier is cross-validated with 2 folds and a
    ``Random(5)`` seed. The summary, the confusion matrix and the area under
    the ROC curve for class index 1 are printed.

    Args:
        dataset: path of the ARFF file to load.
        algo: Weka class name of the classifier.
        num_datasets: not used by this function.

    Returns:
        The area under the ROC curve for class index 1.
    """
    # The body is indented consistently with spaces: the original mixed tabs
    # and spaces, which Python 3 rejects with a TabError.
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(dataset)
    data.class_is_last()

    cls = Classifier(classname=algo)

    evl = Evaluation(data)
    evl.crossvalidate_model(cls, data, 2, Random(5))

    # Query the metric once and reuse it for both the report and the result.
    roc_area = evl.area_under_roc(1)

    print(evl.summary("=== " +str(algo)+ " on" + str(dataset) + " ===",False))
    print(evl.matrix("=== on click prediction(confusion matrix) ==="))
    print("For Algo"+ str(algo)+"areaUnderROC/1: for CV5x2 " + str(roc_area))

    return roc_area
Exemple #6
0
def HOV(dataset,  algo, num_datasets):
    """Run a hold-out validation of ``algo`` on an ARFF file.

    The data is loaded with the Weka ARFF loader, the last attribute is used
    as the class, and the data is split with
    ``train_test_split(70.0, Random(10))``. The classifier is built on the
    training part and evaluated on the test part. The summary, the confusion
    matrix and the area under the ROC curve for class index 1 are printed.

    Args:
        dataset: path of the ARFF file to load.
        algo: Weka class name of the classifier.
        num_datasets: not used by this function.

    Returns:
        The area under the ROC curve for class index 1.
    """
    # The body is indented consistently with spaces: the original mixed tabs
    # and spaces, which Python 3 rejects with a TabError.
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(dataset)
    data.class_is_last()

    train, test = data.train_test_split(70.0, Random(10))

    cls = Classifier(classname=algo)
    cls.build_classifier(train)

    # The evaluation is initialised from the training data and then run on
    # the held-out test data.
    evl = Evaluation(train)
    evl.test_model(cls, test)

    # Query the metric once and reuse it for both the report and the result.
    roc_area = evl.area_under_roc(1)

    print(evl.summary("=== " +str(algo)+ " on" + str(dataset) + " ===",False))
    print(evl.matrix("=== on click prediction(confusion matrix) ==="))
    print("For Algo"+ str(algo)+"areaUnderROC/1: for HOV " + str(roc_area))

    return roc_area
def TrainingModel(arff, modelOutput, clsfier):
    """Train a Weka classifier on an ARFF file, report metrics and save it.

    The JVM is started, the training set is loaded with its first attribute
    as the class, and the classifier ``weka.classifiers.<clsfier>`` is built
    on it. A ``FilteredClassifier`` wrapping that classifier is then
    evaluated with 10-fold cross-validation (seed 1). Accuracy, summary,
    per-class details and the confusion matrix are printed, followed by
    TPR, TNR, PPV and NPV derived from the top-left 2x2 cells of the
    confusion matrix. Finally the built classifier is serialized to
    ``modelOutput`` together with the training header and the JVM is stopped.

    Args:
        arff: path of the ARFF training file.
        modelOutput: path the trained model is serialized to.
        clsfier: classifier name relative to the ``weka.classifiers.``
            package.
    """
    # Start the Java virtual machine.
    jvm.start()

    # Load the training set.
    arff_loader = Loader(classname="weka.core.converters.ArffLoader")
    training_set = arff_loader.load_file(arff)
    training_set.class_is_first()

    # Author's note: RandomForest was chosen for training because, after
    # trying several approaches in the Weka GUI, it gave the higher TPR/TNR.
    base_classifier = Classifier(classname="weka.classifiers." + clsfier)
    base_classifier.build_classifier(training_set)
    print(base_classifier)

    # Build the model that is cross-validated.
    wrapped = FilteredClassifier()
    wrapped.classifier = base_classifier
    evaluation = Evaluation(training_set)
    evaluation.crossvalidate_model(wrapped, training_set, 10, Random(1))
    print(evaluation.percent_correct)
    print(evaluation.summary())
    print(evaluation.class_details())
    print(evaluation.matrix())

    # Result statistics, read from the confusion matrix cells.
    cells = evaluation.confusion_matrix
    true_neg = float(cells[0][0])
    false_pos = float(cells[0][1])
    false_neg = float(cells[1][0])
    true_pos = float(cells[1][1])
    # All four ratios are computed before anything is printed.
    rates = [
        ("敏感度 TPR: ", true_pos / (true_pos + false_neg)),
        ("特异度 TNR: ", true_neg / (false_pos + true_neg)),
        ("PPV: ", true_pos / (true_pos + false_pos)),
        ("NPV: ", true_neg / (true_neg + false_neg)),
    ]
    print("算法: " + clsfier)
    for caption, value in rates:
        print(caption + str(value))

    # Save the model.
    base_classifier.serialize(modelOutput, header=training_set)

    # Shut down the virtual machine.
    jvm.stop()
    print("分析模型建立完成")
def main():
    """
    Just runs some example code.

    Walks through a sequence of classifier demonstrations using datasets
    taken from ``helper.get_data_dir()`` (iris, diabetes, bolts, labor):
    creating classifiers (by class name, partial class name, command line,
    with a kernel), training and evaluating J48, incremental training,
    meta-classifier construction, cross-validation of a nominal and a
    numeric classifier with their statistics and plots, a learning curve,
    and access to JRip's rules through the Java API. Everything is printed
    or plotted; nothing is returned.
    """

    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()

    # classifier help
    helper.print_title("Creating help string")
    classifier = Classifier(classname="weka.classifiers.trees.J48")
    print(classifier.to_help())

    # partial classname
    helper.print_title("Creating classifier from partial classname")
    clsname = ".J48"
    classifier = Classifier(classname=clsname)
    print(clsname + " --> " + classifier.classname)

    # classifier from commandline
    helper.print_title("Creating SMO from command-line string")
    cmdline = 'weka.classifiers.functions.SMO -K "weka.classifiers.functions.supportVector.NormalizedPolyKernel -E 3.0"'
    classifier = from_commandline(cmdline, classname="weka.classifiers.Classifier")
    classifier.build_classifier(iris_data)
    print("input: " + cmdline)
    print("output: " + classifier.to_commandline())
    print("model:\n" + str(classifier))

    # kernel classifier
    helper.print_title("Creating SMO as KernelClassifier")
    kernel = Kernel(classname="weka.classifiers.functions.supportVector.RBFKernel", options=["-G", "0.001"])
    classifier = KernelClassifier(classname="weka.classifiers.functions.SMO", options=["-M"])
    classifier.kernel = kernel
    classifier.build_classifier(iris_data)
    print("classifier: " + classifier.to_commandline())
    print("model:\n" + str(classifier))

    # build a classifier and output model
    helper.print_title("Training J48 classifier on iris")
    classifier = Classifier(classname="weka.classifiers.trees.J48")
    # Instead of using 'options=["-C", "0.3"]' in the constructor, we can also set the "confidenceFactor"
    # property of the J48 classifier itself. However, being of type float rather than double, we need
    # to convert it to the correct type first using the double_to_float function:
    classifier.set_property("confidenceFactor", typeconv.double_to_float(0.3))
    classifier.build_classifier(iris_data)
    print(classifier)
    print(classifier.graph)
    print(classifier.to_source("MyJ48"))
    plot_graph.plot_dot_graph(classifier.graph)

    # evaluate model on test set
    # (the "test set" here is the same iris data the classifier was trained on)
    helper.print_title("Evaluating J48 classifier on iris")
    evaluation = Evaluation(iris_data)
    evl = evaluation.test_model(classifier, iris_data)
    print(evl)
    print(evaluation.summary())

    # evaluate model on train/test split
    helper.print_title("Evaluating J48 classifier on iris (random split 66%)")
    classifier = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.3"])
    evaluation = Evaluation(iris_data)
    evaluation.evaluate_train_test_split(classifier, iris_data, 66.0, Random(1))
    print(evaluation.summary())

    # load a dataset incrementally and build classifier incrementally
    helper.print_title("Build classifier incrementally on iris")
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_inc = loader.load_file(iris_file, incremental=True)
    iris_inc.class_is_last()
    classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayesUpdateable")
    classifier.build_classifier(iris_inc)
    # iterating the loader yields the instances, which are fed to the
    # classifier one at a time
    for inst in loader:
        classifier.update_classifier(inst)
    print(classifier)

    # construct meta-classifiers
    helper.print_title("Meta classifiers")
    # generic FilteredClassifier instantiation
    print("generic FilteredClassifier instantiation")
    meta = SingleClassifierEnhancer(classname="weka.classifiers.meta.FilteredClassifier")
    meta.classifier = Classifier(classname="weka.classifiers.functions.LinearRegression")
    flter = Filter("weka.filters.unsupervised.attribute.Remove")
    flter.options = ["-R", "first"]
    # the generic wrapper has no "filter" attribute, so the filter's Java
    # object is set through the property interface
    meta.set_property("filter", flter.jobject)
    print(meta.to_commandline())
    # direct FilteredClassifier instantiation
    print("direct FilteredClassifier instantiation")
    meta = FilteredClassifier()
    meta.classifier = Classifier(classname="weka.classifiers.functions.LinearRegression")
    flter = Filter("weka.filters.unsupervised.attribute.Remove")
    flter.options = ["-R", "first"]
    meta.filter = flter
    print(meta.to_commandline())
    # generic Vote
    print("generic Vote instantiation")
    meta = MultipleClassifiersCombiner(classname="weka.classifiers.meta.Vote")
    classifiers = [
        Classifier(classname="weka.classifiers.functions.SMO"),
        Classifier(classname="weka.classifiers.trees.J48")
    ]
    meta.classifiers = classifiers
    print(meta.to_commandline())

    # cross-validate nominal classifier
    helper.print_title("Cross-validating NaiveBayes on diabetes")
    diabetes_file = helper.get_data_dir() + os.sep + "diabetes.arff"
    helper.print_info("Loading dataset: " + diabetes_file)
    loader = Loader("weka.core.converters.ArffLoader")
    diabetes_data = loader.load_file(diabetes_file)
    diabetes_data.class_is_last()
    classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
    pred_output = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText", options=["-distribution"])
    evaluation = Evaluation(diabetes_data)
    evaluation.crossvalidate_model(classifier, diabetes_data, 10, Random(42), output=pred_output)
    print(evaluation.summary())
    print(evaluation.class_details())
    print(evaluation.matrix())
    # dump of the individual statistics; per-class ones take a class index
    print("areaUnderPRC/0: " + str(evaluation.area_under_prc(0)))
    print("weightedAreaUnderPRC: " + str(evaluation.weighted_area_under_prc))
    print("areaUnderROC/1: " + str(evaluation.area_under_roc(1)))
    print("weightedAreaUnderROC: " + str(evaluation.weighted_area_under_roc))
    print("avgCost: " + str(evaluation.avg_cost))
    print("totalCost: " + str(evaluation.total_cost))
    print("confusionMatrix: " + str(evaluation.confusion_matrix))
    print("correct: " + str(evaluation.correct))
    print("pctCorrect: " + str(evaluation.percent_correct))
    print("incorrect: " + str(evaluation.incorrect))
    print("pctIncorrect: " + str(evaluation.percent_incorrect))
    print("unclassified: " + str(evaluation.unclassified))
    print("pctUnclassified: " + str(evaluation.percent_unclassified))
    print("coverageOfTestCasesByPredictedRegions: " + str(evaluation.coverage_of_test_cases_by_predicted_regions))
    print("sizeOfPredictedRegions: " + str(evaluation.size_of_predicted_regions))
    print("falseNegativeRate: " + str(evaluation.false_negative_rate(1)))
    print("weightedFalseNegativeRate: " + str(evaluation.weighted_false_negative_rate))
    print("numFalseNegatives: " + str(evaluation.num_false_negatives(1)))
    print("trueNegativeRate: " + str(evaluation.true_negative_rate(1)))
    print("weightedTrueNegativeRate: " + str(evaluation.weighted_true_negative_rate))
    print("numTrueNegatives: " + str(evaluation.num_true_negatives(1)))
    print("falsePositiveRate: " + str(evaluation.false_positive_rate(1)))
    print("weightedFalsePositiveRate: " + str(evaluation.weighted_false_positive_rate))
    print("numFalsePositives: " + str(evaluation.num_false_positives(1)))
    print("truePositiveRate: " + str(evaluation.true_positive_rate(1)))
    print("weightedTruePositiveRate: " + str(evaluation.weighted_true_positive_rate))
    print("numTruePositives: " + str(evaluation.num_true_positives(1)))
    print("fMeasure: " + str(evaluation.f_measure(1)))
    print("weightedFMeasure: " + str(evaluation.weighted_f_measure))
    print("unweightedMacroFmeasure: " + str(evaluation.unweighted_macro_f_measure))
    print("unweightedMicroFmeasure: " + str(evaluation.unweighted_micro_f_measure))
    print("precision: " + str(evaluation.precision(1)))
    print("weightedPrecision: " + str(evaluation.weighted_precision))
    print("recall: " + str(evaluation.recall(1)))
    print("weightedRecall: " + str(evaluation.weighted_recall))
    print("kappa: " + str(evaluation.kappa))
    print("KBInformation: " + str(evaluation.kb_information))
    print("KBMeanInformation: " + str(evaluation.kb_mean_information))
    print("KBRelativeInformation: " + str(evaluation.kb_relative_information))
    print("SFEntropyGain: " + str(evaluation.sf_entropy_gain))
    print("SFMeanEntropyGain: " + str(evaluation.sf_mean_entropy_gain))
    print("SFMeanPriorEntropy: " + str(evaluation.sf_mean_prior_entropy))
    print("SFMeanSchemeEntropy: " + str(evaluation.sf_mean_scheme_entropy))
    print("matthewsCorrelationCoefficient: " + str(evaluation.matthews_correlation_coefficient(1)))
    print("weightedMatthewsCorrelation: " + str(evaluation.weighted_matthews_correlation))
    print("class priors: " + str(evaluation.class_priors))
    print("numInstances: " + str(evaluation.num_instances))
    print("meanAbsoluteError: " + str(evaluation.mean_absolute_error))
    print("meanPriorAbsoluteError: " + str(evaluation.mean_prior_absolute_error))
    print("relativeAbsoluteError: " + str(evaluation.relative_absolute_error))
    print("rootMeanSquaredError: " + str(evaluation.root_mean_squared_error))
    print("rootMeanPriorSquaredError: " + str(evaluation.root_mean_prior_squared_error))
    print("rootRelativeSquaredError: " + str(evaluation.root_relative_squared_error))
    print("prediction output:\n" + str(pred_output))
    # ROC and PRC plots cover every value of the class attribute
    plot_cls.plot_roc(
        evaluation, title="ROC diabetes",
        class_index=range(0, diabetes_data.class_attribute.num_values), wait=False)
    plot_cls.plot_prc(
        evaluation, title="PRC diabetes",
        class_index=range(0, diabetes_data.class_attribute.num_values), wait=False)

    # load a numeric dataset
    bolts_file = helper.get_data_dir() + os.sep + "bolts.arff"
    helper.print_info("Loading dataset: " + bolts_file)
    loader = Loader("weka.core.converters.ArffLoader")
    bolts_data = loader.load_file(bolts_file)
    bolts_data.class_is_last()

    # build a classifier and output model
    helper.print_title("Training LinearRegression on bolts")
    classifier = Classifier(classname="weka.classifiers.functions.LinearRegression", options=["-S", "1", "-C"])
    classifier.build_classifier(bolts_data)
    print(classifier)

    # cross-validate numeric classifier
    helper.print_title("Cross-validating LinearRegression on bolts")
    classifier = Classifier(classname="weka.classifiers.functions.LinearRegression", options=["-S", "1", "-C"])
    evaluation = Evaluation(bolts_data)
    evaluation.crossvalidate_model(classifier, bolts_data, 10, Random(42))
    print(evaluation.summary())
    print("correlationCoefficient: " + str(evaluation.correlation_coefficient))
    print("errorRate: " + str(evaluation.error_rate))
    helper.print_title("Header - bolts")
    print(str(evaluation.header))
    helper.print_title("Predictions on bolts")
    # predictions are numbered starting at 1 in the output
    for index, pred in enumerate(evaluation.predictions):
        print(str(index+1) + ": " + str(pred) + " -> error=" + str(pred.error))
    plot_cls.plot_classifier_errors(evaluation.predictions, wait=False)

    # learning curve
    # NOTE(review): this is the only plot called with wait=True; presumably
    # it blocks until the plot windows are closed -- confirm.
    cls = [
        Classifier(classname="weka.classifiers.trees.J48"),
        Classifier(classname="weka.classifiers.bayes.NaiveBayesUpdateable")]
    plot_cls.plot_learning_curve(
        cls, diabetes_data, increments=0.05, label_template="[#] !", metric="percent_correct", wait=True)

    # access classifier's Java API
    labor_file = helper.get_data_dir() + os.sep + "labor.arff"
    helper.print_info("Loading dataset: " + labor_file)
    loader = Loader("weka.core.converters.ArffLoader")
    labor_data = loader.load_file(labor_file)
    labor_data.class_is_last()

    helper.print_title("Using JRip's Java API to access rules")
    jrip = Classifier(classname="weka.classifiers.rules.JRip")
    jrip.build_classifier(labor_data)
    # jwrapper exposes the underlying Java object's methods directly
    rset = jrip.jwrapper.getRuleset()
    for i in range(rset.size()):
        r = rset.get(i)
        print(str(r.toString(labor_data.class_attribute.jobject)))
Exemple #9
0
def experiment_sequential_file(path_indices, path_features,
                               path_folder_save_results, options, classifier,
                               name, indicator_col, images):
    """Evaluate a classifier on consecutive row blocks of one feature file.

    Block boundaries come from the archive loaded from ``path_indices``
    (the array of the last entry in the archive is used, shifted by one to
    get 1-based row numbers) with the number of rows of the data appended as
    the final boundary. For every block, the classifier is trained on all
    rows outside the block and tested on the block itself.

    Outputs written below ``path_folder_save_results``:

    * ``prediction/<name>pred_data.csv`` -- the accumulated prediction
      output of all blocks.
    * whatever ``create_prediction`` produces in ``prediction/``.
    * ``<name>results.csv`` -- one row of metrics per block.

    Args:
        path_indices: file handed to ``load``; must expose ``.files``.
        path_features: file handed to ``converters.load_any_file``.
        path_folder_save_results: folder receiving the outputs.
        options: classifier option string, split with
            ``weka.core.classes.split_options``.
        classifier: Weka class name of the classifier.
        name: prefix of the output file names.
        indicator_col: forwarded unchanged to ``create_prediction``.
        images: forwarded unchanged to ``create_prediction``.
    """
    index_archive = load(path_indices)
    # Every entry is visited, but only the last one's array survives.
    for entry_name in index_archive.files:
        ind = index_archive[entry_name] + 1

    cls = Classifier(classname=classifier,
                     options=weka.core.classes.split_options(options))

    data = converters.load_any_file(path_features)

    # The row count closes the final block.
    ind = np.append(ind, len(data))

    data.class_is_last()

    # One shared buffer: predictions of all blocks accumulate in it.
    pout = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.CSV")

    d_results = {
        'index': [],
        'percent_correct': [],
        'percent_incorrect': [],
        'precision': [],
        'recall': [],
        'f-score': [],
        'confusion_matrix': []
    }

    block_count = len(ind) - 1
    for j in range(block_count):
        is_final_block = (j == block_count - 1)
        first = ind[j]
        # Every block but the final one stops one row before the next start.
        last = ind[j + 1] if is_final_block else ind[j + 1] - 1

        d_test = data.subset(row_range='%s-%s' % (first, last))

        if j == 0:  # first block: train on everything after it
            train_rows = '%s-%s' % (last + 1, ind[-1])
            d_train = data.subset(row_range=train_rows)
            print(train_rows)
        elif is_final_block:  # final block: train on everything before it
            train_rows = '1-%s' % (first - 1)
            d_train = data.subset(row_range=train_rows)
            print(train_rows)
        else:  # central block: train on both sides
            train_rows = '1-%s,%s-%s' % (first - 1, last + 1, ind[-1])
            print(train_rows)
            d_train = data.subset(row_range=train_rows)

        cls.build_classifier(d_train)

        evl = Evaluation(data)
        evl.test_model(cls, d_test, pout)

        d_results['index'].append(str(ind[j]))
        d_results['percent_correct'].append(evl.percent_correct)
        d_results['percent_incorrect'].append(evl.percent_incorrect)
        d_results['precision'].append(evl.precision(1))
        d_results['recall'].append(evl.recall(1))
        d_results['f-score'].append(evl.f_measure(1))
        d_results['confusion_matrix'].append(
            evl.matrix())  # Generates the confusion matrix.

    predictions_text = pout.buffer_content()

    check_folder_or_create(path_folder_save_results + '/' + 'prediction')

    prediction_csv = (path_folder_save_results + '/' + 'prediction/' + name +
                      'pred_data.csv')
    with open(prediction_csv, 'w') as handle:
        handle.write(predictions_text)

    # Read the dump back without a header row; columns are addressed by
    # position.
    prediction_table = pd.read_csv(prediction_csv,
                                   index_col=False,
                                   header=None)

    create_prediction(prediction_table[1], prediction_table[2],
                      prediction_table[3], indicator_col, images, name,
                      path_folder_save_results + '/prediction/')

    results_table = pd.DataFrame(data=d_results)
    results_table.to_csv(
        path_folder_save_results + '/' + name + 'results.csv', index=False)
Exemple #10
0
def experiment_more_file(path_files,
                         path_folder_save_results,
                         fold,
                         options,
                         classifier,
                         random,
                         name,
                         voting=False):
    """Cross-validate one classifier on every CSV file found in a folder.

    For each entry of ``path_files`` whose name contains ``".csv"``:

    * the indicator table
      ``<path_files>/indicator/<first character of the file name>_indicator.csv``
      is read (columns ``indicator`` and ``image``),
    * the data is loaded with its last attribute as the class and a
      ``fold``-fold cross-validation is run,
    * the prediction output is written to
      ``<path_folder_save_results>/<name>/prediction/pred_data.csv`` (the
      same file is rewritten for every input file), read back, and its
      ``actual`` / ``predicted`` / ``error`` columns are forwarded to
      ``create_prediction``.

    A summary table with one row per file is finally written to
    ``<path_folder_save_results>/<name>.csv`` (with the DataFrame index).

    Args:
        path_files: folder that is scanned for input files.
        path_folder_save_results: folder receiving the outputs.
        fold: number of cross-validation folds.
        options: classifier option string, split with
            ``weka.core.classes.split_options``.
        classifier: Weka class name of the classifier to evaluate.
        random: seed passed to ``Random`` for the fold assignment.
        name: sub-folder / base name of the outputs.
        voting: not used by this function.
    """
    cls = Classifier(classname=classifier,
                     options=weka.core.classes.split_options(options))

    # Build a filtered list instead of calling remove() on the list being
    # iterated: removing during iteration skips the entry that follows each
    # removed one, so non-CSV names could slip through.
    file_list = [file_name for file_name in os.listdir(path_files)
                 if ".csv" in file_name]

    d_results = {
        'name_file': [],
        'percent_correct': [],
        'percent_incorrect': [],
        'precision': [],
        'recall': [],
        'f-score': [],
        'confusion_matrix': []
    }

    for file_name in file_list:
        # The indicator table is selected by the first character of the
        # data file's name.
        indicator_table = pd.read_csv(path_files + '/indicator/' +
                                      file_name[0] + '_indicator.csv')
        indicator = list(indicator_table['indicator'])
        images = list(indicator_table['image'])

        data = converters.load_any_file(path_files + "/" + file_name)

        data.class_is_last()

        # A fresh prediction buffer per file keeps the outputs separate.
        pout = PredictionOutput(
            classname="weka.classifiers.evaluation.output.prediction.CSV")

        evl = Evaluation(data)

        evl.crossvalidate_model(cls, data, fold, Random(random), pout)

        d_results['name_file'].append(str(file_name))
        d_results['percent_correct'].append(evl.percent_correct)
        d_results['percent_incorrect'].append(evl.percent_incorrect)
        d_results['precision'].append(evl.precision(1))
        d_results['recall'].append(evl.recall(1))
        d_results['f-score'].append(evl.f_measure(1))
        d_results['confusion_matrix'].append(
            evl.matrix())  # Generates the confusion matrix.

        save = pout.buffer_content()

        check_folder_or_create(path_folder_save_results + '/' + name + '/' +
                               'prediction')

        with open(
                path_folder_save_results + '/' + name + '/' +
                'prediction/pred_data.csv', 'w') as f:
            f.write(save)

        buffer_save = pd.read_csv(path_folder_save_results + '/' + name + '/' +
                                  'prediction/pred_data.csv',
                                  index_col=False)

        col_label = buffer_save['actual']
        col_prediction = buffer_save['predicted']
        col_different = buffer_save['error']

        # file_name[:-4] drops the last four characters (the ".csv" suffix
        # for names that end with it).
        create_prediction(
            col_label, col_prediction, col_different, indicator, images,
            file_name[:-4],
            path_folder_save_results + '/' + name + '/prediction/')

    d_results = pd.DataFrame(data=d_results)

    d_results.to_csv(path_folder_save_results + '/' + str(name) + ".csv")
Exemple #11
0
def process_classifier(runType, cls, occ, devList, fewCats, label, subtract):
	global devCount
	global save_orig
	global save_subtract
	conf_matrix = {}

	if occ:
		table = 'temp_dat_occ_vector_occ'
	else:
		table = 'temp_dat_occ_vector_2'

	writeStr = '=========================================================================================\n' + \
		'Running ' + runType + ' classifier for \'' + label + '\''
	sys.stdout.write(writeStr + '\r')
	total_conf.write(writeStr + '\n')
	sys.stdout.flush()

	if runType == 'unseen':
		i = 0
		indiv_results = {}
		for dev in devList:
			devCount += 1
			remaining = chop_microseconds(((datetime.utcnow() - item_start)*totalDevs/devCount)-(datetime.utcnow() - item_start))
			sys.stdout.write('Running ' + runType + ' classifier for \'' + label + '\' - ' + \
				str(round(100*float(devCount)/totalDevs,2)) + ' pct complete (' + str(remaining) + ' remaining)                 \r')
			sys.stdout.flush()

			if fewCats:
				aws_c.execute('select * from ' + table + ' ' \
					'where duty!=0 and deviceMAC not in (select * from vector_reject) ' \
					'and deviceMAC in (select * from id_fewcats_mac) '
					'and deviceMAC!=\'' + dev + '\';')
			else:
				aws_c.execute('select * from ' + table + ' ' \
					'where duty!=0 and deviceMAC not in (select * from vector_reject) ' \
					'and deviceMAC!=\'' + dev + '\';')
			results = aws_c.fetchall()

			# Generate type list
			total_types = ['{']
			for data in results:
				if(data[-1] not in total_types):
					total_types.append('\"')
					total_types.append(data[-1])
					total_types.append('\"')
					total_types.append(',')
			total_types[-1] = '}'
			typeStr = ''.join(total_types)

			arff_train = label + '_' + dev + '_train'
			arff_test = label + '_' + dev + '_test'

			gen_arff(arff_train, typeStr, results, occ, arff_idcol)

			if fewCats:
				aws_c.execute('select * from ' + table + ' ' \
					'where duty!=0 and deviceMAC not in (select * from vector_reject) ' \
					'and deviceMAC in (select * from id_fewcats_mac) '
					'and deviceMAC=\'' + dev + '\';')
			else:
				aws_c.execute('select * from ' + table + ' ' \
					'where duty!=0 and deviceMAC not in (select * from vector_reject) ' \
					'and deviceMAC=\'' + dev + '\';')
			gen_arff(arff_test, typeStr, aws_c.fetchall(), occ, arff_idcol)

			train = loader.load_file(arff_train + '.arff')
			train.class_is_last()
			mv(arff_train + '.arff', master_saveDir)
			test = loader.load_file(arff_test + '.arff')
			test.class_is_last()
			mv(arff_test + '.arff', master_saveDir)

			cls.build_classifier(train)

			# output predictions
			testName = ''
			predictions = []
			for index, inst in enumerate(test):
				if testName != '':
					if testName != inst.get_string_value(inst.class_index):
						print(str(testName) + ' ' + str(inst.get_string_value(inst.class_index)))
						exit()
					else:
						testName = inst.get_string_value(inst.class_index)	
				else:
					testName = inst.get_string_value(inst.class_index)

				if testName not in conf_matrix:
					conf_matrix[testName] = {}

				pred = cls.classify_instance(inst)
				# dist = cls.distribution_for_instance(inst)
				# if(pred == inst.get_value(inst.class_index)):
				predName = inst.class_attribute.value(int(pred))
				if predName not in conf_matrix[testName]:
					conf_matrix[testName][predName] = 0
				conf_matrix[testName][predName] += 1
				predictions.append(predName)

			total = 0
			if testName != '':
				for predName in conf_matrix[testName]:
					if predName == testName:
						correct = conf_matrix[testName][predName]
						total += correct
					else:
						total += conf_matrix[testName][predName]


			# while (len(predictions) * 2) <= 100:
			# 	predictions += pyrandom.sample(predictions, len(predictions))
			# if len(predictions) < 100:
			# 	predictions += pyrandom.sample(predictions, 100 - len(predictions))

			lots_predictions = []
			while len(lots_predictions) < 10000:
				lots_predictions += pyrandom.sample(predictions, 1)

			#indiv_results[dev] = [testName, pyrandom.sample(predictions, 100)]

			indiv_results[dev] = [testName, lots_predictions]

			# while len(predictions) < 100:
			# 	predictions += pyrandom.sample(predictions, 1)

			# indiv_results[dev] = [testName, predictions]

			# indiv_results[dev] = [testName, predictions]

			# Prep to print the how-many-days graph
			# days_output.write('\n\n\"' + dev + '\"\n')


			
			#print(str(testName) + ' ' + str(correct) + ' ' + str(total) + ' ' + str(float(correct)/total))

			# i += 1
			# if i == 10:
			# 	break


		correct, total = print_conf_matrix(conf_matrix, sys.stdout, False, False, False)
		correct, total = print_conf_matrix(conf_matrix, total_conf, False, False, False)

		if subtract == 'orig':
			save_orig = copy.deepcopy(conf_matrix)
		elif subtract == 'subtract':
			save_subtract = copy.deepcopy(conf_matrix)

		final_result = round(100*float(correct)/total,2)

		writeStr = '\nCorrectly Classified Instances\t\t' + str(correct) + '\t\t' + str(final_result) + '\n' + \
			'Incorrectly Classified Instances\t' + str(total-correct) + '\t\t' + str(round(100*float(total-correct)/total,2)) + '\n' + \
			'Total Number of Instances\t\t' + str(total) + '\n'
		print(writeStr)
		total_conf.write(writeStr + '\n')

		conf_interval = 10
		total_instances = float(sum([sum([conf_matrix[test][pred] for pred in conf_matrix[test]]) for test in conf_matrix]))

		p_d = {}
		p_e = {}
		p_e_given_d = {}
		for testName in conf_matrix:
			count_d = float(sum([conf_matrix[testName][label] for label in conf_matrix[testName]]))
			p_d[testName] = count_d / total_instances
			p_e[testName] = float(sum([conf_matrix[label][testName] for label in conf_matrix if testName in conf_matrix[label]]) / total_instances)
			p_e_given_d[testName] = {}

			for predName in conf_matrix:
				if predName in conf_matrix[testName]:
					p_e_given_d[testName][predName] = conf_matrix[testName][predName] / count_d
				else:
					p_e_given_d[testName][predName] = 0

		confidence = open('confidence.dat', 'w')
		for testName in conf_matrix:
			confidence.write('\n\n\"' + testName + '\"\n')
			print(testName)

			for classEvents in range(1, (conf_interval+1)):
				numerator = math.pow(p_e_given_d[testName][testName], classEvents) * p_d[testName]
				demoninator = 0
				for otherName in conf_matrix:
					demoninator += math.pow(p_e_given_d[otherName][testName], classEvents) * p_d[otherName]
				confidence.write(str(classEvents) + '\t' + str(numerator/demoninator) + '\n')
				print(str(classEvents) + '\t' + str(numerator/demoninator)) 
			print('')

		for predName in p_e_given_d['Router/Modem']:
			print('P( ' + predName + ' | Router/Modem ):\t' + str(p_e_given_d['Router/Modem'][predName]))

		for predName in p_e_given_d['Cable Box']:
			print('P( ' + predName + ' | Cable Box ):\t' + str(p_e_given_d['Cable Box'][predName]))

		#router = open('router', 'w')
		print('Router Stuff:')
		routerDev = 'Router/Modem'
		lampDev = 'Lamp'
		cableDev = 'Cable Box'
		origClassList = ['Router/Modem', 'Cable Box', 'Lamp', 'Router/Modem', 'Router/Modem', 'Cable Box', 'Router/Modem', 'Lamp', 'Router/Modem']

		classListList =  [['Router/Modem'] + list(listItem) for listItem in set(itertools.permutations(origClassList))]

		classListList = [
			['Router/Modem', 'Router/Modem', 'Lamp', 'Lamp', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Cable Box', 'Router/Modem'],
			['Router/Modem', 'Cable Box', 'Lamp', 'Lamp', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Cable Box', 'Router/Modem'],
			['Router/Modem', 'Router/Modem', 'Lamp', 'Lamp', 'Cable Box', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Cable Box', 'Router/Modem'],
			['Router/Modem', 'Cable Box', 'Lamp', 'Lamp', 'Cable Box', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Cable Box', 'Router/Modem'],
			['Router/Modem', 'Cable Box', 'Router/Modem', 'Lamp', 'Cable Box', 'Router/Modem', 'Router/Modem', 'Lamp', 'Router/Modem', 'Router/Modem'],
			['Router/Modem', 'Router/Modem', 'Router/Modem', 'Lamp', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Lamp', 'Router/Modem', 'Router/Modem'],
			['Router/Modem', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Lamp']
		]

		for idClass, classList in enumerate(classListList):
			print(idClass)
			for classEvents in range(1, (conf_interval+1)):
				numerator_router = p_d[routerDev]
				numerator_lamp = p_d[lampDev]
				numerator_cable = p_d[cableDev]
				for idx, classInst in enumerate(classList):
					if idx < classEvents:
						numerator_router *= p_e_given_d[routerDev][classInst]
						numerator_lamp *= p_e_given_d[lampDev][classInst]
						numerator_cable *= p_e_given_d[cableDev][classInst]
				demoninator = 0
				for otherName in conf_matrix:
					obsValue = p_d[otherName]
					for idx, classInst in enumerate(classList):
						if idx < classEvents:
							obsValue *= p_e_given_d[otherName][classInst]
					demoninator += obsValue
				print(str(classEvents) + '\t' + str(numerator_router/demoninator) + '\t' + str(numerator_lamp/demoninator) + '\t' + str(numerator_cable/demoninator) + '\t\"' + classList[classEvents-1]) + '\"'
			print('')

		numberDevList(indiv_results)

		eachDev = open('indiv_results.dat', 'w')
		newIDStream = open('new_id.dat', 'w')
		for devItem in indiv_results:
			print_obsResults(conf_matrix, conf_interval, p_d, p_e, p_e_given_d, indiv_results[devItem], eachDev, devItem, newIDStream)
		print('')
		print('total devices: ' + str(len(indiv_results)))
		# print('total devices: ' + str(total_devices))
		# print('total correct: ' + str(total_correct))
		# print('  pct correct: ' + str(round(100*float(total_correct)/total_devices,2)) + '\n')

		print('initial confidence: ' + str(round(100*float(sum(initial_confidence))/len(initial_confidence),2)))
		print('initial accuracy: ' + str(round(100*float(sum(initial_accuracy))/len(initial_accuracy),2)) + '\n')

		# print('final confidence (correct): ' + str(round(100*float(sum(final_confidence_correct))/len(final_confidence_correct),2)))
		# print('final confidence (correct): ' + str(round(100*float(sum(final_confidence_incorrect))/len(final_confidence_incorrect),2)))
		# print('final accuracy: ' + str(round(100*float(total_correct)/total_devices,2)))

		for devType in final_accuracy:
			print('final accuracy ' + devType + ' : ' + str(round(float(sum(final_accuracy[devType]))/len(final_accuracy[devType]),6)))
			print('final confidence (correct) ' + devType + ' : ' + str(round(float(sum(final_confidence_correct[devType]))/len(final_confidence_correct[devType]),6)))
			if len(final_confidence_incorrect[devType]) > 0:
				print('final confidence (incorrect) ' + devType + ' : ' + str(round(float(sum(final_confidence_incorrect[devType]))/len(final_confidence_incorrect[devType]),6)))
			else:
				print('final confidence (incorrect) ' + devType + ' : ' + str(0))
			print('final confidence ' + devType + ' : ' + str(round(float(sum(final_confidence_correct[devType])+sum(final_confidence_incorrect[devType]))/(len(final_confidence_correct[devType])+len(final_confidence_incorrect[devType])),2)))

		print_conf_matrix(new_conf_matrix, sys.stdout, False, False, False)

		for topType in actual_confidence_matrix:
			for botType in actual_confidence_matrix[topType]:
				storeArray = actual_confidence_matrix[topType][botType]
				if len(storeArray) > 0:
					actual_confidence_matrix[topType][botType] = round(sum(storeArray)/len(storeArray),2)
				else:
					actual_confidence_matrix[topType][botType] = 0

		print_conf_matrix(conf_matrix, sys.stdout, False, False, False)
		print_conf_matrix(actual_confidence_matrix, sys.stdout, False, False, False)
		print_conf_matrix(actual_confidence_matrix, sys.stdout, True, False, True)

		for devType in acc_over_time_dev:
			printOverTime(devType, acc_over_time_dev[devType], conf_over_time_dev[devType])
		printOverTime('total', acc_over_time, conf_over_time)

	elif runType == 'seen':
		if fewCats:
			aws_c.execute('select * from ' + table + ' ' \
				'where duty!=0 and deviceMAC not in (select * from vector_reject) ' \
				'and deviceMAC in (select * from id_fewcats_mac);')
		else:
			aws_c.execute('select * from ' + table + ' ' \
				'where duty!=0 and deviceMAC not in (select * from vector_reject);')
		results = aws_c.fetchall()

		devCount += 1
		remaining = chop_microseconds(((datetime.utcnow() - item_start)*totalDevs/devCount)-(datetime.utcnow() - item_start))
		sys.stdout.write('Running ' + runType + ' classifier for \'' + label + '\' - ' + \
			str(round(100*float(devCount)/totalDevs,2)) + ' pct complete (' + str(remaining) + ' remaining)                 \r')
		sys.stdout.flush()

		# Generate type list
		total_types = ['{']
		for data in results:
			if(data[-1] not in total_types):
				total_types.append('\"')
				total_types.append(data[-1])
				total_types.append('\"')
				total_types.append(',')
		total_types[-1] = '}'
		typeStr = ''.join(total_types)

		arff_file = label + '_train'

		gen_arff(arff_file, typeStr, results, occ, arff_idcol)

		train = loader.load_file(arff_file + '.arff')
		train.class_is_last()
		mv(arff_file + '.arff', master_saveDir)

		cls.build_classifier(train)

		evl = Evaluation(train)
		evl.crossvalidate_model(cls, train, 10, Random(1))

		print('\n')
		#print(evl.percent_correct)
		#print(evl.class_details())
		print(evl.matrix())
		total_conf.write('\n' + evl.matrix())
		print(evl.summary())
		total_conf.write(evl.summary() + '\n')

		final_result = round(evl.percent_correct, 2)

	else:
		success = []
		for startDev in devList:
			for changeToDev in devList:
				if startDev != changeToDev:
					devCount += 1
					remaining = chop_microseconds(((datetime.utcnow() - item_start)*totalDevs/devCount)-(datetime.utcnow() - item_start))
					sys.stdout.write('Running ' + runType + ' classifier for \'' + label + '\' - ' + \
						str(round(100*float(devCount)/totalDevs,2)) + ' pct complete (' + str(remaining) + ' remaining)                 \r')
					sys.stdout.flush()
					
					aws_c.execute('select * from temp_dat_occ_vector_2 ' \
						'where duty!=0 and deviceMAC in (\'' + startDev + '\',\'' + changeToDev + '\');')
					results = [x[:-1] + (x[1],) for x in aws_c.fetchall()]	# Class label is just the deviceMAC

					if len(results) > 10:

						# Generate type list
						typeStr = '{' + startDev + ',' + changeToDev + '}'

						arff_file = label + '_' + startDev + '_' + changeToDev + '_train'

						gen_arff(arff_file, typeStr, results, occ, arff_idcol)

						train = loader.load_file(arff_file + '.arff')
						train.class_is_last()
						mv(arff_file + '.arff', master_saveDir)

						cls.build_classifier(train)

						evl = Evaluation(train)
						evl.crossvalidate_model(cls, train, 10, Random(1))

						print('\n')
						#print(evl.percent_correct)
						#print(evl.class_details())
						print(evl.matrix())
						total_conf.write('\n' + evl.matrix())
						print(evl.summary())
						total_conf.write(evl.summary() + '\n')

						success.append(evl.percent_correct)

		if len(success) > 0:
			final_result = [sum(success)/len(success), percentile(success, 5), percentile(success, 10), percentile(success, 95)]
		else:
			final_result = False

	if label in total_results:
		print('Warning label ' + label + ' exists twice, overwriting...')
	if final_result != False:
		total_results[label] = final_result
# Use the last attribute of the already-loaded dataset as the class label.
# NOTE(review): `data` is loaded outside this fragment — confirm its source.
data.class_is_last()

"""Naive Bayes Classifier for Bug Prediction"""

classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
# Prediction collector handed to the cross-validation below.
# NOTE(review): "-distribution" presumably adds the per-class distribution to each prediction — confirm.
pred_output = PredictionOutput(classname="weka.classifiers.evaluation.output.prediction.PlainText", options=["-distribution"])
evaluation = Evaluation(data)
# 10-fold cross-validation with a fixed seed (42) so runs are repeatable.
evaluation.crossvalidate_model(classifier, data, 10, Random(42), output=pred_output)
# Plot ROC and precision/recall curves for every class label; wait=False is
# passed so the call is not expected to block on the plot window.
plot_cls.plot_roc(evaluation, title="ROC bugs",class_index=range(0, data.class_attribute.num_values), wait=False)
plot_cls.plot_prc(evaluation, title="PRC bugs - NaiveBayes",class_index=range(0, data.class_attribute.num_values), wait=False)

"""Performance Metrics - Naive Bayes Classifier"""

print(evaluation.summary())
print(evaluation.class_details())
print(evaluation.matrix())

# Per-class metrics are requested for class index 1 only.
print("confusionMatrix: " + str(evaluation.confusion_matrix))
print("fMeasure: " + str(evaluation.f_measure(1)))
print("precision: " + str(evaluation.precision(1)))
print("recall: " + str(evaluation.recall(1)))

"""Random Forest Classifier"""

# Same protocol (10 folds, seed 42) with a RandomForest; no prediction
# output object is attached this time.
classifier2 = Classifier(classname="weka.classifiers.trees.RandomForest")
evaluation2 = Evaluation(data)
evaluation2.crossvalidate_model(classifier2, data, 10, Random(42))
plot_cls.plot_roc(evaluation2, title="ROC bugs",class_index=range(0, data.class_attribute.num_values), wait=False)
plot_cls.plot_prc(evaluation2, title="PRC bugs - RandomForest",class_index=range(0, data.class_attribute.num_values), wait=False)

"""Performance Evaluation Metrics - Random Forest"""
Exemple #13
0
from utilities import *
import weka.core.jvm as jvm

from weka.core.converters import Loader, Saver

from weka.classifiers import Classifier, Evaluation
from weka.core.classes import Random

# Evaluate NaiveBayes on ./Dataset/trainGrid.arff with a 66% train / 34% test
# split and print the summary followed by the confusion matrix.
#
# The JVM is started with a 3 GB heap; everything after the start runs inside
# try/finally so the JVM is always stopped, even when loading or evaluation
# raises.
jvm.start(max_heap_size="3072m")
try:
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file("./Dataset/trainGrid.arff")
    # The class label is the last attribute of the ARFF file.
    data.class_is_last()

    # Alternative learner kept for reference:
    #   Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.25", "-M", "2"])
    classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayes")

    evaluation = Evaluation(data)
    # Alternative protocol kept for reference:
    #   evaluation.crossvalidate_model(classifier, data, 10, Random(42))
    evaluation.evaluate_train_test_split(classifier, data, 66, Random(42))
    res = evaluation.summary()
    res += "\n" + evaluation.matrix()

    # Parenthesised single-argument form prints identically on Python 2 and 3
    # (the original `print res` is a syntax error on Python 3).
    print(res)
finally:
    jvm.stop()
def CrossValidateFullDataset():
    """Cross-validate classifiers on ``test/full_dataset.csv`` (10 folds).

    The dataset is loaded, its last attribute is used as the class, and the
    attributes matching the identifier regex are passed through
    ``FilterAttribute`` (presumably removing them -- confirm against the
    helper). For every attribute-group combination produced by ``powerset``
    that leaves at least one group in use, each configured classifier is
    evaluated with 10-fold cross-validation (seed 1) and the summary and
    confusion matrix are printed.

    Returns:
        None. All results are written to stdout.
    """
    data_dir = "test/"
    print("Loading Dataset...")
    data = converters.load_any_file(data_dir + "full_dataset.csv")
    print("Dataset Loaded!")

    # Set class attribute
    data.class_is_last()

    # Classifiers to evaluate; others can be enabled by uncommenting.
    cls_classes = [
        #"weka.classifiers.trees.J48",
        "weka.classifiers.trees.RandomForest",
        #"weka.classifiers.lazy.IBk"
    ]
    classifiers = [Classifier(classname=cls_name) for cls_name in cls_classes]

    # Regexes for attribute selection
    # (useful for testing different combinations of attributes).
    identifier_att = ".*id.*"
    rmNoise_att = "rmNoise.*"
    # Other groups used in earlier experiments: "raw.*", "doppler.*",
    # "phase.*", "music.*", "beamform.*", "music_sliding.*", "music_agg.*",
    # "music_angles.*"
    att_set = [rmNoise_att]

    # Remove the instance identifier attribute before any evaluation.
    data = FilterAttribute(identifier_att, data)

    for att_comb in powerset(att_set):
        # Skip the combination that removes every attribute group. Checking
        # this first avoids filtering data that is then thrown away.
        if not set(att_set) - set(att_comb):
            continue

        # Apply each regex of the combination in turn. The original guarded
        # this with `len(att) != len(att_set)`, which compared the length of
        # the regex *string* with the number of groups and could silently
        # skip a filter; the skip above is the real safeguard.
        data_filtered = data
        for att in att_comb:
            data_filtered = FilterAttribute(att, data_filtered)

        print(att_set)
        print(att_comb)
        print(colored("======================================================", 'green'))
        print(colored("Full attribute set: " + str(att_set), 'green'))
        print(colored("Removed attributes: " + str(att_comb), 'red'))
        if len(att_comb) > 0:
            print(colored("Using attributes: " + str(list(set(att_set) - set(att_comb))), 'green'))
        print(colored("======================================================", 'green'))

        print(data_dir)

        for cls_name, cls in zip(cls_classes, classifiers):
            evl = Evaluation(data_filtered)
            evl.crossvalidate_model(cls, data_filtered, 10, Random(1))

            print(colored("=> 10x cross-validation for " + cls_name, 'red'))
            print(evl.summary())
            print(evl.matrix())
def ClassifyTestSet():
    """Train on ``Testbed/training_dataset.csv`` and test on the held-out set.

    Models are evaluated for different combinations of feature groups, and
    several classifiers are used for each combination. For every combination
    yielded by ``powerset`` the matching attribute groups are passed through
    ``FilterAttribute`` on both datasets (presumably removing them -- confirm
    against the helper), each classifier is built on the filtered training
    set, and its summary and confusion matrix on the filtered test set are
    printed.

    The combination that would remove every feature group is skipped, the
    same way ``CrossValidateFullDataset`` skips it.

    Returns:
        None. All results are written to stdout.
    """
    data_dir = "Testbed/"
    training_path = data_dir + "training_dataset.csv"
    testing_path = data_dir + "testing_dataset.csv"

    # Report the raw line count of each CSV; `with` closes the handles, which
    # the original left open.
    for csv_path in (training_path, testing_path):
        with open(csv_path, "r") as handle:
            print(len(handle.readlines()))

    training = converters.load_any_file(training_path)
    training.class_is_last()

    testing = converters.load_any_file(testing_path)
    testing.class_is_last()  # set class attribute to be the last one listed

    # Choose classifiers to use
    cls_classes = [
        "weka.classifiers.trees.RandomForest",
        "weka.classifiers.trees.J48",
        "weka.classifiers.lazy.IBk",
    ]
    classifiers = [Classifier(classname=cls_name) for cls_name in cls_classes]

    # Regexes for attribute selection
    # (useful for testing different combinations of attributes).
    identifier_att = ".*id.*"
    timeseries_att = "Mic.*"
    doppler_att = "doppler.*"
    phase_att = "phase.*"
    music_att = "music.*"
    beamform_att = "beamform.*"
    att_set = [timeseries_att, doppler_att, phase_att, music_att, beamform_att]

    # Remove the instance identifier attribute from both datasets.
    training = FilterAttribute(identifier_att, training)
    testing = FilterAttribute(identifier_att, testing)

    for att_comb in powerset(att_set):
        # Nothing is left to learn from once every group is removed.
        if len(att_comb) == len(att_set):
            continue

        training_filtered = training
        testing_filtered = testing

        # The original guarded this with `len(att) != len(att_set)`, i.e. the
        # length of the regex string against the number of groups. Because
        # "Mic.*" is five characters long and there are five groups, the
        # time-series attributes were never filtered. Every regex in the
        # combination is applied now.
        for att in att_comb:
            training_filtered = FilterAttribute(att, training_filtered)
            testing_filtered = FilterAttribute(att, testing_filtered)

        print(colored("======================================================", 'green'))
        print(colored("Full attribute set: " + str(att_set), 'green'))
        print(colored("Removed attributes: " + str(att_comb), 'green'))
        print(colored("======================================================", 'green'))

        for cls_name, cls in zip(cls_classes, classifiers):
            cls.build_classifier(training_filtered)

            # Initialise the evaluation from the same filtered training set
            # the model was built on (the original passed the unfiltered set).
            evl = Evaluation(training_filtered)
            evl.test_model(cls, testing_filtered)

            print(colored("=> Testing for " + cls_name, 'red'))
            print(evl.summary())
            print(evl.matrix())
Exemple #16
0
# Compare a default J48 tree with an unpruned J48 on the glass dataset.
# Both variants use 10-fold cross-validation with the same seed (1).
# NOTE(review): `data_dir`, `os`, `plg` and the Weka classes come from outside
# this fragment, and no matching jvm.stop() is visible here — confirm.
jvm.start()

# load glass and use its last attribute as the class
fname = data_dir + os.sep + "glass.arff"
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file(fname)
data.class_is_last()

# cross-validate default J48
print("\nDefault J48")
cls = Classifier(classname="weka.classifiers.trees.J48")
evl = Evaluation(data)
evl.crossvalidate_model(cls, data, 10, Random(1))
print(evl.summary())
print(evl.matrix())

# build the model on the full dataset and plot its graph
cls.build_classifier(data)
plg.plot_dot_graph(cls.graph)

# cross-validate unpruned J48 with larger leaf size
# (options: -U together with -M 15, as passed below)
print("\nUnpruned J48 (minNumObj=15)")
cls = Classifier(classname="weka.classifiers.trees.J48",
                 options=["-U", "-M", "15"])
evl = Evaluation(data)
evl.crossvalidate_model(cls, data, 10, Random(1))
print(evl.summary())
print(evl.matrix())

# build and plot model (the snippet is cut off here; the code that followed is not present)
Exemple #17
0
# Build an SMO classifier with an RBF kernel, then cross-validate it.
#
# NOTE(review): the dataset is loaded twice and the first result is discarded
# immediately; `iris_file`, `diabetes_data`, `pred_output` and `helper` are
# defined outside this fragment. Confirm which file is actually intended.
loader = Loader("weka.core.converters.ArffLoader")
iris_data = loader.load_file("reviewsinformation_task2.arff")
iris_data.class_is_last()
loader = Loader("weka.core.converters.ArffLoader")
iris_data = loader.load_file(iris_file)
iris_data.class_is_last()

# kernel classifier: SMO using an RBF kernel configured with -G 0.001
helper.print_title("Creating SMO as KernelClassifier")
kernel = Kernel(classname="weka.classifiers.functions.supportVector.RBFKernel",
                options=["-G", "0.001"])
classifier = KernelClassifier(classname="weka.classifiers.functions.SMO",
                              options=["-M"])
classifier.kernel = kernel
classifier.build_classifier(iris_data)
print("classifier: " + classifier.to_commandline())
print("model:\n" + str(classifier))

# Evaluation must be initialised from a dataset object, not a file name: the
# original passed the string 'test_data.arff'. Use the dataset that is
# actually cross-validated so header and class information match.
evaluation = Evaluation(diabetes_data)
# 10-fold cross-validation with a fixed seed; predictions go to pred_output.
evaluation.crossvalidate_model(classifier,
                               diabetes_data,
                               10,
                               Random(42),
                               output=pred_output)
print(evaluation.summary())
print(evaluation.class_details())
print(evaluation.matrix())
Exemple #18
0
                                  sep=',',
                                  index=False)
    smote_test_data = convert.load_any_file(
        filename=GENERATED_SMOTE_TEST_DATA_FILE_PATH)
    smote_test_data.class_is_first()

    # load logistic model tree algorithm
    log_tree = Classifier(classname="weka.classifiers.trees.LMT")
    eval_smote_test_obj = Evaluation(smote_test_data)
    eval_smote_test_obj.crossvalidate_model(classifier=log_tree,
                                            data=smote_test_data,
                                            num_folds=5,
                                            rnd=Random(1))
    print("SMOTE Test CV (5-folds) Error = %.2f%%" %
          (eval_smote_test_obj.percent_incorrect))
    print(eval_smote_test_obj.matrix())
    print("=================\"Summary\"====================")
    print(eval_smote_test_obj.summary())

    log_tree.build_classifier(smote_test_data)
    y_predict = eval_smote_test_obj.test_model(log_tree, smote_test_data)

    y_test = to_binary_numeric(y_test.head(500), classNeg="neg")

    falsePositiveRate, truePositiveRate, thresholds = roc_curve(
        y_test, y_predict)
    # compute Area Under the Curve (AUC) using the trapezoidal rule
    area = auc(falsePositiveRate, truePositiveRate)

    plt.plot(falsePositiveRate,
             truePositiveRate,
    export_test_data.to_csv(GENERATED_TEST_DATA_FILE_PATH,
                            sep=',',
                            index=False)
    test_data = convert.load_any_file(filename=GENERATED_TEST_DATA_FILE_PATH)
    test_data.class_is_first()

    # load logistic model tree algorithm
    log_tree = Classifier(classname="weka.classifiers.trees.LMT")
    eval_test_obj = Evaluation(test_data)
    eval_test_obj.crossvalidate_model(classifier=log_tree,
                                      data=test_data,
                                      num_folds=5,
                                      rnd=Random(1))
    print("Test CV (10-folds) Error = %.2f%%" %
          (eval_test_obj.percent_incorrect))
    print(eval_test_obj.matrix())
    print("=================\"Summary\"====================")
    print(eval_test_obj.summary())

    log_tree.build_classifier(test_data)
    y_predict = eval_test_obj.test_model(log_tree, test_data)

    y_test = to_binary_numeric(y_test.head(500), classNeg="neg")

    falsePositiveRate, truePositiveRate, thresholds = roc_curve(y_test,
                                                                y_predict,
                                                                pos_label=0)
    # compute Area Under the Curve (AUC) using the trapezoidal rule
    area = auc(falsePositiveRate, truePositiveRate)

    plt.plot(falsePositiveRate,
Exemple #20
0
def ClassifyParam(website, mode, binWidths, truncation_modes=["full", "truncated"]):
    """Evaluate a Multinomial Naive Bayes text classifier per bin width.

    For each truncation mode a CSV named
    ``classificationResults/SingleWebsite_<truncation>_<website>.csv`` is
    written with one row per bin width: accuracy, false-positive rate and
    false-negative rate of the class with index 1. Training time,
    single-sample classification time and batch evaluation time are printed
    along the way.

    Args:
        website: Name of the site; selects ``Data/<website>/arff/`` and is
            part of the result file name.
        mode: Container or string checked for ``"normal"``; nothing beyond
            creating the results directory happens when it is absent.
        binWidths: Iterable of bin widths; each one selects a
            ``TrainSet_<truncation>_<binWidth>.arff`` / ``TestSet_...`` pair.
        truncation_modes: Truncation variants to process. The default list is
            only iterated, never mutated.

    Returns:
        None.
    """
    if not os.path.exists("classificationResults"):
        os.makedirs("classificationResults")

    if "normal" not in mode:
        return

    for truncation in truncation_modes:
        results_path = "classificationResults/SingleWebsite_%s_%s.csv" % (truncation, website)
        # `with` guarantees the CSV is flushed and closed even if loading or
        # evaluation fails part-way through the bin widths.
        with open(results_path, "w") as results_file:
            results_file.write("BinWidth, Accuracy, FalsePositiveRate, FalseNegativeRate\n")

            for binWidth in binWidths:

                train_set_file = "TrainSet_%s_%s.arff" % (truncation, binWidth)
                train_set = "Data/%s/arff/%s" % (website, train_set_file)
                test_set = "Data/%s/arff/%s" % (website, train_set_file.replace("TrainSet", "TestSet"))

                print("Loading Datasets...")
                print("Train: " + train_set)
                train_data = converters.load_any_file(train_set)
                print("Test: " + test_set)
                test_data = converters.load_any_file(test_set)

                # Set class attribute
                train_data.class_is_last()
                test_data.class_is_last()
                print("Dataset Loaded!")

                classifier_name = "weka.classifiers.meta.FilteredClassifier"

                # StringToWordVector feeds NaiveBayesMultinomial; the option
                # strings are passed to Weka verbatim.
                classifier = Classifier(classname=classifier_name, options=[
                    "-F", "weka.filters.unsupervised.attribute.StringToWordVector -R first-last -W 1000 -C -T -N 1 -stemmer weka.core.stemmers.NullStemmer -M 1 -tokenizer \"weka.core.tokenizers.WordTokenizer -delimiters \\\" \\\\r\\\\n\\\\t.,;:\\\\\\\'\\\\\\\"()?!\\\"\"",
                    "-W", "weka.classifiers.bayes.NaiveBayesMultinomial"])

                start_train = time.time()
                classifier.build_classifier(train_data)
                end_train = time.time()
                print("Train\t%s\t%s" % (binWidth, end_train - start_train))

                # Time the classification of the first test instance only.
                # The original kept iterating over the remaining instances
                # without doing anything; stop after the first one.
                for inst in test_data:
                    start_sample = time.time()
                    classifier.classify_instance(inst)
                    end_sample = time.time()
                    print("Sample\t%s\t%s" % (binWidth, end_sample - start_sample))
                    break

                print("Evaluating w/ Multinomial Naive Bayes classifier. BinWidth = %s" % (binWidth))
                evaluation = Evaluation(test_data)
                start_batch = time.time()
                evaluation.test_model(classifier, test_data)
                end_batch = time.time()
                print("Batch\t%s\t%s" % (binWidth, end_batch - start_batch))

                print(evaluation.summary())
                print(evaluation.matrix())

                # Just as an example, we're measuring the fpr and fnr of the
                # website indexed as class 1.
                tp = evaluation.num_true_positives(1)
                tn = evaluation.num_true_negatives(1)
                fp = evaluation.num_false_positives(1)
                fn = evaluation.num_false_negatives(1)

                acc = (tp + tn) / float(tp + tn + fp + fn)
                fpr = evaluation.false_positive_rate(1)
                fnr = evaluation.false_negative_rate(1)

                print("Accuracy: %s" % (acc))
                print("False Positive Rate: %s" % (fpr))
                print("False Negative Rate: %s" % (fnr))

                results_file.write("%s, %s, %s, %s\n" % (binWidth, acc, fpr, fnr))
Exemple #21
0
import weka.core.jvm as jvm
import weka.core.converters as conv
from weka.classifiers import Evaluation, Classifier
from weka.core.classes import Random
import weka.plot.classifiers as plcls  # NB: matplotlib is required
import os

# Cross-validate NaiveBayes on the click-prediction ARFF file and plot its ROC.
# NOTE(review): data_dir is an absolute, machine-specific path — adjust before running elsewhere.
data_dir = "/home/suruchi/Desktop/BTECH Pro/new/click_prediction/"

# Start the JVM with packages=True before any Weka object is created.
jvm.start(packages=True)
from weka.core.converters import Loader

loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file(data_dir + "click_prediction.arff")
# The class label is the last attribute.
data.class_is_last()

#print(data)

cls = Classifier(classname="weka.classifiers.bayes.NaiveBayes")

# 2-fold cross-validation with seed 5.
evl = Evaluation(data)
evl.crossvalidate_model(cls, data, 2, Random(5))

# NOTE(review): the second argument to summary() presumably switches off the
# complexity statistics — confirm against the wrapper API.
print(evl.summary("=== NaiveBayes on click prediction (stats) ===", False))
print(evl.matrix("=== NaiveBayes on click prediction(confusion matrix) ==="))
#plcls.plot_classifier_errors(evl.predictions, absolute=False,wait = True)
# ROC curves for class indices 0 and 1; wait=True is passed, so this call is
# expected to block until the plot window is closed.
plcls.plot_roc(evl, class_index=[0, 1], wait=True)
print("areaUnderROC/1: " + str(evl.area_under_roc(1)))

jvm.stop()
Exemple #22
0
	X_train, y_train = load_data(ROOT_PATH + APS_TRAIN, skip_first_row=21, y_column_index=0,
								 assignedColumnNames=APS_FULL_COLUMNS, missingSymbol='na', needImpute=True,
								 dropOrNot=True)
	export_train_data = pd.concat([y_train.head(500), X_train.head(500)], axis=1)

	# export data to csv
	export_train_data.to_csv(GENERATED_TRAIN_DATA_FILE_PATH, sep=',', index=False)
	train_data = convert.load_any_file(filename=GENERATED_TRAIN_DATA_FILE_PATH)
	train_data.class_is_first()

	# load logistic model tree algorithm
	log_tree = Classifier(classname="weka.classifiers.trees.LMT")
	eval_train_obj = Evaluation(train_data)
	eval_train_obj.crossvalidate_model(classifier=log_tree, data=train_data, num_folds=5, rnd=Random(1))
	print("Train CV (10-folds) Error = %.2f%%" % (eval_train_obj.percent_incorrect))
	print(eval_train_obj.matrix())
	print("=================\"Summary\"====================")
	print(eval_train_obj.summary())

	log_tree.build_classifier(train_data)
	y_predict = eval_train_obj.test_model(log_tree, train_data)

	# y_train = np.array(np.where(y_train.head(500).to_numpy() == 'neg', 0, 1))
	y_train = to_binary_numeric(y_train.head(500), classNeg="neg")

	falsePositiveRate, truePositiveRate, thresholds = roc_curve(y_train, y_predict, pos_label=0)
	# compute Area Under the Curve (AUC) using the trapezoidal rule
	area = auc(falsePositiveRate, truePositiveRate)

	plt.plot(falsePositiveRate, truePositiveRate, color='red', label='ROC = ' + str(area))
	plt.plot([0, 1], [0, 1], linestyle='dotted')
Exemple #23
0
def call_weka(file_dir, ml_opt, ofile_dir):
    """Cross-validate one Weka classifier on a CSV file and save its model.

    The classifier selected by ``ml_opt`` is evaluated with 10-fold
    cross-validation (seed 42) on the CSV at ``file_dir``, using the last
    column as the class. The summary, per-class details and confusion matrix
    are written to ``<ofile_dir><id>_results.txt`` and the classifier is
    serialized to ``<ofile_dir><id>.model``.

    Args:
        file_dir: Path of the CSV file to load.
        ml_opt: String code of the learner: ``'1'`` LibSVM, ``'2'`` Bagging
            over M5P, ``'3'`` MLPClassifier, ``'4'`` RandomForest, ``'5'``
            J48, ``'6'`` NaiveBayes, ``'7'`` RBFNetwork, ``'8'`` BayesNet,
            ``'9'`` SimpleLogistic. ``'0'`` means "do nothing".
        ofile_dir: Prefix (typically a directory ending in a separator) that
            output file names are appended to.

    Returns:
        None.

    Raises:
        ValueError: If ``ml_opt`` is not one of the codes listed above. The
            original code failed later with an unbound ``classifier``.
    """
    # ml_opt code -> (Weka class name, options or None, output-file id)
    model_specs = {
        '1': ("weka.classifiers.functions.LibSVM",
              ["-S", "0", "-K", "2", "-D", "3", "-G", "0.0", "-R", "0.0",
               "-N", "0.5", "-M", "40.0", "-C", "1.0", "-E", "0.001",
               "-P", "0.1", "-seed", "1"],
              'SVM'),
        '2': ("weka.classifiers.meta.Bagging",
              ["-P", "100", "-S", "1", "-I", "10",
               "-W", "weka.classifiers.trees.M5P",
               "--", "-M", "4.0"],
              'BagM5P'),
        '3': ("weka.classifiers.functions.MLPClassifier",
              ['-N', '2', '-R', '0.01', '-O', '1.0E-6', '-P', '1', '-E',
               '1', '-S', '1'],
              'MLPC'),
        '4': ("weka.classifiers.trees.RandomForest",
              ["-I", "100", "-K", "0", "-S", "1", "-num-slots", "1"],
              'RF'),
        '5': ("weka.classifiers.trees.J48",
              ["-C", "0.25", "-M", "2"],
              'J48'),
        '6': ("weka.classifiers.bayes.NaiveBayes", None, 'NaiveBayes'),
        '7': ("weka.classifiers.functions.RBFNetwork",
              ["-B", "2", "-S", "1", "-R", "1.0E-8", "-M", "-1", "-W",
               "0.1"],
              'RBFNet'),
        '8': ("weka.classifiers.bayes.BayesNet",
              ["-D", "-Q", "weka.classifiers.bayes.net.search.local.K2",
               "--", "-P", "1", "-S", "BAYES", "-E",
               "weka.classifiers.bayes.net.estimate.SimpleEstimator",
               "--", "-A", "0.5"],
              'BayesNet'),
        '9': ("weka.classifiers.functions.SimpleLogistic",
              ["-I", "0", "-M", "500", "-H", "50", "-W", "0.0"],
              'LogReg'),
    }

    # '0' is the explicit "no classifier" choice; return before loading the
    # data since nothing would be done with it.
    if ml_opt == '0':
        return
    if ml_opt not in model_specs:
        raise ValueError("Unknown ml_opt %r; expected one of %s"
                         % (ml_opt, sorted(model_specs)))

    classname, options, ml_id = model_specs[ml_opt]
    if options is None:
        classifier = Classifier(classname=classname)
    else:
        classifier = Classifier(classname=classname, options=options)

    loader = Loader(classname="weka.core.converters.CSVLoader")
    data = loader.load_file(file_dir)
    data.class_is_last()

    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(classifier, data, 10, Random(42))
    print("Evaluation: Done.")

    # Weka's cross-validation trains copies of the classifier, so the object
    # held here is still untrained. Build it on the full dataset so the
    # serialized model is actually usable.
    classifier.build_classifier(data)

    # Reports are written as ASCII bytes; characters outside ASCII are dropped.
    reports = (evaluation.summary(), evaluation.class_details(),
               evaluation.matrix())
    with open(ofile_dir + ml_id + "_results.txt", 'wb') as ofile:
        for report in reports:
            ofile.write(report.encode('ascii', 'ignore') + b"\n")

    serialization.write(ofile_dir + ml_id + ".model", classifier)
    print("Saving " + ml_id + " Model: Done.")
def experiment_sequential_file(path_indices, path_features,
                               path_folder_save_results, options, classifier,
                               name):
    """Evaluate a classifier on consecutive row blocks of a feature file.

    Block boundaries are read from the archive at ``path_indices`` (each
    stored array is shifted by one; the last array read wins) and the number
    of rows in the dataset is appended as the final boundary. For every pair
    of neighbouring boundaries the rows between them (inclusive) form the
    test set and the rows outside that span form the training set. The
    predictions of each block are written to
    ``<path_folder_save_results>//prediction/<name><j>pred_data.csv`` and the
    per-block accuracy figures to ``<path_folder_save_results>/<name>results.csv``.

    Args:
        path_indices: Path of the archive holding the boundary array(s).
        path_features: Path of the feature file loaded as the dataset.
        path_folder_save_results: Output folder.
        options: Classifier option string, split with Weka's option splitter.
        classifier: Weka class name of the classifier.
        name: Prefix for the output file names.
    """
    archive = load(path_indices)

    # Every stored array is read; only the last one is kept.
    for key in archive.files:
        ind = archive[key] + 1

    cls = Classifier(classname=classifier,
                     options=weka.core.classes.split_options(options))

    data = converters.load_any_file(path_features)

    # Close the final block with the total number of rows.
    ind = np.append(ind, len(data))

    data.class_is_last()

    pout = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.CSV")

    d_results = {
        'index': [],
        'percent_correct': [],
        'percent_incorrect': [],
        'confusion_matrix': []
    }

    n_blocks = len(ind) - 1
    for j, (first_row, last_row) in enumerate(zip(ind[:-1], ind[1:])):
        print(j)

        test_rows = str(first_row) + '-' + str(last_row)
        print(test_rows)

        d_test = data.subset(row_range=test_rows)

        # Training rows are everything before and/or after the test span.
        rows_before = '1-' + str(first_row - 1)
        rows_after = str(last_row + 1) + '-' + str(ind[-1])
        if j == 0:  # first block: nothing before it is used
            train_rows = rows_after
        elif j == n_blocks - 1:  # last block: nothing after it
            train_rows = rows_before
        else:  # central block: both sides
            train_rows = rows_before + ',' + rows_after
        d_train = data.subset(row_range=train_rows)

        cls.build_classifier(d_train)

        evl = Evaluation(data)
        evl.test_model(cls, d_test, pout)

        predictions = pout.buffer_content()

        prediction_path = (path_folder_save_results + '/' + '/prediction/' +
                           name + str(j) + 'pred_data.csv')
        with open(prediction_path, 'w') as out:
            out.write(predictions)

        d_results['index'].append(str(first_row))
        d_results['percent_correct'].append(evl.percent_correct)
        d_results['percent_incorrect'].append(evl.percent_incorrect)
        # Textual confusion matrix of this block.
        d_results['confusion_matrix'].append(evl.matrix())

    d_results = pd.DataFrame(data=d_results)

    d_results.to_csv(path_folder_save_results + '/' + name + 'results.csv',
                     index=False)
def main():
    """
    Run the classifier examples from start to finish.

    The function takes no arguments and returns nothing; everything it
    produces is printed or plotted. In order, it:

    * loads iris and prints the J48 help text,
    * instantiates classifiers from a partial class name and from a
      command-line string,
    * builds SMO with an RBF kernel,
    * trains J48 on iris, evaluates it on the training data and on a 66%
      random split,
    * builds NaiveBayesUpdateable incrementally,
    * assembles FilteredClassifier and Vote meta-classifiers,
    * cross-validates NaiveBayes and RandomForest on diabetes and plots
      their ROC/PRC curves,
    * trains and cross-validates LinearRegression and SMOreg on bolts and
      plots the prediction errors,
    * plots a learning curve for J48 and NaiveBayesUpdateable,
    * prints the JRip rules for labor through the classifier's Java API.

    The datasets (iris, diabetes, bolts and labor ``.arff`` files) are read
    from ``helper.get_data_dir()``.

    NOTE(review): presumably a JVM has to be running before this is called
    (it is not started here) - confirm against the caller.
    """

    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()

    # classifier help
    helper.print_title("Creating help string")
    classifier = Classifier(classname="weka.classifiers.trees.J48")
    print(classifier.to_help())

    # partial classname
    helper.print_title("Creating classifier from partial classname")
    clsname = ".J48"
    classifier = Classifier(classname=clsname)
    print(clsname + " --> " + classifier.classname)

    # classifier from commandline
    helper.print_title("Creating SMO from command-line string")
    cmdline = 'weka.classifiers.functions.SMO -K "weka.classifiers.functions.supportVector.NormalizedPolyKernel -E 3.0"'
    classifier = from_commandline(cmdline,
                                  classname="weka.classifiers.Classifier")
    classifier.build_classifier(iris_data)
    print("input: " + cmdline)
    print("output: " + classifier.to_commandline())
    print("model:\n" + str(classifier))

    # kernel classifier
    helper.print_title("Creating SMO as KernelClassifier")
    kernel = Kernel(
        classname="weka.classifiers.functions.supportVector.RBFKernel",
        options=["-G", "0.001"])
    classifier = KernelClassifier(classname="weka.classifiers.functions.SMO",
                                  options=["-M"])
    classifier.kernel = kernel
    classifier.build_classifier(iris_data)
    print("classifier: " + classifier.to_commandline())
    print("model:\n" + str(classifier))

    # build a classifier and output model
    helper.print_title("Training J48 classifier on iris")
    classifier = Classifier(classname="weka.classifiers.trees.J48")
    # Instead of using 'options=["-C", "0.3"]' in the constructor, we can also set the "confidenceFactor"
    # property of the J48 classifier itself. However, being of type float rather than double, we need
    # to convert it to the correct type first using the double_to_float function:
    classifier.set_property("confidenceFactor", types.double_to_float(0.3))
    classifier.build_classifier(iris_data)
    print(classifier)
    print(classifier.graph)
    print(classifier.to_source("MyJ48"))
    plot_graph.plot_dot_graph(classifier.graph)

    # evaluate model on test set
    # (the "test set" here is the same iris data the model was trained on)
    helper.print_title("Evaluating J48 classifier on iris")
    evaluation = Evaluation(iris_data)
    evl = evaluation.test_model(classifier, iris_data)
    print(evl)
    print(evaluation.summary())

    # evaluate model on train/test split
    helper.print_title("Evaluating J48 classifier on iris (random split 66%)")
    classifier = Classifier(classname="weka.classifiers.trees.J48",
                            options=["-C", "0.3"])
    evaluation = Evaluation(iris_data)
    evaluation.evaluate_train_test_split(classifier, iris_data, 66.0,
                                         Random(1))
    print(evaluation.summary())

    # load a dataset incrementally and build classifier incrementally
    helper.print_title("Build classifier incrementally on iris")
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_inc = loader.load_file(iris_file, incremental=True)
    iris_inc.class_is_last()
    classifier = Classifier(
        classname="weka.classifiers.bayes.NaiveBayesUpdateable")
    classifier.build_classifier(iris_inc)
    # iterating the loader yields the instances one at a time
    for inst in loader:
        classifier.update_classifier(inst)
    print(classifier)

    # construct meta-classifiers
    helper.print_title("Meta classifiers")
    # generic FilteredClassifier instantiation
    print("generic FilteredClassifier instantiation")
    meta = SingleClassifierEnhancer(
        classname="weka.classifiers.meta.FilteredClassifier")
    meta.classifier = Classifier(
        classname="weka.classifiers.functions.LinearRegression")
    flter = Filter("weka.filters.unsupervised.attribute.Remove")
    flter.options = ["-R", "first"]
    meta.set_property("filter", flter.jobject)
    print(meta.to_commandline())
    # direct FilteredClassifier instantiation
    print("direct FilteredClassifier instantiation")
    meta = FilteredClassifier()
    meta.classifier = Classifier(
        classname="weka.classifiers.functions.LinearRegression")
    flter = Filter("weka.filters.unsupervised.attribute.Remove")
    flter.options = ["-R", "first"]
    meta.filter = flter
    print(meta.to_commandline())
    # generic Vote
    print("generic Vote instantiation")
    meta = MultipleClassifiersCombiner(classname="weka.classifiers.meta.Vote")
    classifiers = [
        Classifier(classname="weka.classifiers.functions.SMO"),
        Classifier(classname="weka.classifiers.trees.J48")
    ]
    meta.classifiers = classifiers
    print(meta.to_commandline())

    # cross-validate nominal classifier
    helper.print_title("Cross-validating NaiveBayes on diabetes")
    diabetes_file = helper.get_data_dir() + os.sep + "diabetes.arff"
    helper.print_info("Loading dataset: " + diabetes_file)
    loader = Loader("weka.core.converters.ArffLoader")
    diabetes_data = loader.load_file(diabetes_file)
    diabetes_data.class_is_last()
    classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
    pred_output = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText",
        options=["-distribution"])
    evaluation = Evaluation(diabetes_data)
    evaluation.crossvalidate_model(classifier,
                                   diabetes_data,
                                   10,
                                   Random(42),
                                   output=pred_output)
    print(evaluation.summary())
    print(evaluation.class_details())
    print(evaluation.matrix())
    print("areaUnderPRC/0: " + str(evaluation.area_under_prc(0)))
    print("weightedAreaUnderPRC: " + str(evaluation.weighted_area_under_prc))
    print("areaUnderROC/1: " + str(evaluation.area_under_roc(1)))
    print("weightedAreaUnderROC: " + str(evaluation.weighted_area_under_roc))
    print("avgCost: " + str(evaluation.avg_cost))
    print("totalCost: " + str(evaluation.total_cost))
    print("confusionMatrix: " + str(evaluation.confusion_matrix))
    print("correct: " + str(evaluation.correct))
    print("pctCorrect: " + str(evaluation.percent_correct))
    print("incorrect: " + str(evaluation.incorrect))
    print("pctIncorrect: " + str(evaluation.percent_incorrect))
    print("unclassified: " + str(evaluation.unclassified))
    print("pctUnclassified: " + str(evaluation.percent_unclassified))
    print("coverageOfTestCasesByPredictedRegions: " +
          str(evaluation.coverage_of_test_cases_by_predicted_regions))
    print("sizeOfPredictedRegions: " +
          str(evaluation.size_of_predicted_regions))
    print("falseNegativeRate: " + str(evaluation.false_negative_rate(1)))
    print("weightedFalseNegativeRate: " +
          str(evaluation.weighted_false_negative_rate))
    print("numFalseNegatives: " + str(evaluation.num_false_negatives(1)))
    print("trueNegativeRate: " + str(evaluation.true_negative_rate(1)))
    print("weightedTrueNegativeRate: " +
          str(evaluation.weighted_true_negative_rate))
    print("numTrueNegatives: " + str(evaluation.num_true_negatives(1)))
    print("falsePositiveRate: " + str(evaluation.false_positive_rate(1)))
    print("weightedFalsePositiveRate: " +
          str(evaluation.weighted_false_positive_rate))
    print("numFalsePositives: " + str(evaluation.num_false_positives(1)))
    print("truePositiveRate: " + str(evaluation.true_positive_rate(1)))
    print("weightedTruePositiveRate: " +
          str(evaluation.weighted_true_positive_rate))
    print("numTruePositives: " + str(evaluation.num_true_positives(1)))
    print("fMeasure: " + str(evaluation.f_measure(1)))
    print("weightedFMeasure: " + str(evaluation.weighted_f_measure))
    print("unweightedMacroFmeasure: " +
          str(evaluation.unweighted_macro_f_measure))
    print("unweightedMicroFmeasure: " +
          str(evaluation.unweighted_micro_f_measure))
    print("precision: " + str(evaluation.precision(1)))
    print("weightedPrecision: " + str(evaluation.weighted_precision))
    print("recall: " + str(evaluation.recall(1)))
    print("weightedRecall: " + str(evaluation.weighted_recall))
    print("kappa: " + str(evaluation.kappa))
    print("KBInformation: " + str(evaluation.kb_information))
    print("KBMeanInformation: " + str(evaluation.kb_mean_information))
    print("KBRelativeInformation: " + str(evaluation.kb_relative_information))
    print("SFEntropyGain: " + str(evaluation.sf_entropy_gain))
    print("SFMeanEntropyGain: " + str(evaluation.sf_mean_entropy_gain))
    print("SFMeanPriorEntropy: " + str(evaluation.sf_mean_prior_entropy))
    print("SFMeanSchemeEntropy: " + str(evaluation.sf_mean_scheme_entropy))
    print("matthewsCorrelationCoefficient: " +
          str(evaluation.matthews_correlation_coefficient(1)))
    print("weightedMatthewsCorrelation: " +
          str(evaluation.weighted_matthews_correlation))
    print("class priors: " + str(evaluation.class_priors))
    print("numInstances: " + str(evaluation.num_instances))
    print("meanAbsoluteError: " + str(evaluation.mean_absolute_error))
    print("meanPriorAbsoluteError: " +
          str(evaluation.mean_prior_absolute_error))
    print("relativeAbsoluteError: " + str(evaluation.relative_absolute_error))
    print("rootMeanSquaredError: " + str(evaluation.root_mean_squared_error))
    print("rootMeanPriorSquaredError: " +
          str(evaluation.root_mean_prior_squared_error))
    print("rootRelativeSquaredError: " +
          str(evaluation.root_relative_squared_error))
    print("prediction output:\n" + str(pred_output))
    plot_cls.plot_roc(evaluation,
                      title="ROC diabetes",
                      class_index=range(
                          0, diabetes_data.class_attribute.num_values),
                      wait=False)
    plot_cls.plot_prc(evaluation,
                      title="PRC diabetes",
                      class_index=range(
                          0, diabetes_data.class_attribute.num_values),
                      wait=False)

    # train 2nd classifier on diabetes dataset
    classifier2 = Classifier(classname="weka.classifiers.trees.RandomForest")
    evaluation2 = Evaluation(diabetes_data)
    evaluation2.crossvalidate_model(classifier2, diabetes_data, 10, Random(42))
    plot_cls.plot_rocs({
        "NB": evaluation,
        "RF": evaluation2
    },
                       title="ROC diabetes",
                       class_index=0,
                       wait=False)
    plot_cls.plot_prcs({
        "NB": evaluation,
        "RF": evaluation2
    },
                       title="PRC diabetes",
                       class_index=0,
                       wait=False)

    # load a numeric dataset
    bolts_file = helper.get_data_dir() + os.sep + "bolts.arff"
    helper.print_info("Loading dataset: " + bolts_file)
    loader = Loader("weka.core.converters.ArffLoader")
    bolts_data = loader.load_file(bolts_file)
    bolts_data.class_is_last()

    # build a classifier and output model
    helper.print_title("Training LinearRegression on bolts")
    classifier = Classifier(
        classname="weka.classifiers.functions.LinearRegression",
        options=["-S", "1", "-C"])
    classifier.build_classifier(bolts_data)
    print(classifier)

    # cross-validate numeric classifier
    helper.print_title("Cross-validating LinearRegression on bolts")
    classifier = Classifier(
        classname="weka.classifiers.functions.LinearRegression",
        options=["-S", "1", "-C"])
    evaluation = Evaluation(bolts_data)
    evaluation.crossvalidate_model(classifier, bolts_data, 10, Random(42))
    print(evaluation.summary())
    print("correlationCoefficient: " + str(evaluation.correlation_coefficient))
    print("errorRate: " + str(evaluation.error_rate))
    helper.print_title("Header - bolts")
    print(str(evaluation.header))
    helper.print_title("Predictions on bolts")
    for index, pred in enumerate(evaluation.predictions):
        print(
            str(index + 1) + ": " + str(pred) + " -> error=" + str(pred.error))
    plot_cls.plot_classifier_errors(evaluation.predictions, wait=False)

    # train 2nd classifier and show errors in same plot
    classifier2 = Classifier(classname="weka.classifiers.functions.SMOreg")
    evaluation2 = Evaluation(bolts_data)
    evaluation2.crossvalidate_model(classifier2, bolts_data, 10, Random(42))
    plot_cls.plot_classifier_errors(
        {
            "LR": evaluation.predictions,
            "SMOreg": evaluation2.predictions
        },
        wait=False)

    # learning curve
    # (this is the only plot requested with wait=True; the earlier ones
    # pass wait=False)
    cls = [
        Classifier(classname="weka.classifiers.trees.J48"),
        Classifier(classname="weka.classifiers.bayes.NaiveBayesUpdateable")
    ]
    plot_cls.plot_learning_curve(cls,
                                 diabetes_data,
                                 increments=0.05,
                                 label_template="[#] !",
                                 metric="percent_correct",
                                 wait=True)

    # access classifier's Java API
    labor_file = helper.get_data_dir() + os.sep + "labor.arff"
    helper.print_info("Loading dataset: " + labor_file)
    loader = Loader("weka.core.converters.ArffLoader")
    labor_data = loader.load_file(labor_file)
    labor_data.class_is_last()

    helper.print_title("Using JRip's Java API to access rules")
    jrip = Classifier(classname="weka.classifiers.rules.JRip")
    jrip.build_classifier(labor_data)
    rset = jrip.jwrapper.getRuleset()
    # `range` rather than the Python 2-only `xrange`, which raises NameError
    # on Python 3; the iteration is the same on both versions.
    for i in range(rset.size()):
        r = rset.get(i)
        print(str(r.toString(labor_data.class_attribute.jobject)))
# Read both splits of the imbalanced dataset (train first, then test) and
# declare the last attribute of each as the class.
train, test = (converters.load_any_file(arff_path)
               for arff_path in ("imbalanced_train.arff",
                                 "imbalanced_test.arff"))
train.class_is_last()
test.class_is_last()

# Logistic model tree; "-I 10" sets the number of iterations performed by
# Logit Boost.
cls = Classifier(
    classname="weka.classifiers.trees.LMT",
    options=["-B", "-I", "10"],
)

# 5-fold cross-validation on the training split.
evl = Evaluation(train)
evl.crossvalidate_model(classifier=cls,
                        data=train,
                        num_folds=5,
                        rnd=Random(1))

# Report the cross-validation error followed by the confusion matrix.
print("LMT (imbalanced classes) CV = 5 Error: %.2f%%" %
      (evl.percent_incorrect, ))
print(evl.matrix())

# ROC curves for both class indices.
plcls.plot_roc(evl, class_index=[0, 1], wait=True)

# Additional statistics from the cross-validation run.
print(evl.summary())
print(evl.class_details())

# Fit on the full training split, then score the held-out test split.
cls.build_classifier(train)
tevl = Evaluation(test)
tevl.test_model(cls, test)

# Report the error on the test split.
print("LMT (imbalanced classes) Test Error: %.2f%%" %
      (tevl.percent_incorrect, ))
# Train a default J48 tree on segment-challenge and report the confusion
# matrix and accuracy on both the training data and the segment-test data.
# NOTE(review): `loader` (and the started JVM) are not defined in the visible
# part of this snippet - presumably set up just above; confirm.
trainData = loader.load_file('segment-challenge.arff')
trainData.class_is_last()
testData = loader.load_file('segment-test.arff')
testData.class_is_last()

# Default C4.5 tree
classifier = Classifier(classname="weka.classifiers.trees.J48")

# Build the classifier on the training data, using the options set above
# (no parameter search happens in this snippet)
classifier.build_classifier(trainData)

print("\n\n=========== Classifier information ================\n\n")
print(classifier.options)
print(classifier)

# Evaluate on the same data the tree was trained on
print("\n\n=========== Train results ================\n\n")
evaluation = Evaluation(trainData)
evaluation.test_model(classifier, trainData)
print(classifier.to_commandline())
print(evaluation.matrix())
print("Train recognition: %0.2f%%" % evaluation.percent_correct)

# Evaluate on the separate test file
print("\n\n=========== Test results ================\n\n")
evaluation = Evaluation(testData)
evaluation.test_model(classifier, testData)
print(classifier.to_commandline())
print(evaluation.matrix())
print("Test recognition: %0.2f%%" % evaluation.percent_correct)

jvm.stop()
Exemple #28
0
# Load the ReutersGrain test set; the class attribute is the last column.
# NOTE(review): `data_dir`, `data` (the training set used below) and the JVM
# start are not defined in the visible part of this snippet - presumably they
# come from lines cut off above; confirm.
fname = data_dir + os.sep + "ReutersGrain-test.arff"
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
test = loader.load_file(fname)
test.class_is_last()

# Each setup is (base classifier class name, StringToWordVector filter options).
setups = (
    ("weka.classifiers.trees.J48", []),
    ("weka.classifiers.bayes.NaiveBayes", []),
    ("weka.classifiers.bayes.NaiveBayesMultinomial", []),
    ("weka.classifiers.bayes.NaiveBayesMultinomial", ["-C"]),
    ("weka.classifiers.bayes.NaiveBayesMultinomial", ["-C", "-L", "-stopwords-handler", "weka.core.stopwords.Rainbow"])
)

# train each setup on `data` and evaluate it on the separate test set
for setup in setups:
    classifier, opt = setup
    print("\n--> %s (filter options: %s)\n" % (classifier, " ".join(opt)))
    # wrap the base classifier so the StringToWordVector filter is applied to its input
    cls = FilteredClassifier()
    cls.classifier = Classifier(classname=classifier)
    cls.filter = Filter(classname="weka.filters.unsupervised.attribute.StringToWordVector", options=opt)
    cls.build_classifier(data)
    evl = Evaluation(test)
    evl.test_model(cls, test)
    print("Accuracy: %0.0f%%" % evl.percent_correct)
    # threshold-curve data for the second argument 0 (presumably the class
    # label index - confirm), from which the AUC is computed
    tcdata = plc.generate_thresholdcurve_data(evl, 0)
    print("AUC: %0.3f" % plc.get_auc(tcdata))
    print(evl.matrix("Matrix:"))

jvm.stop()
Exemple #29
0
# Dataset directory: the WEKAMOOC_DATA environment variable when it is set,
# otherwise the "data" folder below the current directory.
data_dir = os.environ.get("WEKAMOOC_DATA", "." + os.sep + "data")

import weka.core.jvm as jvm
from weka.core.converters import Loader
from weka.classifiers import Classifier, Evaluation, PredictionOutput
from weka.core.classes import Random
import weka.plot.classifiers as plc

jvm.start()

# Locate and load weather.nominal; its last attribute is the class.
fname = os.sep.join((data_dir, "weather.nominal.arff"))
print("".join(("\nLoading dataset: ", fname, "\n")))
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file(fname)
data.class_is_last()

# 10-fold cross-validation of NaiveBayes; the per-instance predictions are
# collected as plain text with the "-distribution" option.
cls = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
pout = PredictionOutput(
    classname="weka.classifiers.evaluation.output.prediction.PlainText",
    options=["-distribution"],
)
evl = Evaluation(data)
evl.crossvalidate_model(classifier=cls,
                        data=data,
                        num_folds=10,
                        rnd=Random(1),
                        output=pout)

# Summary, confusion matrix, collected predictions, then the ROC plot.
print(evl.summary())
print(evl.matrix())
print(pout)
plc.plot_roc(evl, wait=True)

jvm.stop()
                                   sep=',',
                                   index=False)
    smote_train_data = convert.load_any_file(
        filename=GENERATED_SMOTE_TRAIN_DATA_FILE_PATH)
    smote_train_data.class_is_first()

    # load logistic model tree algorithm
    log_tree = Classifier(classname="weka.classifiers.trees.LMT")
    eval_smote_train_obj = Evaluation(smote_train_data)
    eval_smote_train_obj.crossvalidate_model(classifier=log_tree,
                                             data=smote_train_data,
                                             num_folds=5,
                                             rnd=Random(1))
    print("SMOTE Train CV (5-folds) Error = %.2f%%" %
          (eval_smote_train_obj.percent_incorrect))
    print(eval_smote_train_obj.matrix())
    print("=================\"Summary\"====================")
    print(eval_smote_train_obj.summary())

    log_tree.build_classifier(smote_train_data)
    y_predict = eval_smote_train_obj.test_model(log_tree, smote_train_data)

    y_train_smote = to_binary_numeric(y_train_smote, classNeg="neg")

    falsePositiveRate, truePositiveRate, thresholds = roc_curve(y_train_smote,
                                                                y_predict,
                                                                pos_label=0)
    # compute Area Under the Curve (AUC) using the trapezoidal rule
    area = auc(falsePositiveRate, truePositiveRate)

    plt.plot(falsePositiveRate,