Code example #1
def vote_classifier_train(directory, nameOfDataSet, flag):
    loader = Loader(classname="weka.core.converters.CSVLoader")
    data = loader.load_file(directory)
    data.class_is_last()
    meta = MultipleClassifiersCombiner(
        classname="weka.classifiers.meta.Vote",
        options=[
            '-S', '1', '-B', 'weka.classifiers.trees.J48 -C 0.25 -M 2', '-B',
            'weka.classifiers.trees.RandomTree -K 6 -M 1.0 -V 0.001 -S 1',
            '-B',
            'weka.classifiers.meta.Bagging -P 100 -S 1 -num-slots 1 -I 10 -W weka.classifiers.trees.REPTree -- '
            '-M 2 -V 0.001 -N 3 -S 1 -L -1 -I 0.0', '-B',
            'weka.classifiers.meta.AdaBoostM1 -P 100 -S 1 -I 10 -W weka.classifiers.trees.DecisionStump',
            '-B',
            'weka.classifiers.meta.Bagging -P 100 -S 1 -num-slots 1 -I 10 -W weka.classifiers.trees.REPTree -- '
            '-M 2 -V 0.001 -N 3 -S 1 -L -1 -I 0.0', '-B',
            'weka.classifiers.bayes.NaiveBayes', '-R', 'AVG'
        ])
    evl = Evaluation(data)  # 'evl' rather than 'eval', which would shadow the builtin
    pout = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText")
    if flag:
        evl.crossvalidate_model(meta, data, 10, Random(1), pout)
    else:
        evl.evaluate_train_test_split(meta, data, 80.0, Random(1), pout)
    gc.collect()
    print_and_save('Proposed model', flag, nameOfDataSet, evl)
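Note: the snippets on this page all assume python-weka-wrapper3 with a running JVM. The imports below use the library's actual module paths, but the call at the bottom uses hypothetical arguments, so treat this as a minimal setup sketch rather than part of the original code:

import gc
import weka.core.jvm as jvm
from weka.core.classes import Random
from weka.core.converters import Loader
from weka.classifiers import (Classifier, Evaluation, MultipleClassifiersCombiner,
                              PredictionOutput)

jvm.start()
try:
    # True -> 10-fold cross-validation, False -> 80/20 train/test split
    vote_classifier_train("dataset.csv", "MyDataSet", True)
finally:
    jvm.stop()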
Code example #2
File: utils.py Project: Unkrible/NFS
    def exposed_evaluate(self, X, d, task, i_model, i_evl):
        data = np.reshape(eval(X), [d, -1], order='C')  # X arrives as a string repr of an array; eval() assumes trusted input
        if task == 'regression':
            if i_model == 'LR':
                data = converters.ndarray_to_instances(data, relation='tmp')
                data.class_is_last()
                model = Classifier(
                    classname='weka.classifiers.functions.LinearRegression')
                evl = Evaluation(data)
                evl.crossvalidate_model(model, data, 5, Random(0))
            elif i_model == 'RF':
                data = converters.ndarray_to_instances(data, relation='tmp')
                data.class_is_last()
                model = Classifier(
                    classname='weka.classifiers.trees.RandomForest')
                evl = Evaluation(data)
                evl.crossvalidate_model(model, data, 5, Random(0))
            if i_evl == 'mae':
                r_mae = evl.mean_absolute_error
                return r_mae
            elif i_evl == 'mse':
                r_mse = evl.mean_square_error
                return r_mse
            elif i_evl == '1-rae':
                r_one_minus_rae = 1 - evl.relative_absolute_error / 100
                del evl, model, data
                return r_one_minus_rae

        elif task == 'classification':
            le = LabelEncoder()
            data[:, -1] = le.fit_transform(data[:, -1])
            if i_model == 'RF':
                dataRaw = converters.ndarray_to_instances(data, relation='tmp')
                weka_filter = Filter(
                    classname=
                    "weka.filters.unsupervised.attribute.NumericToNominal",
                    options=["-R", "last"])
                weka_filter.inputformat(dataRaw)
                data = weka_filter.filter(dataRaw)
                data.class_is_last()
                model = Classifier(
                    classname='weka.classifiers.trees.RandomForest')
                evl = Evaluation(data)
                evl.crossvalidate_model(model, data, 5, Random(0))
            elif i_model == 'LR':
                # note: the sklearn models below are built but never evaluated in this
                # snippet; only the 'RF' (Weka) path defines evl and dataRaw
                model = LogisticRegression(multi_class='ovr')
            elif i_model == 'SVM':
                model = svm.SVC()
            if i_evl == 'f_score':
                fscore = evl.weighted_f_measure
                del evl, model, data, dataRaw
                if not (fscore >= 0.01 and fscore < 1.01):
                    fscore = 0.01  # guard against NaN or out-of-range values
                return fscore
Code example #3
def naive_bayes(directory, nameOfDataSet, flag):
    loader = Loader(classname="weka.core.converters.CSVLoader")
    data = loader.load_file(directory)
    data.class_is_last()
    cls = Classifier(classname='weka.classifiers.bayes.NaiveBayes')
    evl = Evaluation(data)
    pout = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText")
    if flag:
        evl.crossvalidate_model(cls, data, 10, Random(1), pout)
    else:
        evl.evaluate_train_test_split(cls, data, 80.0, Random(1), pout)
    print_and_save('Naive Bayes model', flag, nameOfDataSet, evl)
    gc.collect()
Code example #4
def Boost_J48(data, rnm):
    data.class_is_last()
    fc1 = FilteredClassifier()
    fc1.classifier = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.25", "-M", "2"])
    fc1.filter = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "first"])
    fc2 = SingleClassifierEnhancer(classname="weka.classifiers.meta.AdaBoostM1", options=["-P", "100", "-S", "1", "-I", "10"])
    fc2.classifier = fc1
    pred_output = PredictionOutput(classname="weka.classifiers.evaluation.output.prediction.CSV", options=["-p", "1"])
    folds = 10
    fc2.build_classifier(data)
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(fc2, data, folds, Random(1), pred_output)
    f0 = open(rnm + '_Boost_J48_Tree.txt', 'w')
    print("Filename: ", rnm, file=f0)
    print('\n\n', file=f0)
    print(str(fc2), file=f0)
    f0.close()
    f1 = open(rnm + '_Boost_J48_Prediction.txt', 'w')
    print('Filename:', rnm, file=f1)
    print('Prediction Summary:', pred_output.buffer_content(), file=f1)
    f1.close()
    f2 = open(rnm + '_Boost_j48_Evaluation.txt', 'w')
    print('Filename:', rnm, file=f2)
    print('Evaluation Summary:', evaluation.summary(), file=f2)
    print('\n\n\n', file=f2)
    print(evaluation.class_details(), file=f2)
    f2.close()
    plot_roc(evaluation, class_index=[0,1], title=rnm, key_loc='best', outfile=rnm + '_Boost_J48_ROC.png', wait=False)
    value_Boost_J48 = str(evaluation.percent_correct)
    return value_Boost_J48
Code example #5
def RandomTree(data, rnm):
    data.class_is_last()
    fc = FilteredClassifier()
    fc.classifier = Classifier(classname="weka.classifiers.trees.RandomTree", options=["-K", "0", "-M", "1.0", "-V", "0.001", "-S", "1"])
    fc.filter = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "first"])
    pred_output = PredictionOutput(classname="weka.classifiers.evaluation.output.prediction.CSV", options=["-p", "1"])
    folds = 10
    evl = Evaluation(data)
    evl.crossvalidate_model(fc, data, folds, Random(1), pred_output)
    fc.build_classifier(data)
    f0 = open(rnm + '_RT_Tree.txt', 'w')
    print("Filename: ", rnm, file=f0)
    print('\n\n', file=f0)
    print(str(fc), file=f0)
    f0.close()
    f1 = open(rnm + '_RT_Prediction.txt', 'w')
    print('Filename:', rnm, file=f1)
    print('Prediction Summary:', pred_output.buffer_content(), file=f1)
    f1.close()
    f2 = open(rnm + '_RT_Evaluation.txt', 'w')
    print('Filename:', rnm, file=f2)
    print('Evaluation Summary:', evl.summary(), file=f2)
    print('\n\n\n', file=f2)
    print(evl.class_details(), file=f2)
    f2.close()
    plot_roc(evl, class_index=[0,1], title=rnm, key_loc='best', outfile=rnm+'_RT_ROC.png', wait=False)
    value_RT = str(evl.percent_correct)
    return value_RT
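Note: both helpers above expect an already-loaded Instances object whose first attribute is an identifier column (hence the Remove filter). A minimal, hypothetical call sequence (the file name is illustrative only):

import weka.core.jvm as jvm
from weka.core.converters import Loader

jvm.start()
loader = Loader(classname="weka.core.converters.CSVLoader")
data = loader.load_file("mydata.csv")
acc_boost = Boost_J48(data, "mydata")   # writes *_Boost_J48_*.txt files and a ROC plot
acc_rt = RandomTree(data, "mydata")     # writes *_RT_*.txt files and a ROC plot
jvm.stop()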
Code example #6
def runSMO(file, bound):
    loader = Loader(classname="weka.core.converters.CSVLoader")
    data = loader.load_file(file)
    data.class_is_first()

    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                    options=["-R", bound])

    cls = KernelClassifier(
        classname="weka.classifiers.functions.SMO",
        options=["-C", "1.0", "-L", "0.001", "-P", "1.0E-12", "-N", "0"])
    kernel = Kernel(
        classname="weka.classifiers.functions.supportVector.PolyKernel",
        options=["-C", "250007", "-E", "1.0"])
    cls.kernel = kernel
    pout = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText")

    remove.inputformat(data)
    filtered = remove.filter(data)

    evl = Evaluation(filtered)
    evl.crossvalidate_model(cls, filtered, 10, Random(1), pout)

    #print(pout.buffer_content())

    print(evl.percent_correct)
    #print(evl.summary())

    result = evl.class_details()
    print(result)
    return result
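Note: a hedged usage sketch for runSMO. The class is assumed to be the first CSV column, and 'bound' is a Weka attribute-range string passed to the Remove filter; the file name and range below are illustrative assumptions:

import weka.core.jvm as jvm

jvm.start()
details = runSMO("features.csv", "2-5")  # drop attributes 2-5 before 10-fold CV with SMO
jvm.stop()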
Code example #7
def create_model(input_file, output_file):
    # Load data
    data = converters.load_any_file(input_file)
    data.class_is_last()  # set class attribute

    # filter data
    print_title("Filtering Data")
    discretize = Filter(
        classname="weka.filters.unsupervised.attribute.Discretize",
        options=["-B", "10", "-M", "-1.0", "-R", "first-last"])
    discretize.inputformat(data)  # let the filter know about the type of data to filter
    filtered_data = discretize.filter(data)
    print("Done! (believe it or not)")

    print_title("Build Classifier")
    classifier = Classifier(classname="weka.classifiers.trees.RandomForest",
                            options=["-I", "100", "-K", "0", "-S", "1"])
    classifier.build_classifier(filtered_data)
    print("Done! (believe it or not)")
    serialization.write_all(output_file, [classifier, discretize])
    print("Model and filter saved to ", output_file)

    evaluation = Evaluation(data)  # initialize with priors
    evaluation.crossvalidate_model(classifier, filtered_data, 10,
                                   Random(42))  # 10-fold CV
    print(evaluation.summary())
    print("pctCorrect: " + str(evaluation.percent_correct))
    print("incorrect: " + str(evaluation.incorrect))
Code example #8
File: common_defs.py Project: silvianunes/hyperband
def train_and_eval_weka_classifier(clf, train, valid, n_instances):

    total_train_inst = train.num_instances

    percentage = (n_instances * 100) / total_train_inst

    if percentage == 100:
        opt = train
    else:
        opt, residual = train.train_test_split(percentage, Random(1))

    # opt, residual = train.train_test_split(percentage, Random(1))

    print('total_train_inst:    ', total_train_inst, '| percentage:    ',
          percentage, '| used_inst:     ', opt.num_instances)

    clf.build_classifier(opt)

    evl = Evaluation(opt)
    evl.test_model(clf, valid)
    # evl.crossvalidate_model(clf, opt, 10, Random(1))

    acc = evl.percent_correct
    auc = evl.weighted_area_under_roc
    err = evl.error_rate
    log = evl.sf_mean_scheme_entropy

    print(
        "# validating  | loss: {:.2}, accuracy: {:.4}, AUC: {:.2}, error: {:.2}"
        .format(log, acc, auc, err))

    return {'loss': log, 'accuracy': acc, 'auc': auc, 'err': err}
Code example #9
def main(args):
    """
    Loads a dataset, shuffles it, splits it into train/test set. Trains J48 with training set and
    evaluates the built model on the test set.
    :param args: the commandline arguments (optional, can be dataset filename)
    :type args: list
    """

    # load a dataset
    if len(args) <= 1:
        data_file = helper.get_data_dir() + os.sep + "vote.arff"
    else:
        data_file = args[1]
    helper.print_info("Loading dataset: " + data_file)
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(data_file)
    data.class_is_last()

    # generate train/test split of randomized data
    train, test = data.train_test_split(66.0, Random(1))

    # build classifier
    cls = Classifier(classname="weka.classifiers.trees.J48")
    cls.build_classifier(train)
    print(cls)

    # evaluate
    evl = Evaluation(train)
    evl.test_model(cls, test)
    print(evl.summary())
Code example #10
File: assignment.py Project: Qisen25/Data-Mining
def naiveBayes(data):
    classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayes", options=["-D"])
    nfolds = 13
    rnd = Random(0)
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(classifier, data, nfolds, rnd)
    print("Naive Bayes Cross-validation information")
    print(evaluation.summary())
    print("precision: " + str(evaluation.precision(1)))
    print("recall: " + str(evaluation.recall(1)))
    print("F-measure: " + str(evaluation.f_measure(1)))
    print("==confusion matrix==")
    print("     a     b")
    print(evaluation.confusion_matrix)
    print()
    # write to file
    f = open("naiveeval.txt", "w")
    f.write(evaluation.summary())
    f.write("\n")
    f.write("==confusion matrix==\n")
    f.write("     a       b\n")
    for item in evaluation.confusion_matrix:
        f.write("%s\n" % item)
    f.close()
    # plot ROC graph
    plcls.plot_roc(evaluation, title="Naive Bayes ROC", outfile="NBROC", wait=True)

    return evaluation.percent_correct
Code example #11
def run():
    jvm.start()
    load_csv = Loader("weka.core.converters.CSVLoader")
    data_csv = load_csv.load_file(
        "/Users/imeiliasantoso/web_graduate_project4/predict_page/predict_data.csv"
    )

    saver = Saver("weka.core.converters.ArffSaver")
    saver.save_file(
        data_csv,
        "/Users/imeiliasantoso/web_graduate_project4/predict_page/predict_data.arff"
    )

    load_arff = Loader("weka.core.converters.ArffLoader")
    data_arff = load_arff.load_file(
        "/Users/imeiliasantoso/web_graduate_project4/predict_page/predict_data.arff"
    )
    data_arff.class_is_last()

    global j48
    J48_class = Classifier(classname="weka.classifiers.trees.J48",
                           options=["-C", "0.25", "-M", "2"])
    J48_class.build_classifier(data_arff)
    evaluationj48 = Evaluation(data_arff)
    evaluationj48.crossvalidate_model(J48_class, data_arff, 10, Random(100))
    j48 = str(evaluationj48.percent_correct)
    jvm.stop()
    return j48
Code example #12
def main():
    """
    Shows how to use the CostSensitiveClassifier.
    """

    # load a dataset
    data_file = helper.get_data_dir() + os.sep + "diabetes.arff"
    helper.print_info("Loading dataset: " + data_file)
    loader = Loader("weka.core.converters.ArffLoader")
    data = loader.load_file(data_file)
    data.class_is_last()

    # classifier
    classifier = SingleClassifierEnhancer(
        classname="weka.classifiers.meta.CostSensitiveClassifier",
        options=["-cost-matrix", "[0 1; 2 0]", "-S", "2"])
    base = Classifier(classname="weka.classifiers.trees.J48",
                      options=["-C", "0.3"])
    classifier.classifier = base

    folds = 10
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(classifier, data, folds, Random(1))

    print("")
    print("=== Setup ===")
    print("Classifier: " + classifier.to_commandline())
    print("Dataset: " + data.relationname)
    print("")
    print(
        evaluation.summary("=== " + str(folds) +
                           "-fold Cross-Validation ==="))
Code example #13
def experiment_file_random(path_features, path_folder_save_results, options,
                           classifier, fold, random, name):
    print("start weka")
    cls = Classifier(classname=classifier,
                     options=weka.core.classes.split_options(options))
    d_results = {
        'percent_correct': [],
        'percent_incorrect': [],
        'confusion_matrix': []
    }
    data = converters.load_any_file(path_features)
    data.class_is_last()
    pout = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.CSV")
    evl = Evaluation(data)
    evl.crossvalidate_model(cls, data, fold, Random(random), pout)
    d_results['percent_correct'].append(evl.percent_correct)
    d_results['percent_incorrect'].append(evl.percent_incorrect)
    d_results['confusion_matrix'].append(
        evl.matrix())  # Generates the confusion matrix.

    d_results = pd.DataFrame(data=d_results)

    d_results.to_csv(path_folder_save_results + '/' + str(name) + '.csv',
                     index=False)

    save = pout.buffer_content()

    with open(
            path_folder_save_results + '/' + 'prediction/' + str(name) +
            '.csv', 'w') as f:
        f.write(save)
Code example #14
def splitTrainSet(data, m_numLabeledData=10):
    total = data.num_instances
    labeled_amount = int(m_numLabeledData * total / 100)
    unlabeled_amount = total - labeled_amount

    rand = Random(1)
    data.randomize(rand)

    labeledDataSet = Instances.create_instances(data.relationname, list(data.attributes()), labeled_amount)
    unlabeledDataSet = Instances.create_instances(data.relationname, list(data.attributes()), unlabeled_amount)

    for i in range(labeled_amount):
        labeledDataSet.add_instance(data.get_instance(i))

    labeledDataSet.randomize(rand)

    for i in range(unlabeled_amount):
        unlabeledDataSet.add_instance(data.get_instance(labeled_amount + i))

    labeledDataSet.class_is_last()
    unlabeledDataSet.class_is_last()

    return labeledDataSet, unlabeledDataSet
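Note: a hypothetical call, assuming 'data' was loaded as in the other snippets; it keeps 10% of the shuffled instances as the labeled pool and returns the rest as unlabeled:

labeled, unlabeled = splitTrainSet(data, m_numLabeledData=10)
print(labeled.num_instances, unlabeled.num_instances)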
Code example #15
    def evaluation(self, classifier, trainingData, testingData=None):
        trainingData.class_is_last()
        if testingData is None:
            evaluation = Evaluation(trainingData)
            # initialize with priors
            evaluation.crossvalidate_model(classifier, trainingData, 10,
                                           Random(42))  # 10-fold CV
            return evaluation
        else:
            print("testing data exists")
            if testingData.num_attributes == trainingData.num_attributes:
                testingData.class_is_last()
                evaluation = Evaluation(trainingData)

                classifier.build_classifier(trainingData)
                evaluation.test_model(classifier, testingData)

                # for attribute in trainingData.attributes():
                #     print("train:" + str(attribute))
                # for attribute in testingData.attributes():
                #     print("test:" + str(attribute))

                return evaluation
            else:
                print("testing data doesn't have the same attributes as the training data")
                for attribute in trainingData.attributes():
                    print("train:" + str(attribute))
                for attribute in testingData.attributes():
                    print("test:" + str(attribute))
Code example #16
    def test_generate_thresholdcurve_data(self):
        """
        Tests the generate_thresholdcurve_data method.
        """
        loader = converters.Loader(classname="weka.core.converters.ArffLoader")
        data = loader.load_file(self.datafile("diabetes.arff"))
        data.class_is_last()

        remove = filters.Filter(
            classname="weka.filters.unsupervised.attribute.Remove",
            options=["-R", "1-3"])
        cls = classifiers.Classifier(
            classname="weka.classifiers.bayes.NaiveBayes")
        fc = classifiers.FilteredClassifier()
        fc.filter = remove
        fc.classifier = cls

        evl = classifiers.Evaluation(data)
        evl.crossvalidate_model(fc, data, 10, Random(1))  # cross-validate the filtered classifier built above
        data = plot.generate_thresholdcurve_data(evl, 0)
        self.assertEqual(13,
                         data.num_attributes,
                         msg="number of attributes differs")
        self.assertEqual(769, data.num_instances, msg="number of rows differs")
        attname = "True Positives"
        self.assertIsNotNone(data.attribute_by_name(attname),
                             msg="Failed to locate attribute: " + attname)
        attname = "False Positive Rate"
        self.assertIsNotNone(data.attribute_by_name(attname),
                             msg="Failed to locate attribute: " + attname)
        attname = "Lift"
        self.assertIsNotNone(data.attribute_by_name(attname),
                             msg="Failed to locate attribute: " + attname)
Code example #17
def naivebay_classifier_weka(data):
    classifier = Classifier("weka.classifiers.bayes.NaiveBayes")
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(classifier, data, 10, Random(42))
    print(evaluation.summary())
    print(evaluation.confusion_matrix)
    return classifier  # note: cross-validation trains internal copies; this instance itself is still unbuilt
Code example #18
File: wekaExperiments.py Project: aascode/DDV
    def runCV(self, arffFile, classifier, folds):

        loader = Loader(classname="weka.core.converters.ArffLoader")
        data = loader.load_file(arffFile)
        data.class_is_last()

        classes = [str(code) for code in data.class_attribute.values]
        header = ["Accuracy"]
        for name in classes:
            header += [name + " TP", name + " FP", name + " AUC ROC"]
        values = []

        cls = Classifier(classname=classifier)

        evl = Evaluation(data)
        evl.crossvalidate_model(cls, data, folds, Random(1))

        values.append(evl.percent_correct)
        for name in classes:
            index = classes.index(name)
            values += [
                evl.true_positive_rate(index) * 100,
                evl.false_positive_rate(index) * 100,
                evl.area_under_roc(index)
            ]

        self.values = values
        self.header = header
Code example #19
def main():

    try:
        jvm.start()

        loader = Loader(classname="weka.core.converters.CSVLoader")
        data = loader.load_file("./data/adult.csv")

        data.class_is_last()  # set class attribute

        # randomize data
        folds = k  # NOTE: 'k' (the number of folds) must be defined elsewhere in the original script
        seed = 1
        rnd = Random(seed)
        rand_data = Instances.copy_instances(data)
        rand_data.randomize(rnd)
        if rand_data.class_attribute.is_nominal:
            rand_data.stratify(folds)

        NaiveBayes(rand_data, folds, seed, data)
        DecisionTree(rand_data, folds, seed, data)
    except Exception as e:
        raise e
    finally:
        jvm.stop()
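Note: NaiveBayes(...) and DecisionTree(...) are project helpers not shown here. As a sketch under that assumption, a manual cross-validation loop over the pre-randomized, stratified data typically looks like this in python-weka-wrapper3 (the classifier choice is illustrative):

from weka.classifiers import Classifier, Evaluation

def manual_cv(rand_data, folds):
    evl = Evaluation(rand_data)
    for i in range(folds):
        train = rand_data.train_cv(folds, i)  # training split of fold i
        test = rand_data.test_cv(folds, i)    # held-out split of fold i
        cls = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
        cls.build_classifier(train)
        evl.test_model(cls, test)
    print(evl.summary())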
Code example #20
File: load_data.py Project: silvianunes/hyperband
    def split_train_test_valid(self):
        try:
            self.data = self.return_data()
            total_inst = self.data.num_instances
            train_, self.test = self.data.train_test_split(80.0, Random(1))
            self.train, self.valid = train_.train_test_split(75.0, Random(1))

            print('total_inst:  ', total_inst, '| train_inst: ',
                  self.train.num_instances, '| valid_inst: ',
                  self.valid.num_instances, '| test_inst: ',
                  self.test.num_instances)

        except Exception:
            pass  # note: swallowing the exception leaves self.train/valid/test unset, so the return below can fail

        return self.train, self.valid, self.test
Code example #21
def main():

    dataset = sys.argv[1]
    #load a dataset
    loader = Loader("weka.core.converters.ArffLoader")
    data = loader.load_file("./data/" + dataset + ".arff")
    data.class_is_last()

    num_classes = data.class_attribute.num_values

    os.mkdir('resultados_' + sys.argv[1])
    for random_cv in range(10):  #10 CV

        # generate train/test split of randomized data
        train, test = data.train_test_split(75.0, Random(random_cv))
        results_train, results_test = classification(data, train, test,
                                                     num_classes)
        #        results_test = classification(test, num_classes)

        # Write results as CSV files (np.str was removed from NumPy; plain str is equivalent)
        train_name = "./resultados_" + sys.argv[1] + "/resultados_" + sys.argv[
            1] + "_" + "E" + str(random_cv) + ".csv"
        test_name = "./resultados_" + sys.argv[1] + "/resultados_" + sys.argv[
            1] + "_" + "T" + str(random_cv) + ".csv"

        results_train.to_csv(train_name)
        results_test.to_csv(test_name)
Code example #22
File: ANN.py Project: EskerOn/AIProject
def fitness(toeval: Individual):
    # 'remove' (a Filter) and 'data' (Instances) are module-level globals in the original project
    cls = Classifier(classname="weka.classifiers.functions.MultilayerPerceptron", options=toeval.settings())
    fc = FilteredClassifier()
    fc.filter = remove
    fc.classifier = cls
    evl = Evaluation(data)
    evl.crossvalidate_model(fc, data, 10, Random(1))
    return evl.percent_correct
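Note: a hedged sketch of how the 'remove' and 'data' globals that fitness relies on might be set up; the file name and filter range are assumptions for illustration:

from weka.core.converters import Loader
from weka.filters import Filter

loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file("train.arff")
data.class_is_last()
remove = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                options=["-R", "1"])  # e.g. drop an ID column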
Code example #23
def f_smote():
    jvm.start()

    train_data, test_data = b_i_impute_data()

    train_data = train_data[:10000]
    y_train = train_data["class"]
    x_train = train_data.drop("class", axis=1)

    # imbalanced-learn renamed ratio -> sampling_strategy and fit_sample -> fit_resample in 0.4
    sm = SMOTE(sampling_strategy="minority")
    x_train_sm, y_train_sm = sm.fit_resample(x_train, y_train)

    x_train_sm_df = pd.DataFrame(x_train_sm, columns=x_train.columns)
    y_train_sm_df = pd.DataFrame(y_train_sm, columns=["class"])
    train_data_sm_df = pd.concat([y_train_sm_df, x_train_sm_df], axis=1)
    print_f("smote train data shape", train_data_sm_df.shape)
    train_data_sm_df.to_csv("./train_data_sm.csv", index=False)

    train_data_sm = converters.load_any_file("train_data_sm.csv")
    train_data_sm.class_is_first()

    test_data = converters.load_any_file("test_data.csv")
    test_data.class_is_first()

    print_f("1")
    cls = Classifier(classname="weka.classifiers.trees.LMT")
    print_f("bulding classifier")
    cls.build_classifier(train_data_sm)
    print_f("Evaluating")
    evl = Evaluation(train_data_sm)

    evl.crossvalidate_model(cls, train_data_sm, 5, Random(1))
    print_f("Train Accuracy:", evl.percent_correct)
    print_f("Train summary")
    print_f(evl.summary())
    print_f("Train class details")
    print_f(evl.class_details())
    print_f("Train confusion matrix")
    print_f(evl.confusion_matrix)
    plcls.plot_roc(evl,
                   class_index=[0, 1],
                   wait=True,
                   outfile="./plots/2_f_smote_10k.png")
    plt.suptitle("Train ROC Curve", fontsize=20, y=0.95)

    evl = Evaluation(test_data)
    print_f("testing model")
    evl.test_model(cls, test_data)
    print_f("Test Accuracy:", evl.percent_correct)
    print_f("Test summary")
    print_f(evl.summary())
    print_f(" Testclass details")
    print_f(evl.class_details())
    print_f("Testconfusion matrix")
    print_f(evl.confusion_matrix)
    plcls.plot_roc(evl, class_index=[0, 1], wait=True)
    plt.suptitle("Test ROC Curve", fontsize=20, y=0.95)
    savefig("./plots/f_test_roc_curve.png")
Code example #24
def run_bayesNet(file):
    # Get filename from Pathlib object
    filename = file.parts[-1]
    out_dir = file.parents[0]  # 'out_dir' rather than 'dir', to avoid shadowing the builtin

    print("Running BayesNet on %s" % filename)

    if not filename.endswith(".arff"):
        print("%s not ARFF file." % filename)
        return

    # Removes '.arff' from filename
    filename_base = filename[:-5]

    # Load data with class as first attr
    data = load_Arff_file(file)
    data.class_is_first()

    # Use BayesNet and set options
    cls = Classifier(classname="weka.classifiers.bayes.BayesNet",
                     options=[
                         "-D", "-Q",
                         "weka.classifiers.bayes.net.search.local.TAN", "--",
                         "-P", "1", "-S", "BAYES", "-E",
                         "weka.classifiers.bayes.net.estimate.SimpleEstimator",
                         "--", "-A", "0.5"
                     ])

    # Predictions stored in pout
    pout = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText")

    # Evaluate data
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(cls, data, 10, Random(1), output=pout)

    print(evaluation.summary())
    print(evaluation.class_details())
    print(evaluation.confusion_matrix)

    # Generate grid for ROC
    # plcls.plot_roc(evaluation, class_index=[0,1], wait=True)

    # mk dirs for output
    out_dir = out_dir / "bayesNet_results"
    out_dir.mkdir(parents=True, exist_ok=True)

    # Save summary, class details and confusion matrix to file
    result_output = filename_base + "_bayesNet_eval_results_TAN.txt"
    output_eval(evaluation, out_dir / result_output)

    # Save the predicited results to file
    prediction_output = filename_base + "_bayesNet_pred_results_TAN.txt"
    output_pred(pout, out_dir / prediction_output)

    print("BayesNet complete")
Code example #25
    def get_weka_training_data(self):
        percentage_of_train_set = 100 - self.test_size * 100
        loader = Loader(classname="weka.core.converters.CSVLoader")
        dataset = loader.load_file(os.path.join(constants.BASE_DIR, constants.BREAST_CANCER_FILE_NAME))
        dataset.class_is_last()
        train_set, test_set = dataset.train_test_split(percentage_of_train_set, Random(1))

        return {
            'train_set': train_set,
            'test_set': test_set,
            'labels': dataset.class_attribute.values
        }
Code example #26
def main(args):
    """
    Loads a dataset, shuffles it, splits it into train/test set. Trains J48 with training set and
    evaluates the built model on the test set.
    The predictions get recorded in two different ways:
    1. in-memory via the test_model method
    2. directly to file (more memory efficient), but a separate run of making predictions

    :param args: the commandline arguments (optional, can be dataset filename)
    :type args: list
    """

    # load a dataset
    if len(args) <= 1:
        data_file = helper.get_data_dir() + os.sep + "vote.arff"
    else:
        data_file = args[1]
    helper.print_info("Loading dataset: " + data_file)
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(data_file)
    data.class_is_last()

    # generate train/test split of randomized data
    train, test = data.train_test_split(66.0, Random(1))

    # build classifier
    cls = Classifier(classname="weka.classifiers.trees.J48")
    cls.build_classifier(train)
    print(cls)

    # evaluate and record predictions in memory
    helper.print_title("recording predictions in-memory")
    output = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.CSV",
        options=["-distribution"])
    evl = Evaluation(train)
    evl.test_model(cls, test, output=output)
    print(evl.summary())
    helper.print_info("Predictions:")
    print(output.buffer_content())

    # record/output predictions separately
    helper.print_title("recording/outputting predictions separately")
    outputfile = helper.get_tmp_dir() + "/j48_vote.csv"
    output = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.CSV",
        options=["-distribution", "-suppress", "-file", outputfile])
    output.header = test
    output.print_all(cls, test)
    helper.print_info("Predictions stored in:" + outputfile)
    # by using "-suppress" we don't store the output in memory, the following statement won't output anything
    print(output.buffer_content())
Code example #27
def main():
    """
    Just runs some example code.
    """

    # generic JavaObject stuff
    helper.print_title("Generic stuff using weka.core.SystemInfo")
    info = JavaObject(JavaObject.new_instance("weka.core.SystemInfo"))
    jwrapper = info.jwrapper
    print("toString() method:")
    print(jwrapper.toString())

    # random
    helper.print_title("Random")
    rnd = Random(1)
    for i in range(10):
        print(rnd.next_double())
    for i in range(10):
        print(rnd.next_int(100))

    # single index
    helper.print_title("SingleIndex")
    si = SingleIndex(index="first")
    upper = 10
    si.upper(upper)
    print(str(si) + " (upper=" + str(upper) + ")\n -> " + str(si.index()))
    si.single_index = "3"
    si.upper(upper)
    print(str(si) + " (upper=" + str(upper) + ")\n -> " + str(si.index()))
    si.single_index = "last"
    si.upper(upper)
    print(str(si) + " (upper=" + str(upper) + ")\n -> " + str(si.index()))

    # range
    helper.print_title("Range")
    rng = Range(ranges="first")
    upper = 10
    invert = False
    rng.upper(upper)
    rng.invert = invert
    print(str(rng.ranges) + " (upper=" + str(upper) + ", invert=" + str(invert) + ")\n -> " + str(rng.selection()))
    rng.ranges = "3"
    rng.upper(upper)
    rng.invert = invert
    print(str(rng.ranges) + " (upper=" + str(upper) + ", invert=" + str(invert) + ")\n -> " + str(rng.selection()))
    rng.ranges = "last"
    rng.upper(upper)
    rng.invert = invert
    print(str(rng.ranges) + " (upper=" + str(upper) + ", invert=" + str(invert) + ")\n -> " + str(rng.selection()))
    rng.ranges = "first-last"
    rng.upper(upper)
    rng.invert = invert
    print(str(rng.ranges) + " (upper=" + str(upper) + ", invert=" + str(invert) + ")\n -> " + str(rng.selection()))
    rng.ranges = "3,4,7-last"
    rng.upper(upper)
    rng.invert = invert
    print(str(rng.ranges) + " (upper=" + str(upper) + ", invert=" + str(invert) + ")\n -> " + str(rng.selection()))
    rng.ranges = "3,4,7-last"
    rng.upper(upper)
    invert = True
    rng.invert = invert
    print(str(rng.ranges) + " (upper=" + str(upper) + ", invert=" + str(invert) + ")\n -> " + str(rng.selection()))

    # tag
    helper.print_title("Tag")
    tag = Tag(ident=1, ident_str="one")
    print("tag=" + str(tag) + ", ident=" + str(tag.ident) + ", readable=" + tag.readable)
    tag.ident = 3
    print("tag=" + str(tag) + ", ident=" + str(tag.ident) + ", readable=" + tag.readable)
    tag = Tag(ident=2, ident_str="two", readable="2nd tag")
    print("tag=" + str(tag) + ", ident=" + str(tag.ident) + ", readable=" + tag.readable)