def f_smote():
    jvm.start()

    train_data, test_data = b_i_impute_data()

    train_data = train_data[:10000]
    y_train = train_data["class"]
    x_train = train_data.drop("class", axis=1)

    # imbalanced-learn renamed ratio -> sampling_strategy and
    # fit_sample -> fit_resample in recent releases
    sm = SMOTE(sampling_strategy="minority")
    x_train_sm, y_train_sm = sm.fit_resample(x_train, y_train)

    x_train_sm_df = pd.DataFrame(x_train_sm, columns=x_train.columns)
    y_train_sm_df = pd.DataFrame(y_train_sm, columns=["class"])
    train_data_sm_df = pd.concat([y_train_sm_df, x_train_sm_df], axis=1)
    print_f("smote train data shape", train_data_sm_df.shape)
    train_data_sm_df.to_csv("./train_data_sm.csv", index=False)

    train_data_sm = converters.load_any_file("train_data_sm.csv")
    train_data_sm.class_is_first()

    test_data = converters.load_any_file("test_data.csv")
    test_data.class_is_first()

    print_f("1")
    cls = Classifier(classname="weka.classifiers.trees.LMT")
    print_f("bulding classifier")
    cls.build_classifier(train_data_sm)
    print_f("Evaluating")
    evl = Evaluation(train_data_sm)

    evl.crossvalidate_model(cls, train_data_sm, 5, Random(1))
    print_f("Train Accuracy:", evl.percent_correct)
    print_f("Train summary")
    print_f(evl.summary())
    print_f("Train class details")
    print_f(evl.class_details())
    print_f("Train confusion matrix")
    print_f(evl.confusion_matrix)
    plcls.plot_roc(evl,
                   class_index=[0, 1],
                   wait=True,
                   outfile="./plots/2_f_smote_10k.png")
    plt.suptitle("Train ROC Curve", fontsize=20, y=0.95)

    evl = Evaluation(test_data)
    print_f("testing model")
    evl.test_model(cls, test_data)
    print_f("Test Accuracy:", evl.percent_correct)
    print_f("Test summary")
    print_f(evl.summary())
    print_f(" Testclass details")
    print_f(evl.class_details())
    print_f("Testconfusion matrix")
    print_f(evl.confusion_matrix)
    plcls.plot_roc(evl, class_index=[0, 1], wait=True)
    plt.suptitle("Test ROC Curve", fontsize=20, y=0.95)
    savefig("./plots/f_test_roc_curve.png")
Example #2
    def run_naive_bayes_crossval(self, output_directory):
        # build classifier
        print("\nBuilding Classifier on training data.")
        buildTimeStart = time.time()
        cls = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
        cls.build_classifier(self.training_data)

        resultsString = ""
        resultsString = self.print_both(str(cls), resultsString)

        buildTimeString = "NB Cross Eval Classifier Built in " + str(
            time.time() - buildTimeStart) + " secs.\n"
        resultsString = self.print_both(buildTimeString, resultsString)

        #Evaluate Classifier
        resultsString = self.print_both("\nCross Evaluating on test data.",
                                        resultsString)

        buildTimeStart = time.time()
        evl = Evaluation(self.training_data)
        evl.crossvalidate_model(cls, self.training_data, 10, Random(1))

        resultsString = self.print_both(str(evl.summary()), resultsString)
        resultsString = self.print_both(str(evl.class_details()),
                                        resultsString)
        resultsString = self.print_both(str(evl.confusion_matrix),
                                        resultsString)
        buildTimeString = "\nNB Cross Eval Classifier Evaluated in " + str(
            time.time() - buildTimeStart) + " secs.\n"
        resultsString = self.print_both(buildTimeString, resultsString)

        #Save Results and Cleanup
        self.save_results("Naive_Bayes_Crossval", resultsString,
                          output_directory)
def Boost_J48(data, rnm):
    data.class_is_last()
    fc1 = FilteredClassifier()
    fc1.classifier = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.25", "-M", "2"])
    fc1.filter = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "first"])
    fc2 = SingleClassifierEnhancer(classname="weka.classifiers.meta.AdaBoostM1", options=["-P", "100", "-S", "1", "-I", "10"])
    fc2.classifier = fc1
    pred_output = PredictionOutput(classname="weka.classifiers.evaluation.output.prediction.CSV", options=["-p", "1"])
    folds = 10
    fc2.build_classifier(data)
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(fc2, data, folds, Random(1), pred_output)
    f0 = open(rnm + '_Boost_J48_Tree.txt', 'w')
    print("Filename: ", rnm, file=f0)
    print('\n\n', file=f0)
    print(str(fc2), file=f0)
    f0.close()
    f1 = open(rnm + '_Boost_J48_Prediction.txt', 'w')
    print('Filename:', rnm, file=f1)
    print('Prediction Summary:', pred_output.buffer_content(), file=f1)
    f1.close()
    f2 = open(rnm + '_Boost_j48_Evaluation.txt', 'w')
    print('Filename:', rnm, file=f2)
    print('Evaluation Summary:', evaluation.summary(), file=f2)
    print('\n\n\n', file=f2)
    print(evaluation.class_details(), file=f2)
    f2.close()
    plot_roc(evaluation, class_index=[0,1], title=rnm, key_loc='best', outfile=rnm + '_Boost_J48_ROC.png', wait=False)
    value_Boost_J48 = str(evaluation.percent_correct)
    return value_Boost_J48
def RandomTree(data, rnm):
    data.class_is_last()
    fc = FilteredClassifier()
    fc.classifier = Classifier(classname="weka.classifiers.trees.RandomTree", options=["-K", "0", "-M", "1.0", "-V", "0.001", "-S", "1"])
    fc.filter = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "first"])
    pred_output = PredictionOutput(classname="weka.classifiers.evaluation.output.prediction.CSV", options=["-p", "1"])
    folds = 10
    evl = Evaluation(data)
    evl.crossvalidate_model(fc, data, folds, Random(1), pred_output)
    fc.build_classifier(data)
    f0 = open(rnm + '_RT_Tree.txt', 'w')
    print("Filename: ", rnm, file=f0)
    print('\n\n', file=f0)
    print(str(fc), file=f0)
    f0.close()
    f1 = open(rnm + '_RT_Prediction.txt', 'w')
    print('Filename:', rnm, file=f1)
    print('Prediction Summary:', pred_output.buffer_content(), file=f1)
    f1.close()
    f2 = open(rnm + '_RT_Evaluation.txt', 'w')
    print('Filename:', rnm, file=f2)
    print('Evaluation Summary:', evl.summary(), file=f2)
    print('\n\n\n', file=f2)
    print(evl.class_details(), file=f2)
    f2.close()
    plot_roc(evl, class_index=[0,1], title=rnm, key_loc='best', outfile=rnm+'_RT_ROC.png', wait=False)
    value_RT = str(evl.percent_correct)
    return value_RT
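A hypothetical driver for the two tree examples above; the ARFF path and the "mydata" result-name prefix are assumptions (both functions plot their ROC curves with wait=False, so the driver does not block):

import weka.core.jvm as jvm
from weka.core.converters import load_any_file

jvm.start(packages=True)
try:
    data = load_any_file("mydata.arff")
    acc_boost = Boost_J48(data, "mydata")
    acc_rt = RandomTree(data, "mydata")
    print("Boosted J48: %s%%, RandomTree: %s%%" % (acc_boost, acc_rt))
finally:
    jvm.stop()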
Example #5
def runSMO(file, bound):
    loader = Loader(classname="weka.core.converters.CSVLoader")
    data = loader.load_file(file)
    data.class_is_first()

    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                    options=["-R", bound])

    cls = KernelClassifier(
        classname="weka.classifiers.functions.SMO",
        options=["-C", "1.0", "-L", "0.001", "-P", "1.0E-12", "-N", "0"])
    kernel = Kernel(
        classname="weka.classifiers.functions.supportVector.PolyKernel",
        options=["-C", "250007", "-E", "1.0"])
    cls.kernel = kernel
    pout = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText")

    remove.inputformat(data)
    filtered = remove.filter(data)

    evl = Evaluation(filtered)
    evl.crossvalidate_model(cls, filtered, 10, Random(1), pout)

    #print(pout.buffer_content())

    print(evl.percent_correct)
    #print(evl.summary())

    result = evl.class_details()
    print(result)
    return result
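The bound argument goes straight into the Remove filter's -R option, so it accepts any Weka range string; a hypothetical call (the file name is an assumption):

result = runSMO("features.csv", "2-4")  # drop attributes 2 through 4 before training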
Example #6
def run_bayesNet(file):
    # Get filename from Pathlib object
    filename = file.parts[-1]
    dir = file.parents[0]

    print("Running BayesNet on %s" % filename)

    if not filename.endswith(".arff"):
        print("%s not ARFF file." % filename)
        return

    # Removes '.arff' from filename
    filename_base = filename[:-5]

    # Load data with class as first attr
    data = load_Arff_file(file)
    data.class_is_first()

    # Use BayesNet and set options
    cls = Classifier(classname="weka.classifiers.bayes.BayesNet",
                     options=[
                         "-D", "-Q",
                         "weka.classifiers.bayes.net.search.local.TAN", "--",
                         "-P", "1", "-S", "BAYES", "-E",
                         "weka.classifiers.bayes.net.estimate.SimpleEstimator",
                         "--", "-A", "0.5"
                     ])

    # Predictions stored in pout
    pout = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText")

    # Evaluate data
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(cls, data, 10, Random(1), output=pout)

    print(evaluation.summary())
    print(evaluation.class_details())
    print(evaluation.confusion_matrix)

    # Generate grid for ROC
    # plcls.plot_roc(evaluation, class_index=[0,1], wait=True)

    # mk dirs for output
    dir = dir / "bayesNet_results"
    dir.mkdir(parents=True, exist_ok=True)

    # Save summary, class details and confusion matrix to file
    result_output = filename_base + "_bayesNet_eval_results_TAN.txt"
    output_eval(evaluation, dir / result_output)

    # Save the predicted results to file
    prediction_output = filename_base + "_bayesNet_pred_results_TAN.txt"
    output_pred(pout, dir / prediction_output)

    print("BayesNet complete")
def e_model_tree():
    # train_data, test_data = b_i_impute_data()
    # train_data.to_csv("./train_data.csv", index=False)
    # test_data.to_csv("./test_data.csv",index=False)

    jvm.start()
    train_data = converters.load_any_file("train_data.csv")
    train_data.class_is_first()

    test_data = converters.load_any_file("test_data.csv")
    test_data.class_is_first()

    print("1")
    cls = Classifier(classname="weka.classifiers.trees.LMT")
    print("2")
    cls.build_classifier(train_data)

    print("3")
    evl = Evaluation(train_data)
    evl.crossvalidate_model(cls, train_data, 5, Random(1))
    print("Train Accuracy:", evl.percent_correct)
    print("Train summary")
    print(evl.summary())
    print("Train class details")
    print(evl.class_details())
    print("Train confusion matrix")
    print(evl.confusion_matrix)
    plcls.plot_roc(evl, class_index=[0, 1], wait=True)
    plt.suptitle("Train ROC Curve", fontsize=20, y=0.95)
    savefig("./plots/e_train_roc_curve.png")

    evl = Evaluation(test_data)
    evl.test_model(cls, test_data)
    print("Test Accuracy:", evl.percent_correct)
    print("Test summary")
    print(evl.summary())
    print(" Testclass details")
    print(evl.class_details())
    print("Testconfusion matrix")
    print(evl.confusion_matrix)
    plcls.plot_roc(evl, class_index=[0, 1], wait=True)
    plt.suptitle("Test ROC Curve", fontsize=20, y=0.95)
    savefig("./plots/e_test_roc_curve.png")
Example #8
def obtainSVM(file):
    data = converters.load_any_file(folderPathOfArffFiles + file + ".arff")
    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                    options=["-R", "1-2"])
    remove.inputformat(data)
    data = remove.filter(data)
    data.class_is_last()

    classifier = Classifier(classname="weka.classifiers.functions.LibSVM")
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(classifier, data, kFold, Random(42))

    # Read the ROC area via the evaluation API instead of slicing the
    # class-details string at fixed offsets, which breaks whenever the
    # report layout changes.
    roc_area = evaluation.weighted_area_under_roc

    return roc_area
    def crossValidate(self, arrfFile = None, classname="weka.classifiers.trees.J48", options=["-C", "0.3"]):
        
        if arrfFile is not None:
            self.initData( arrfFile )
            
        if self.data is None:
            return 

        print('Classifier ' + str(classname) + ' ' + ' '.join(options))
        cls = Classifier(classname=classname, options=options)
        
        evl = Evaluation(self.data)
        evl.crossvalidate_model(cls, self.data, 10, Random(1))

        print(evl.percent_correct)
        print(evl.summary())
        print(evl.class_details())
Example #10
    def run_bayes_hill_split(self, output_directory, parents=1):
        # build classifier
        print("\nBuilding Bayes Classifier on training data. Parents = " +
              str(parents) + "\n")
        buildTimeStart = time.time()
        cls = Classifier(
            classname="weka.classifiers.bayes.BayesNet",
            options=[
                "-D", "-Q",
                "weka.classifiers.bayes.net.search.local.HillClimber", "--",
                "-P", "" + str(parents), "-S", "BAYES", "-E",
                "weka.classifiers.bayes.net.estimate.SimpleEstimator", "--",
                "-A", "0.5"
            ])
        cls.build_classifier(self.training_data)

        resultsString = ""
        resultsString = self.print_both(str(cls), resultsString)

        buildTimeString = "Bayes Split Classifier Built in " + str(
            time.time() - buildTimeStart) + " secs.\n"
        resultsString = self.print_both(buildTimeString, resultsString)

        #Evaluate Classifier
        resultsString = self.print_both("\nEvaluating on test data.",
                                        resultsString)

        buildTimeStart = time.time()
        evl = Evaluation(self.training_data)
        evl.test_model(cls, self.testing_data)

        resultsString = self.print_both(str(evl.summary()), resultsString)
        resultsString = self.print_both(str(evl.class_details()),
                                        resultsString)
        resultsString = self.print_both(str(evl.confusion_matrix),
                                        resultsString)
        buildTimeString = "\nBayes Split Classifier Evaluated in " + str(
            time.time() - buildTimeStart) + " secs.\n"
        resultsString = self.print_both(buildTimeString, resultsString)

        #Save Results and Cleanup
        self.save_results("Bayes_Hill_P" + str(parents) + "_", resultsString,
                          output_directory)
        self.save_results("Bayes_Hill_P" + str(parents) + "_Graph", cls.graph,
                          output_directory, True)
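The graph string saved above can also be rendered directly; a minimal sketch using python-weka-wrapper's plotting helper (graph-plotting dependencies such as pygraphviz must be installed):

import weka.plot.graph as plot_graph

plot_graph.plot_dot_graph(cls.graph)  # render the learned network structure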
    def cross_validate(self, detail = True):
        """Perform cross validation using trained data. 
        
        Parameters
        ----------
        detail : boolean, optional, default = True
            If true return a detailed information of cross validation.
            
        Returns
        -------
        info : string
            Info with results of cross validation.
        """
        
        #print 'cross_validation'
        
        start_time = TimeUtils.get_time()
        
        info =  "Scheme:\t%s %s\n" % (str(self.classifier.classname) , " ".join([str(option) for option in self.classifier.options]))
        
        if detail:
            info += "Relation:\t%s\n" % (self.data.relationname)
            info += "Instances:\t%d\n" % (self.data.num_instances)
            info += "Attributes:\t%d\n\n" % (self.data.num_attributes)
        
        evl = WEvaluation(self.data)
        evl.crossvalidate_model(self.classifier, self.data, 10, WRandom(1))
        
        if not detail:
            info += "Correctly Classified Instances: %0.4f%%\n" % (evl.percent_correct)

        info += "Time taken to build model: %0.5f seconds\n\n" % (TimeUtils.get_time() - start_time)
        #info += str(evl.percent_correct) + "\n\n"
        
        if detail:
            info += "=== Stratified cross-validation ===\n"
            info += evl.summary() + "\n\n"
            
            info += str(evl.class_details()) + "\n\n"
            
            classes = [str(self.data.class_attribute.value(i)) for i in range(0, self.data.class_attribute.num_values)]
            cm = evl.confusion_matrix
            info += Classifier.confusion_matrix(classes, cm)

        return info
Example #12
    def run_crossval(self, output_directory, classifier_name,
                     classifier_weka_spec, options_list):
        # build classifier
        print("\nBuilding " + classifier_name +
              " Classifier on training data.")
        buildTimeStart = time.time()
        cls = Classifier(classname=classifier_weka_spec, options=options_list)
        cls.build_classifier(self.training_data)

        resultsString = ""
        resultsString = self.print_both(str(cls), resultsString)

        buildTimeString = classifier_name + " Cross Eval Classifier Built in " + str(
            time.time() - buildTimeStart) + " secs.\n"
        resultsString = self.print_both(buildTimeString, resultsString)

        #Evaluate Classifier
        resultsString = self.print_both("\nCross Evaluating on test data.",
                                        resultsString)

        buildTimeStart = time.time()
        evl = Evaluation(self.training_data)
        evl.crossvalidate_model(cls, self.training_data, 10, Random(1))

        resultsString = self.print_both(str(evl.summary()), resultsString)
        resultsString += "\n"
        resultsString = self.print_both(str(evl.class_details()),
                                        resultsString)
        resultsString += "\n"
        resultsString = self.print_both(str(evl.confusion_matrix),
                                        resultsString)
        buildTimeString = "\n\n" + classifier_name + " Cross Eval Classifier Evaluated in " + str(
            time.time() - buildTimeStart) + " secs.\n"
        resultsString = self.print_both(buildTimeString, resultsString)

        options_string = ""
        for option in options_list:
            options_string = options_string + str(option)

        options_string = options_string.replace(".", "-")
        options_string = options_string.replace("-", "_")
        #Save Results and Cleanup
        self.save_results(classifier_name + options_string + "_Crossval",
                          resultsString, output_directory)
Example #13
    def crossValidate(self,
                      arrfFile=None,
                      classname="weka.classifiers.trees.J48",
                      options=["-C", "0.3"]):

        if arrfFile is not None:
            self.initData(arrfFile)

        if self.data is None:
            return

        print('Classifier ' + str(classname) + ' ' + ' '.join(options))
        cls = Classifier(classname=classname, options=options)

        evl = Evaluation(self.data)
        evl.crossvalidate_model(cls, self.data, 10, Random(1))

        print(evl.percent_correct)
        print(evl.summary())
        print(evl.class_details())
Example #14
def runBayes(file, bound):
    loader = Loader(classname="weka.core.converters.CSVLoader")
    data = loader.load_file(file)
    data.class_is_first()

    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                    options=["-R", bound])
    cls = Classifier(classname="weka.classifiers.bayes.NaiveBayes")

    remove.inputformat(data)
    filtered = remove.filter(data)

    evl = Evaluation(filtered)
    evl.crossvalidate_model(cls, filtered, 10, Random(1))

    print(evl.percent_correct)
    #print(evl.summary())
    result = evl.class_details()
    print(result)
    return result
def TrainingModel(arff, modelOutput, clsfier):
    # Start the Java virtual machine
    jvm.start()
    # Load the training set
    loader = Loader(classname="weka.core.converters.ArffLoader")
    train = loader.load_file(arff)
    train.class_is_first()
    # Train with the chosen algorithm; RandomForest gave the highest TPR and TNR among the methods tried in the Weka GUI
    cls_name = "weka.classifiers." + clsfier
    clsf = Classifier(classname=cls_name)
    clsf.build_classifier(train)
    print(clsf)
    # Build the model
    fc = FilteredClassifier()
    fc.classifier = clsf
    evl = Evaluation(train)
    evl.crossvalidate_model(fc, train, 10, Random(1))
    print(evl.percent_correct)
    print(evl.summary())
    print(evl.class_details())
    print(evl.matrix())
    # Compute result statistics
    matrixResults = evl.confusion_matrix
    TN = float(matrixResults[0][0])
    FP = float(matrixResults[0][1])
    FN = float(matrixResults[1][0])
    TP = float(matrixResults[1][1])
    TPR = TP / (TP + FN)
    TNR = TN / (FP + TN)
    PPV = TP / (TP + FP)
    NPV = TN / (TN + FN)
    print("算法: " + clsfier)
    print("敏感度 TPR: " + str(TPR))
    print("特异度 TNR: " + str(TNR))
    print("PPV: " + str(PPV))
    print("NPV: " + str(NPV))
    # Save the model
    clsf.serialize(modelOutput, header=train)
    # Stop the JVM
    jvm.stop()
    print("Analysis model building complete")
Example #16
    def run_ibk_crossval(self, output_directory):
        # build classifier
        print("\nBuilding Classifier on training data.")
        buildTimeStart = time.time()
        cls = Classifier(
            classname="weka.classifiers.lazy.IBk",
            options=[
                "-K", "3", "-W", "0", "-A",
                "weka.core.neighboursearch.LinearNNSearch -A \"weka.core.EuclideanDistance -R first-last\""
            ])
        cls.build_classifier(self.training_data)

        resultsString = ""
        resultsString = self.print_both(str(cls), resultsString)

        buildTimeString = "IBK Cross Eval Classifier Built in " + str(
            time.time() - buildTimeStart) + " secs.\n"
        resultsString = self.print_both(buildTimeString, resultsString)

        #Evaluate Classifier
        resultsString = self.print_both("\nCross Evaluating on test data.",
                                        resultsString)

        buildTimeStart = time.time()
        evl = Evaluation(self.training_data)
        evl.crossvalidate_model(cls, self.training_data, 10, Random(1))

        resultsString = self.print_both(str(evl.summary()), resultsString)
        resultsString = self.print_both(str(evl.class_details()),
                                        resultsString)
        resultsString = self.print_both(str(evl.confusion_matrix),
                                        resultsString)
        buildTimeString = "\nIBK Cross Eval Classifier Evaluated in " + str(
            time.time() - buildTimeStart) + " secs.\n"
        resultsString = self.print_both(buildTimeString, resultsString)

        #Save Results and Cleanup
        self.save_results("IBK_Crossval", resultsString, output_directory)
Example #17
def obtainBayesNet(file):
    #The path of the arff extension file must be put.
    data = converters.load_any_file(folderPathOfArffFiles + file + ".arff")

    #In the case of this specific data set, the first two attributes were removed since they
    #   represent the name and ranking which are unique values that would affect the classification.
    #   Depending on the data set, certain attributes must be removed.
    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                    options=["-R", "1-2"])
    remove.inputformat(data)
    data = remove.filter(data)
    #It is specified that the class value is the last attribute.
    data.class_is_last()

    #Define the classifier to be used.
    classifier = Classifier(classname="weka.classifiers.bayes.BayesNet")
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(classifier, data, kFold, Random(42))

    #The ROC-AUC is read via the evaluation API rather than extracted from
    #   the class-details string at fixed offsets, which is brittle.
    roc_area = evaluation.weighted_area_under_roc

    return roc_area
Example #18
def train(training_dataset_path, model_cache_file_name, evaluation_is_on,
          summary_file_path):
    """Model Training function

    The function uses the WEKA machine learning library, implemented by
    python-weka-wrapper Python library. Divides the data into given
    folds, and do the training and evaluation. Trained model copied to __predictors global variable
    and also saved (together with training data set) to the model_cache_file_name file. Evaluation summary is being written to summary_file_path file.

    Args:
        :param training_dataset_path: the path of the input arff file.
        :param model_cache_file_name: base name of the file used to cache the trained model.
        :param evaluation_is_on: run evaluation after training (true / false)
        :param summary_file_path: the path of the model evaluation summary file.

    Returns:
        None
    """

    global __classifiers
    global __predictors

    training_data = converters.load_any_file(training_dataset_path)
    training_data.class_is_last()

    lines = []
    summaries = []
    summary_line = [
        'Model'.ljust(16), 'Precision'.ljust(12), 'Recall'.ljust(12),
        'F-measure'.ljust(12), 'Accuracy'.ljust(12), 'FPR'.ljust(12)
    ]
    summaries.append('\t'.join(summary_line))

    for classifier, option_str in __classifiers.items():
        option_list = re.findall(r'"(?:[^"]+)"|(?:[^ ]+)', option_str)
        option_list = [s.replace('"', '') for s in option_list]

        classifier_name = classifier.split('.')[-1]
        info_str = "Using classifier: {classifier}, options: {options}".format(
            classifier=classifier_name, options=str(option_list))
        localizer_log.msg(info_str)
        lines.append(info_str)

        # Train
        cls = Classifier(classname=classifier, options=option_list)
        localizer_log.msg("Start building classifier")
        cls.build_classifier(training_data)
        localizer_log.msg("Completed building classifier")
        localizer_log.msg("Saving trained model to {model_cache_name}".format(
            model_cache_name=model_cache_file_name))

        # localizer_config.save_model(cls, training_data, model_cache_file_name)
        path = os.path.join('caches', 'model')
        if not os.path.exists(path):
            os.makedirs(path, exist_ok=True)
        path = os.path.join(path, model_cache_file_name + '.cache')
        cls.serialize(path)
        localizer_log.msg("Trained model saved")

        classifier2, _ = Classifier.deserialize(path)
        print(classifier2)

        __predictors[classifier_name] = cls

        if evaluation_is_on:

            # Model evaluation
            localizer_log.msg("Initializing evaluation")
            evl = Evaluation(training_data)
            localizer_log.msg("Evaluation initialized")

            localizer_log.msg("Start cross-validating classifier")
            evl.crossvalidate_model(cls, training_data, 10, Random(1))
            localizer_log.msg("Complete cross-validating classifier")

            # print(evl.percent_correct)
            # print(evl.summary())
            # print(evl.class_details())

            lines.append(evl.summary())
            lines.append(evl.class_details())

            summary_line = []
            summary_line.append(classifier_name.ljust(16))
            summary_line.append("{:.3f}".format(evl.weighted_precision *
                                                100).ljust(12))
            summary_line.append("{:.3f}".format(evl.weighted_recall *
                                                100).ljust(12))
            summary_line.append("{:.3f}".format(evl.weighted_f_measure *
                                                100).ljust(12))
            summary_line.append("{:.3f}".format(evl.percent_correct).ljust(12))
            summary_line.append("{:.3f}".format(
                evl.weighted_false_positive_rate * 100).ljust(12))
            summaries.append('\t'.join(summary_line))

            # Save evaluation summary to file
            with open(summary_file_path, 'w') as f:
                f.writelines('\n'.join(lines))
                f.writelines('\n' * 5)
                f.writelines('\n'.join(summaries))
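Once cached, a model can be loaded back for prediction; a minimal sketch, assuming the same cache path convention as in train() above and some loaded dataset data with its class attribute set:

from weka.classifiers import Classifier

cls, _ = Classifier.deserialize(path)  # path as built in train() above
for inst in data:
    print(cls.classify_instance(inst), cls.distribution_for_instance(inst))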
Example #19
def run_multilayerPercepton(file, file2=None):
    # Get filename from Pathlib object
    filename = file.parts[-1]
    dir = file.parents[0]

    print("Running Multilayer Percepton on %s" % filename)

    if not filename.endswith(".arff"):
        print("%s not ARFF file." % filename)
        return

    # Removes '.arff' from filename
    filename_base = filename[:-5]

    print("loading data...")
    # Load data with class as first attr
    data = load_Arff_file(file)
    data.class_is_first()

    # If a 2nd file is given, load it as the test set; the evaluation below
    # requires it, so bail out early when it is missing
    if file2:
        print("Loading test...")
        test = load_Arff_file(file2)
        test.class_is_first()
    else:
        print("No test file supplied; nothing to evaluate against.")
        return

    file_names = [
        "MP_N-500_default_H-1",
        "MP_N-500_H-3",
        "MP_N-500_H-5",
        "MP_N-500_H-7",
        "MP_N-500_H-3-5",
        "MP_N-500_H-5-3",
        "MP_N-500_H-3-5-7",
        "MP_N-500_H-7-3-5",
        "MP_N-500_H-5-7-3",
        "MP_N-500_L-01",
        "MP_N-500_L-02",
        "MP_N-500_L-04",
        "MP_N-500_L-05",
        "MP_N-500_M-01",
        "MP_N-500_M-03",
        "MP_N-500_M-04",
        "MP_N-500_M-05",
        "MP_N-500_E-5",
        "MP_N-500_E-10",
        "MP_N-500_E-15",
        "MP_N-500_E-25",
    ]

    options_list = [
        [
            "-L", "0.3", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E",
            "20", "-H", "1"
        ],  # DEFAULT
        [
            "-L", "0.3", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E",
            "20", "-H", "3"
        ],  # -H START
        [
            "-L", "0.3", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E",
            "20", "-H", "5"
        ],
        [
            "-L", "0.3", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E",
            "20", "-H", "7"
        ],
        [
            "-L", "0.3", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E",
            "20", "-H", "3, 5"
        ],
        [
            "-L", "0.3", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E",
            "20", "-H", "5, 3"
        ],
        [
            "-L", "0.3", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E",
            "20", "-H", "3, 5, 7"
        ],
        [
            "-L", "0.3", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E",
            "20", "-H", "7, 3, 5"
        ],
        [
            "-L", "0.3", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E",
            "20", "-H", "5, 7, 3"
        ],  # -H END
        [
            "-L", "0.1", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E",
            "20", "-H", "1"
        ],  # -L START
        [
            "-L", "0.2", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E",
            "20", "-H", "1"
        ],
        [
            "-L", "0.4", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E",
            "20", "-H", "1"
        ],
        [
            "-L", "0.5", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E",
            "20", "-H", "1"
        ],  # -L END
        [
            "-L", "0.3", "-M", "0.1", "-N", "500", "-V", "0", "-S", "0", "-E",
            "20", "-H", "1"
        ],  # -M START
        [
            "-L", "0.3", "-M", "0.3", "-N", "500", "-V", "0", "-S", "0", "-E",
            "20", "-H", "1"
        ],
        [
            "-L", "0.3", "-M", "0.4", "-N", "500", "-V", "0", "-S", "0", "-E",
            "20", "-H", "1"
        ],
        [
            "-L", "0.3", "-M", "0.5", "-N", "500", "-V", "0", "-S", "0", "-E",
            "20", "-H", "1"
        ],  # -M END
        [
            "-L", "0.3", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E",
            "5", "-H", "1"
        ],  # -E START
        [
            "-L", "0.3", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E",
            "10", "-H", "1"
        ],
        [
            "-L", "0.3", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E",
            "15", "-H", "1"
        ],
        [
            "-L", "0.3", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E",
            "25", "-H", "1"
        ],  # -E END
    ]

    for i in range(len(options_list)):
        start = time.time()
        print("Beginning iteration " + str(i) + ": " + file_names[i])

        # Use MultilayerPerceptron and set options
        cls = Classifier(
            classname="weka.classifiers.functions.MultilayerPerceptron",
            options=options_list[i])
        # Build classifier with train data
        cls.build_classifier(data)

        # Predictions stored in pout
        pout = PredictionOutput(
            classname="weka.classifiers.evaluation.output.prediction.PlainText"
        )

        # Evaluate data on test data
        evaluation = Evaluation(data)
        evaluation.test_model(cls, test, output=pout)

        print(evaluation.summary())
        print(evaluation.class_details())
        print(evaluation.confusion_matrix)

        # Generate grid for ROC
        # plcls.plot_roc(evaluation, class_index=[0,1], wait=True)

        # mk dirs for output
        tempdir = dir / "Results/" / "MP-ALL_N-500_results/" / (file_names[i] +
                                                                "_results/")
        tempdir.mkdir(parents=True, exist_ok=True)

        # Save summary, class details and confusion matrix to file
        result_output = file_names[i] + "_results.txt"
        print(tempdir)
        print(result_output)
        print((tempdir / result_output).absolute())
        output_eval(evaluation, tempdir / result_output)

        # Save the predicted results to file
        prediction_output = file_names[i] + "_prediction.txt"
        output_pred(pout, tempdir / prediction_output)

        end = time.time()
        timetaken = round(end - start, 2)
        print("Time taken to run iteration " + str(i) + ": %s seconds" %
              (timetaken))

    print("Multilayer Percepton complete")
from weka.classifiers import Classifier, Evaluation
from weka.core.classes import Random
from weka.core.converters import Loader

# Load the CSV with Weka's own loader; a pandas DataFrame cannot be passed
# to python-weka-wrapper classifiers. Shuffling by hand is unnecessary,
# since crossvalidate_model randomizes the data itself via Random(1).
loader = Loader(classname="weka.core.converters.CSVLoader")
data = loader.load_file("data2.csv")
data.class_is_last()

# LMT's -C is a flag (cross-validate boosting iterations), so the original
# J48-style "-C", "0.3" options would be rejected; use the defaults instead.
cls = Classifier(classname="weka.classifiers.trees.LMT")
cls.build_classifier(data)

evl = Evaluation(data)
evl.crossvalidate_model(cls, data, 10, Random(1))

print(evl.percent_correct)
print(evl.summary())
print(evl.class_details())
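If the data really does start out in NumPy/pandas, python-weka-wrapper can build a Weka dataset from matrices instead; a minimal sketch with randomly generated numeric data (names are illustrative):

import numpy as np
from weka.core.dataset import create_instances_from_matrices

x = np.random.rand(100, 4)   # feature matrix
y = np.random.rand(100)      # numeric target column
dataset = create_instances_from_matrices(x, y, name="generated from numpy")
dataset.class_is_last()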
Example #21
loader = Loader("weka.core.converters.ArffLoader")
iris_data = loader.load_file("reviewsinformation_task2.arff")
iris_data.class_is_last()
loader = Loader("weka.core.converters.ArffLoader")
iris_data = loader.load_file(iris_file)
iris_data.class_is_last()

# kernel classifier
helper.print_title("Creating SMO as KernelClassifier")
kernel = Kernel(classname="weka.classifiers.functions.supportVector.RBFKernel",
                options=["-G", "0.001"])
classifier = KernelClassifier(classname="weka.classifiers.functions.SMO",
                              options=["-M"])
classifier.kernel = kernel
classifier.build_classifier(iris_data)
print("classifier: " + classifier.to_commandline())
print("model:\n" + str(classifier))

#print("model:\n" + str(classifier))

evaluation = Evaluation('test_data.arff')
evaluation.crossvalidate_model(classifier,
                               diabetes_data,
                               10,
                               Random(42),
                               output=pred_output)
print(evaluation.summary())
print(evaluation.class_details())
print(evaluation.matrix())
def main():
    """
    Just runs some example code.
    """

    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()

    # classifier help
    helper.print_title("Creating help string")
    classifier = Classifier(classname="weka.classifiers.trees.J48")
    print(classifier.to_help())

    # partial classname
    helper.print_title("Creating classifier from partial classname")
    clsname = ".J48"
    classifier = Classifier(classname=clsname)
    print(clsname + " --> " + classifier.classname)

    # classifier from commandline
    helper.print_title("Creating SMO from command-line string")
    cmdline = 'weka.classifiers.functions.SMO -K "weka.classifiers.functions.supportVector.NormalizedPolyKernel -E 3.0"'
    classifier = from_commandline(cmdline,
                                  classname="weka.classifiers.Classifier")
    classifier.build_classifier(iris_data)
    print("input: " + cmdline)
    print("output: " + classifier.to_commandline())
    print("model:\n" + str(classifier))

    # kernel classifier
    helper.print_title("Creating SMO as KernelClassifier")
    kernel = Kernel(
        classname="weka.classifiers.functions.supportVector.RBFKernel",
        options=["-G", "0.001"])
    classifier = KernelClassifier(classname="weka.classifiers.functions.SMO",
                                  options=["-M"])
    classifier.kernel = kernel
    classifier.build_classifier(iris_data)
    print("classifier: " + classifier.to_commandline())
    print("model:\n" + str(classifier))

    # build a classifier and output model
    helper.print_title("Training J48 classifier on iris")
    classifier = Classifier(classname="weka.classifiers.trees.J48")
    # Instead of using 'options=["-C", "0.3"]' in the constructor, we can also set the "confidenceFactor"
    # property of the J48 classifier itself. However, being of type float rather than double, we need
    # to convert it to the correct type first using the double_to_float function:
    classifier.set_property("confidenceFactor", types.double_to_float(0.3))
    classifier.build_classifier(iris_data)
    print(classifier)
    print(classifier.graph)
    print(classifier.to_source("MyJ48"))
    plot_graph.plot_dot_graph(classifier.graph)

    # evaluate model on test set
    helper.print_title("Evaluating J48 classifier on iris")
    evaluation = Evaluation(iris_data)
    evl = evaluation.test_model(classifier, iris_data)
    print(evl)
    print(evaluation.summary())

    # evaluate model on train/test split
    helper.print_title("Evaluating J48 classifier on iris (random split 66%)")
    classifier = Classifier(classname="weka.classifiers.trees.J48",
                            options=["-C", "0.3"])
    evaluation = Evaluation(iris_data)
    evaluation.evaluate_train_test_split(classifier, iris_data, 66.0,
                                         Random(1))
    print(evaluation.summary())

    # load a dataset incrementally and build classifier incrementally
    helper.print_title("Build classifier incrementally on iris")
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_inc = loader.load_file(iris_file, incremental=True)
    iris_inc.class_is_last()
    classifier = Classifier(
        classname="weka.classifiers.bayes.NaiveBayesUpdateable")
    classifier.build_classifier(iris_inc)
    for inst in loader:
        classifier.update_classifier(inst)
    print(classifier)

    # construct meta-classifiers
    helper.print_title("Meta classifiers")
    # generic FilteredClassifier instantiation
    print("generic FilteredClassifier instantiation")
    meta = SingleClassifierEnhancer(
        classname="weka.classifiers.meta.FilteredClassifier")
    meta.classifier = Classifier(
        classname="weka.classifiers.functions.LinearRegression")
    flter = Filter("weka.filters.unsupervised.attribute.Remove")
    flter.options = ["-R", "first"]
    meta.set_property("filter", flter.jobject)
    print(meta.to_commandline())
    # direct FilteredClassifier instantiation
    print("direct FilteredClassifier instantiation")
    meta = FilteredClassifier()
    meta.classifier = Classifier(
        classname="weka.classifiers.functions.LinearRegression")
    flter = Filter("weka.filters.unsupervised.attribute.Remove")
    flter.options = ["-R", "first"]
    meta.filter = flter
    print(meta.to_commandline())
    # generic Vote
    print("generic Vote instantiation")
    meta = MultipleClassifiersCombiner(classname="weka.classifiers.meta.Vote")
    classifiers = [
        Classifier(classname="weka.classifiers.functions.SMO"),
        Classifier(classname="weka.classifiers.trees.J48")
    ]
    meta.classifiers = classifiers
    print(meta.to_commandline())

    # cross-validate nominal classifier
    helper.print_title("Cross-validating NaiveBayes on diabetes")
    diabetes_file = helper.get_data_dir() + os.sep + "diabetes.arff"
    helper.print_info("Loading dataset: " + diabetes_file)
    loader = Loader("weka.core.converters.ArffLoader")
    diabetes_data = loader.load_file(diabetes_file)
    diabetes_data.class_is_last()
    classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
    pred_output = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText",
        options=["-distribution"])
    evaluation = Evaluation(diabetes_data)
    evaluation.crossvalidate_model(classifier,
                                   diabetes_data,
                                   10,
                                   Random(42),
                                   output=pred_output)
    print(evaluation.summary())
    print(evaluation.class_details())
    print(evaluation.matrix())
    print("areaUnderPRC/0: " + str(evaluation.area_under_prc(0)))
    print("weightedAreaUnderPRC: " + str(evaluation.weighted_area_under_prc))
    print("areaUnderROC/1: " + str(evaluation.area_under_roc(1)))
    print("weightedAreaUnderROC: " + str(evaluation.weighted_area_under_roc))
    print("avgCost: " + str(evaluation.avg_cost))
    print("totalCost: " + str(evaluation.total_cost))
    print("confusionMatrix: " + str(evaluation.confusion_matrix))
    print("correct: " + str(evaluation.correct))
    print("pctCorrect: " + str(evaluation.percent_correct))
    print("incorrect: " + str(evaluation.incorrect))
    print("pctIncorrect: " + str(evaluation.percent_incorrect))
    print("unclassified: " + str(evaluation.unclassified))
    print("pctUnclassified: " + str(evaluation.percent_unclassified))
    print("coverageOfTestCasesByPredictedRegions: " +
          str(evaluation.coverage_of_test_cases_by_predicted_regions))
    print("sizeOfPredictedRegions: " +
          str(evaluation.size_of_predicted_regions))
    print("falseNegativeRate: " + str(evaluation.false_negative_rate(1)))
    print("weightedFalseNegativeRate: " +
          str(evaluation.weighted_false_negative_rate))
    print("numFalseNegatives: " + str(evaluation.num_false_negatives(1)))
    print("trueNegativeRate: " + str(evaluation.true_negative_rate(1)))
    print("weightedTrueNegativeRate: " +
          str(evaluation.weighted_true_negative_rate))
    print("numTrueNegatives: " + str(evaluation.num_true_negatives(1)))
    print("falsePositiveRate: " + str(evaluation.false_positive_rate(1)))
    print("weightedFalsePositiveRate: " +
          str(evaluation.weighted_false_positive_rate))
    print("numFalsePositives: " + str(evaluation.num_false_positives(1)))
    print("truePositiveRate: " + str(evaluation.true_positive_rate(1)))
    print("weightedTruePositiveRate: " +
          str(evaluation.weighted_true_positive_rate))
    print("numTruePositives: " + str(evaluation.num_true_positives(1)))
    print("fMeasure: " + str(evaluation.f_measure(1)))
    print("weightedFMeasure: " + str(evaluation.weighted_f_measure))
    print("unweightedMacroFmeasure: " +
          str(evaluation.unweighted_macro_f_measure))
    print("unweightedMicroFmeasure: " +
          str(evaluation.unweighted_micro_f_measure))
    print("precision: " + str(evaluation.precision(1)))
    print("weightedPrecision: " + str(evaluation.weighted_precision))
    print("recall: " + str(evaluation.recall(1)))
    print("weightedRecall: " + str(evaluation.weighted_recall))
    print("kappa: " + str(evaluation.kappa))
    print("KBInformation: " + str(evaluation.kb_information))
    print("KBMeanInformation: " + str(evaluation.kb_mean_information))
    print("KBRelativeInformation: " + str(evaluation.kb_relative_information))
    print("SFEntropyGain: " + str(evaluation.sf_entropy_gain))
    print("SFMeanEntropyGain: " + str(evaluation.sf_mean_entropy_gain))
    print("SFMeanPriorEntropy: " + str(evaluation.sf_mean_prior_entropy))
    print("SFMeanSchemeEntropy: " + str(evaluation.sf_mean_scheme_entropy))
    print("matthewsCorrelationCoefficient: " +
          str(evaluation.matthews_correlation_coefficient(1)))
    print("weightedMatthewsCorrelation: " +
          str(evaluation.weighted_matthews_correlation))
    print("class priors: " + str(evaluation.class_priors))
    print("numInstances: " + str(evaluation.num_instances))
    print("meanAbsoluteError: " + str(evaluation.mean_absolute_error))
    print("meanPriorAbsoluteError: " +
          str(evaluation.mean_prior_absolute_error))
    print("relativeAbsoluteError: " + str(evaluation.relative_absolute_error))
    print("rootMeanSquaredError: " + str(evaluation.root_mean_squared_error))
    print("rootMeanPriorSquaredError: " +
          str(evaluation.root_mean_prior_squared_error))
    print("rootRelativeSquaredError: " +
          str(evaluation.root_relative_squared_error))
    print("prediction output:\n" + str(pred_output))
    plot_cls.plot_roc(evaluation,
                      title="ROC diabetes",
                      class_index=range(
                          0, diabetes_data.class_attribute.num_values),
                      wait=False)
    plot_cls.plot_prc(evaluation,
                      title="PRC diabetes",
                      class_index=range(
                          0, diabetes_data.class_attribute.num_values),
                      wait=False)

    # train 2nd classifier on diabetes dataset
    classifier2 = Classifier(classname="weka.classifiers.trees.RandomForest")
    evaluation2 = Evaluation(diabetes_data)
    evaluation2.crossvalidate_model(classifier2, diabetes_data, 10, Random(42))
    plot_cls.plot_rocs({
        "NB": evaluation,
        "RF": evaluation2
    },
                       title="ROC diabetes",
                       class_index=0,
                       wait=False)
    plot_cls.plot_prcs({
        "NB": evaluation,
        "RF": evaluation2
    },
                       title="PRC diabetes",
                       class_index=0,
                       wait=False)

    # load a numeric dataset
    bolts_file = helper.get_data_dir() + os.sep + "bolts.arff"
    helper.print_info("Loading dataset: " + bolts_file)
    loader = Loader("weka.core.converters.ArffLoader")
    bolts_data = loader.load_file(bolts_file)
    bolts_data.class_is_last()

    # build a classifier and output model
    helper.print_title("Training LinearRegression on bolts")
    classifier = Classifier(
        classname="weka.classifiers.functions.LinearRegression",
        options=["-S", "1", "-C"])
    classifier.build_classifier(bolts_data)
    print(classifier)

    # cross-validate numeric classifier
    helper.print_title("Cross-validating LinearRegression on bolts")
    classifier = Classifier(
        classname="weka.classifiers.functions.LinearRegression",
        options=["-S", "1", "-C"])
    evaluation = Evaluation(bolts_data)
    evaluation.crossvalidate_model(classifier, bolts_data, 10, Random(42))
    print(evaluation.summary())
    print("correlationCoefficient: " + str(evaluation.correlation_coefficient))
    print("errorRate: " + str(evaluation.error_rate))
    helper.print_title("Header - bolts")
    print(str(evaluation.header))
    helper.print_title("Predictions on bolts")
    for index, pred in enumerate(evaluation.predictions):
        print(
            str(index + 1) + ": " + str(pred) + " -> error=" + str(pred.error))
    plot_cls.plot_classifier_errors(evaluation.predictions, wait=False)

    # train 2nd classifier and show errors in same plot
    classifier2 = Classifier(classname="weka.classifiers.functions.SMOreg")
    evaluation2 = Evaluation(bolts_data)
    evaluation2.crossvalidate_model(classifier2, bolts_data, 10, Random(42))
    plot_cls.plot_classifier_errors(
        {
            "LR": evaluation.predictions,
            "SMOreg": evaluation2.predictions
        },
        wait=False)

    # learning curve
    cls = [
        Classifier(classname="weka.classifiers.trees.J48"),
        Classifier(classname="weka.classifiers.bayes.NaiveBayesUpdateable")
    ]
    plot_cls.plot_learning_curve(cls,
                                 diabetes_data,
                                 increments=0.05,
                                 label_template="[#] !",
                                 metric="percent_correct",
                                 wait=True)

    # access classifier's Java API
    labor_file = helper.get_data_dir() + os.sep + "labor.arff"
    helper.print_info("Loading dataset: " + labor_file)
    loader = Loader("weka.core.converters.ArffLoader")
    labor_data = loader.load_file(labor_file)
    labor_data.class_is_last()

    helper.print_title("Using JRip's Java API to access rules")
    jrip = Classifier(classname="weka.classifiers.rules.JRip")
    jrip.build_classifier(labor_data)
    rset = jrip.jwrapper.getRuleset()
    for i in range(rset.size()):
        r = rset.get(i)
        print(str(r.toString(labor_data.class_attribute.jobject)))
def createTrainedModel():
    from weka.core.converters import Loader
    folderList = os.listdir(outputModel)
    i = 0
    classi = ""
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(os.path.join(outputModel, "genderTrain.arff"))
    data.class_is_last()
    from weka.classifiers import Classifier
    classi = "weka.classifiers.bayes.NaiveBayes"
    cls = Classifier(classname=classi)
    from weka.attribute_selection import ASSearch, ASEvaluation, AttributeSelection
    # Ranker expects flagged options: -T <threshold> and -N <number-to-select>
    search = ASSearch(classname="weka.attributeSelection.Ranker",
                      options=["-T", "-1.7976931348623157E308", "-N", "-1"])
    #evaluator = ASEvaluation(classname="weka.attributeSelection.ChiSquaredAttributeEval")
    #attsel = AttributeSelection()
    #attsel.search(search)
    #attsel.evaluator(evaluator)
    #attsel.select_attributes(data)
    cls.build_classifier(data)
    import weka.core.serialization as serialization
    from weka.core.dataset import Instances
    serialization.write_all(
        os.path.join(outputModel, "GenderModel" + ".model"),
        [cls, Instances.template_instances(data)])
    from weka.classifiers import Evaluation
    from weka.core.classes import Random
    evl = Evaluation(data)
    evl.crossvalidate_model(cls, data, 10, Random(1))
    print "Gender model predictions"
    print cls
    #print(evl.percent_correct)
    print(evl.summary())
    print(evl.class_details())

    data = loader.load_file(os.path.join(outputModel, "ageTrain.arff"))
    data.class_is_last()
    classi = "weka.classifiers.bayes.NaiveBayes"
    cls = Classifier(classname=classi)
    from weka.attribute_selection import ASSearch, ASEvaluation, AttributeSelection
    # Ranker expects flagged options: -T <threshold> and -N <number-to-select>
    search = ASSearch(classname="weka.attributeSelection.Ranker",
                      options=["-T", "-1.7976931348623157E308", "-N", "-1"])
    #evaluator = ASEvaluation(classname="weka.attributeSelection.ChiSquaredAttributeEval")
    #attsel = AttributeSelection()
    #attsel.search(search)
    #attsel.evaluator(evaluator)
    #attsel.select_attributes(data)
    #classi = "weka.classifiers.trees.J48"
    #classi = "weka.classifiers.functions.Logistic"
    #classi = "weka.classifiers.trees.RandomForest"
    #classi = "weka.classifiers.bayes.NaiveBayes"
    #classi = "weka.classifiers.functions.SMOreg"
    cls.build_classifier(data)
    print "Age model predictions"
    print cls
    import weka.core.serialization as serialization
    from weka.core.dataset import Instances
    serialization.write_all(os.path.join(outputModel, "AgeModel" + ".model"),
                            [cls, Instances.template_instances(data)])
    evl = Evaluation(data)
    evl.crossvalidate_model(cls, data, 10, Random(1))

    #print(evl.percent_correct)
    print(evl.summary())
    print(evl.class_details())
    os._exit(0)
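The models written with serialization.write_all above can be read back the same way; a minimal sketch for the gender model (path assumptions as in the function above):

import os
import weka.core.serialization as serialization
from weka.classifiers import Classifier
from weka.core.dataset import Instances

objects = serialization.read_all(os.path.join(outputModel, "GenderModel.model"))
model = Classifier(jobject=objects[0])    # the trained NaiveBayes classifier
template = Instances(jobject=objects[1])  # dataset header used at training time
print(model)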
Example #24
def call_weka(file_dir, ml_opt, ofile_dir):

    loader = Loader(classname="weka.core.converters.CSVLoader")
    data = loader.load_file(file_dir)
    data.class_is_last()
    filtered = data

    ml_id = ''
    if ml_opt != '0':
        if ml_opt == '1':
            classifier = Classifier(
                classname="weka.classifiers.functions.LibSVM",
                options=[
                    "-S", "0", "-K", "2", "-D", "3", "-G", "0.0", "-R", "0.0",
                    "-N", "0.5", "-M", "40.0", "-C", "1.0", "-E", "0.001",
                    "-P", "0.1", "-seed", "1"
                ])
            ml_id = 'SVM'
        elif ml_opt == '3':
            classifier = Classifier(
                classname="weka.classifiers.functions.MLPClassifier",
                options=[
                    '-N', '2', '-R', '0.01', '-O', '1.0E-6', '-P', '1', '-E',
                    '1', '-S', '1'
                ])
            ml_id = 'MLPC'
        elif ml_opt == '4':
            classifier = Classifier(
                classname="weka.classifiers.trees.RandomForest",
                options=["-I", "100", "-K", "0", "-S", "1", "-num-slots", "1"])
            ml_id = 'RF'
        elif ml_opt == '2':
            classifier = Classifier(classname="weka.classifiers.meta.Bagging",
                                    options=[
                                        "-P", "100", "-S", "1", "-I", "10",
                                        "-W", "weka.classifiers.trees.M5P",
                                        "--", "-M", "4.0"
                                    ])
            ml_id = 'BagM5P'
        elif ml_opt == '5':
            classifier = Classifier(classname="weka.classifiers.trees.J48",
                                    options=["-C", "0.25", "-M", "2"])
            ml_id = 'J48'
        elif ml_opt == '7':
            classifier = Classifier(
                classname="weka.classifiers.functions.RBFNetwork",
                options=[
                    "-B", "2", "-S", "1", "-R", "1.0E-8", "-M", "-1", "-W",
                    "0.1"
                ])
            ml_id = 'RBFNet'
        elif ml_opt == '8':
            classifier = Classifier(
                classname="weka.classifiers.bayes.BayesNet",
                options=[
                    "-D", "-Q", "weka.classifiers.bayes.net.search.local.K2",
                    "--", "-P", "1", "-S", "BAYES", "-E",
                    "weka.classifiers.bayes.net.estimate.SimpleEstimator",
                    "--", "-A", "0.5"
                ])
            ml_id = 'BayesNet'
        elif ml_opt == '6':
            classifier = Classifier(
                classname="weka.classifiers.bayes.NaiveBayes")
            ml_id = 'NaiveBayes'
        elif ml_opt == '9':
            classifier = Classifier(
                classname="weka.classifiers.functions.SimpleLogistic",
                options=["-I", "0", "-M", "500", "-H", "50", "-W", "0.0"])
            ml_id = 'LogReg'
        filtered.class_is_last()
        evaluation = Evaluation(filtered)
        evaluation.crossvalidate_model(classifier, filtered, 10, Random(42))
        print "Evaluation: Done."

        ofile = open(ofile_dir + ml_id + "_results.txt", 'wb')

        print >> ofile, evaluation.summary()
        print >> ofile, evaluation.class_details().encode('ascii', 'ignore')
        print >> ofile, evaluation.matrix().encode('ascii', 'ignore')
        serialization.write(ofile_dir + ml_id + ".model", classifier)
        print "Saving " + ml_id + " Model: Done."

        ofile.close()
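A hypothetical invocation (paths assumed); the JVM must be running before any Weka call, and some of the listed schemes (LibSVM, MLPClassifier, RBFNetwork) come from Weka packages:

import weka.core.jvm as jvm

jvm.start(packages=True)
try:
    call_weka("features.csv", "4", "./results/")  # option '4' selects RandomForest
finally:
    jvm.stop()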
def main():
    """
    Just runs some example code.
    """

    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()

    # classifier help
    helper.print_title("Creating help string")
    classifier = Classifier(classname="weka.classifiers.trees.J48")
    print(classifier.to_help())

    # partial classname
    helper.print_title("Creating classifier from partial classname")
    clsname = ".J48"
    classifier = Classifier(classname=clsname)
    print(clsname + " --> " + classifier.classname)

    # classifier from commandline
    helper.print_title("Creating SMO from command-line string")
    cmdline = 'weka.classifiers.functions.SMO -K "weka.classifiers.functions.supportVector.NormalizedPolyKernel -E 3.0"'
    classifier = from_commandline(cmdline, classname="weka.classifiers.Classifier")
    classifier.build_classifier(iris_data)
    print("input: " + cmdline)
    print("output: " + classifier.to_commandline())
    print("model:\n" + str(classifier))

    # kernel classifier
    helper.print_title("Creating SMO as KernelClassifier")
    kernel = Kernel(classname="weka.classifiers.functions.supportVector.RBFKernel", options=["-G", "0.001"])
    classifier = KernelClassifier(classname="weka.classifiers.functions.SMO", options=["-M"])
    classifier.kernel = kernel
    classifier.build_classifier(iris_data)
    print("classifier: " + classifier.to_commandline())
    print("model:\n" + str(classifier))

    # build a classifier and output model
    helper.print_title("Training J48 classifier on iris")
    classifier = Classifier(classname="weka.classifiers.trees.J48")
    # Instead of using 'options=["-C", "0.3"]' in the constructor, we can also set the "confidenceFactor"
    # property of the J48 classifier itself. However, being of type float rather than double, we need
    # to convert it to the correct type first using the double_to_float function:
    classifier.set_property("confidenceFactor", typeconv.double_to_float(0.3))
    classifier.build_classifier(iris_data)
    print(classifier)
    print(classifier.graph)
    print(classifier.to_source("MyJ48"))
    plot_graph.plot_dot_graph(classifier.graph)

    # evaluate model on test set
    helper.print_title("Evaluating J48 classifier on iris")
    evaluation = Evaluation(iris_data)
    evl = evaluation.test_model(classifier, iris_data)
    print(evl)
    print(evaluation.summary())

    # evaluate model on train/test split
    helper.print_title("Evaluating J48 classifier on iris (random split 66%)")
    classifier = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.3"])
    evaluation = Evaluation(iris_data)
    evaluation.evaluate_train_test_split(classifier, iris_data, 66.0, Random(1))
    print(evaluation.summary())

    # load a dataset incrementally and build classifier incrementally
    helper.print_title("Build classifier incrementally on iris")
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_inc = loader.load_file(iris_file, incremental=True)
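    # incremental=True loads just the dataset structure; the instances are then
    # streamed one at a time by iterating over the loader below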
    iris_inc.class_is_last()
    classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayesUpdateable")
    classifier.build_classifier(iris_inc)
    for inst in loader:
        classifier.update_classifier(inst)
    print(classifier)

    # construct meta-classifiers
    helper.print_title("Meta classifiers")
    # generic FilteredClassifier instantiation
    print("generic FilteredClassifier instantiation")
    meta = SingleClassifierEnhancer(classname="weka.classifiers.meta.FilteredClassifier")
    meta.classifier = Classifier(classname="weka.classifiers.functions.LinearRegression")
    flter = Filter("weka.filters.unsupervised.attribute.Remove")
    flter.options = ["-R", "first"]
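    # set_property needs the wrapped Java object (flter.jobject), not the Python-level Filter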
    meta.set_property("filter", flter.jobject)
    print(meta.to_commandline())
    # direct FilteredClassifier instantiation
    print("direct FilteredClassifier instantiation")
    meta = FilteredClassifier()
    meta.classifier = Classifier(classname="weka.classifiers.functions.LinearRegression")
    flter = Filter("weka.filters.unsupervised.attribute.Remove")
    flter.options = ["-R", "first"]
    meta.filter = flter
    print(meta.to_commandline())
    # generic Vote
    print("generic Vote instantiation")
    meta = MultipleClassifiersCombiner(classname="weka.classifiers.meta.Vote")
    classifiers = [
        Classifier(classname="weka.classifiers.functions.SMO"),
        Classifier(classname="weka.classifiers.trees.J48")
    ]
    meta.classifiers = classifiers
    print(meta.to_commandline())

    # cross-validate nominal classifier
    helper.print_title("Cross-validating NaiveBayes on diabetes")
    diabetes_file = helper.get_data_dir() + os.sep + "diabetes.arff"
    helper.print_info("Loading dataset: " + diabetes_file)
    loader = Loader("weka.core.converters.ArffLoader")
    diabetes_data = loader.load_file(diabetes_file)
    diabetes_data.class_is_last()
    classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
    pred_output = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText", options=["-distribution"])
    evaluation = Evaluation(diabetes_data)
    evaluation.crossvalidate_model(classifier, diabetes_data, 10, Random(42), output=pred_output)
    print(evaluation.summary())
    print(evaluation.class_details())
    print(evaluation.matrix())
    print("areaUnderPRC/0: " + str(evaluation.area_under_prc(0)))
    print("weightedAreaUnderPRC: " + str(evaluation.weighted_area_under_prc))
    print("areaUnderROC/1: " + str(evaluation.area_under_roc(1)))
    print("weightedAreaUnderROC: " + str(evaluation.weighted_area_under_roc))
    print("avgCost: " + str(evaluation.avg_cost))
    print("totalCost: " + str(evaluation.total_cost))
    print("confusionMatrix: " + str(evaluation.confusion_matrix))
    print("correct: " + str(evaluation.correct))
    print("pctCorrect: " + str(evaluation.percent_correct))
    print("incorrect: " + str(evaluation.incorrect))
    print("pctIncorrect: " + str(evaluation.percent_incorrect))
    print("unclassified: " + str(evaluation.unclassified))
    print("pctUnclassified: " + str(evaluation.percent_unclassified))
    print("coverageOfTestCasesByPredictedRegions: " + str(evaluation.coverage_of_test_cases_by_predicted_regions))
    print("sizeOfPredictedRegions: " + str(evaluation.size_of_predicted_regions))
    print("falseNegativeRate: " + str(evaluation.false_negative_rate(1)))
    print("weightedFalseNegativeRate: " + str(evaluation.weighted_false_negative_rate))
    print("numFalseNegatives: " + str(evaluation.num_false_negatives(1)))
    print("trueNegativeRate: " + str(evaluation.true_negative_rate(1)))
    print("weightedTrueNegativeRate: " + str(evaluation.weighted_true_negative_rate))
    print("numTrueNegatives: " + str(evaluation.num_true_negatives(1)))
    print("falsePositiveRate: " + str(evaluation.false_positive_rate(1)))
    print("weightedFalsePositiveRate: " + str(evaluation.weighted_false_positive_rate))
    print("numFalsePositives: " + str(evaluation.num_false_positives(1)))
    print("truePositiveRate: " + str(evaluation.true_positive_rate(1)))
    print("weightedTruePositiveRate: " + str(evaluation.weighted_true_positive_rate))
    print("numTruePositives: " + str(evaluation.num_true_positives(1)))
    print("fMeasure: " + str(evaluation.f_measure(1)))
    print("weightedFMeasure: " + str(evaluation.weighted_f_measure))
    print("unweightedMacroFmeasure: " + str(evaluation.unweighted_macro_f_measure))
    print("unweightedMicroFmeasure: " + str(evaluation.unweighted_micro_f_measure))
    print("precision: " + str(evaluation.precision(1)))
    print("weightedPrecision: " + str(evaluation.weighted_precision))
    print("recall: " + str(evaluation.recall(1)))
    print("weightedRecall: " + str(evaluation.weighted_recall))
    print("kappa: " + str(evaluation.kappa))
    print("KBInformation: " + str(evaluation.kb_information))
    print("KBMeanInformation: " + str(evaluation.kb_mean_information))
    print("KBRelativeInformation: " + str(evaluation.kb_relative_information))
    print("SFEntropyGain: " + str(evaluation.sf_entropy_gain))
    print("SFMeanEntropyGain: " + str(evaluation.sf_mean_entropy_gain))
    print("SFMeanPriorEntropy: " + str(evaluation.sf_mean_prior_entropy))
    print("SFMeanSchemeEntropy: " + str(evaluation.sf_mean_scheme_entropy))
    print("matthewsCorrelationCoefficient: " + str(evaluation.matthews_correlation_coefficient(1)))
    print("weightedMatthewsCorrelation: " + str(evaluation.weighted_matthews_correlation))
    print("class priors: " + str(evaluation.class_priors))
    print("numInstances: " + str(evaluation.num_instances))
    print("meanAbsoluteError: " + str(evaluation.mean_absolute_error))
    print("meanPriorAbsoluteError: " + str(evaluation.mean_prior_absolute_error))
    print("relativeAbsoluteError: " + str(evaluation.relative_absolute_error))
    print("rootMeanSquaredError: " + str(evaluation.root_mean_squared_error))
    print("rootMeanPriorSquaredError: " + str(evaluation.root_mean_prior_squared_error))
    print("rootRelativeSquaredError: " + str(evaluation.root_relative_squared_error))
    print("prediction output:\n" + str(pred_output))
    plot_cls.plot_roc(
        evaluation, title="ROC diabetes",
        class_index=range(0, diabetes_data.class_attribute.num_values), wait=False)
    plot_cls.plot_prc(
        evaluation, title="PRC diabetes",
        class_index=range(0, diabetes_data.class_attribute.num_values), wait=False)
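    # class_index lists the label indices to plot (one curve per class);
    # wait=False returns immediately instead of blocking on the plot window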

    # load a numeric dataset
    bolts_file = helper.get_data_dir() + os.sep + "bolts.arff"
    helper.print_info("Loading dataset: " + bolts_file)
    loader = Loader("weka.core.converters.ArffLoader")
    bolts_data = loader.load_file(bolts_file)
    bolts_data.class_is_last()

    # build a classifier and output model
    helper.print_title("Training LinearRegression on bolts")
    classifier = Classifier(classname="weka.classifiers.functions.LinearRegression", options=["-S", "1", "-C"])
    classifier.build_classifier(bolts_data)
    print(classifier)

    # cross-validate numeric classifier
    helper.print_title("Cross-validating LinearRegression on bolts")
    classifier = Classifier(classname="weka.classifiers.functions.LinearRegression", options=["-S", "1", "-C"])
    evaluation = Evaluation(bolts_data)
    evaluation.crossvalidate_model(classifier, bolts_data, 10, Random(42))
    print(evaluation.summary())
    print("correlationCoefficient: " + str(evaluation.correlation_coefficient))
    print("errorRate: " + str(evaluation.error_rate))
    helper.print_title("Header - bolts")
    print(str(evaluation.header))
    helper.print_title("Predictions on bolts")
    for index, pred in enumerate(evaluation.predictions):
        print(str(index+1) + ": " + str(pred) + " -> error=" + str(pred.error))
    plot_cls.plot_classifier_errors(evaluation.predictions, wait=False)

    # learning curve
    cls = [
        Classifier(classname="weka.classifiers.trees.J48"),
        Classifier(classname="weka.classifiers.bayes.NaiveBayesUpdateable")]
    plot_cls.plot_learning_curve(
        cls, diabetes_data, increments=0.05, label_template="[#] !", metric="percent_correct", wait=True)

    # access classifier's Java API
    labor_file = helper.get_data_dir() + os.sep + "labor.arff"
    helper.print_info("Loading dataset: " + labor_file)
    loader = Loader("weka.core.converters.ArffLoader")
    labor_data = loader.load_file(labor_file)
    labor_data.class_is_last()

    helper.print_title("Using JRip's Java API to access rules")
    jrip = Classifier(classname="weka.classifiers.rules.JRip")
    jrip.build_classifier(labor_data)
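    # jwrapper exposes the underlying Java object with its Java methods, so
    # JRip's getRuleset() can be called directly from Python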
    rset = jrip.jwrapper.getRuleset()
    for i in range(rset.size()):
        r = rset.get(i)
        print(str(r.toString(labor_data.class_attribute.jobject)))
# assumed imports for this snippet: standard python-weka-wrapper modules
import weka.core.serialization as serialization
from weka.core.classes import Random
from weka.core.converters import Loader
from weka.classifiers import Classifier, Evaluation

loader = Loader("weka.core.converters.ArffLoader")

# load the three training sets; the 20x20 and 20x50 file names are assumed
# to follow the same naming pattern as the original 50x20 file
dataSet20x20 = loader.load_file("trainingSet/dataSet20x20.arff")
dataSet20x20.class_is_last()
dataSet20x50 = loader.load_file("trainingSet/dataSet20x50.arff")
dataSet20x50.class_is_last()
dataSet50x20 = loader.load_file("trainingSet/dataSet50x20.arff")
dataSet50x20.class_is_last()

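# MultilayerPerceptron flags: -L learning rate, -M momentum, -N training epochs,
# -V validation-set size (%), -S seed, -E consecutive-error threshold, -H hidden-layer neurons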
classifier1 = Classifier(classname="weka.classifiers.functions.MultilayerPerceptron", options=["-L", "0.3", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E", "20", "-H", "9"])
classifier2 = Classifier(classname="weka.classifiers.functions.MultilayerPerceptron", options=["-L", "0.3", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E", "20", "-H", "11"])
classifier3 = Classifier(classname="weka.classifiers.functions.MultilayerPerceptron", options=["-L", "0.3", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E", "20", "-H", "9"])

print "\n\nTraining neural network 1"
evaluation1 = Evaluation(dataSet20x20)                   
evaluation1.crossvalidate_model(classifier1, dataSet20x20, 10, Random(42))
classifier1.build_classifier(dataSet20x20)
serialization.write("trainingSet/nn1.model", classifier1)
print "\n\n====================================================== NUERAL NETWORK 1 ======================================================"
print(evaluation1.summary())
print(evaluation1.class_details())

print "Training neural network 2"
evaluation2 = Evaluation(dataSet20x50) 
evaluation2.crossvalidate_model(classifier2, dataSet20x50, 10, Random(42))
classifier2.build_classifier(dataSet20x50)
serialization.write("trainingSet/nn2.model", classifier2)
print "\n\n====================================================== NUERAL NETWORK 2 ======================================================"
print(evaluation2.summary())
print(evaluation2.class_details())

print "Training neural network 3"
evaluation3 = Evaluation(dataSet50x20) 
evaluation3.crossvalidate_model(classifier3, dataSet50x20, 10, Random(42))
classifier3.build_classifier(dataSet50x20)
serialization.write("trainingSet/nn3.model", classifier3)
Exemple #27
# assumed imports for this snippet: standard python-weka-wrapper modules
from weka.core.converters import Loader
from weka.core.classes import Random
from weka.classifiers import Classifier, KernelClassifier, Kernel, Evaluation

# convert csv into arff format (weka compatible)
# use convertcsvtoarff.py file

# load arff file
loader = Loader("weka.core.converters.ArffLoader")
iris_data = loader.load_file("reviewsinformation_task2.arff")
iris_data.class_is_last()

# kernel classifier (SMO with an RBF kernel)
print("Creating SMO as KernelClassifier")
kernel = Kernel(classname="weka.classifiers.functions.supportVector.RBFKernel", options=["-G", "0.001"])
classifier = KernelClassifier(classname="weka.classifiers.functions.SMO", options=["-M"])
classifier.kernel = kernel
classifier.build_classifier(iris_data)
print("classifier: " + classifier.to_commandline())
print("model:\n" + str(classifier))

#print("model:\n" + str(classifier))


# Evaluation must be initialized with a dataset, not a file name
evaluation = Evaluation(iris_data)
evaluation.crossvalidate_model(classifier, iris_data, 10, Random(42))
print(evaluation.summary())
print(evaluation.class_details())
print(evaluation.matrix())