Example #1
def runSMO(file, bound):
    loader = Loader(classname="weka.core.converters.CSVLoader")
    data = loader.load_file(file)
    data.class_is_first()

    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                    options=["-R", bound])

    cls = KernelClassifier(
        classname="weka.classifiers.functions.SMO",
        options=["-C", "1.0", "-L", "0.001", "-P", "1.0E-12", "-N", "0"])
    kernel = Kernel(
        classname="weka.classifiers.functions.supportVector.PolyKernel",
        options=["-C", "250007", "-E", "1.0"])
    cls.kernel = kernel
    pout = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText")

    remove.inputformat(data)
    filtered = remove.filter(data)

    evl = Evaluation(filtered)
    evl.crossvalidate_model(cls, filtered, 10, Random(1), pout)

    #print(pout.buffer_content())

    print(evl.percent_correct)
    #print(evl.summary())

    result = evl.class_details()
    print(result)
    return result
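
# A minimal driver sketch for runSMO(): the imports below are an assumption
# about what the snippet relies on, and the CSV path and attribute range
# passed in are hypothetical placeholders.
import weka.core.jvm as jvm
from weka.core.classes import Random
from weka.core.converters import Loader
from weka.classifiers import KernelClassifier, Kernel, Evaluation, PredictionOutput
from weka.filters import Filter

jvm.start()
try:
    runSMO("data.csv", "2")  # hypothetical CSV file and attribute range to remove
finally:
    jvm.stop()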
Example #2
def create_model(input_file, output_file):
    # Load data
    data = converters.load_any_file(input_file)
    data.class_is_last()  # set class attribute

    # filter data
    print_title("Filtering Data")
    discretize = Filter(
        classname="weka.filters.unsupervised.attribute.Discretize",
        options=["-B", "10", "-M", "-1.0", "-R", "first-last"])
    discretize.inputformat(
        data)  # let the filter know about the type of data to filter
    filtered_data = discretize.filter(data)
    print("Done! (believe it or not)")

    print_title("Build Classifier")
    classifier = Classifier(classname="weka.classifiers.trees.RandomForest",
                            options=["-I", "100", "-K", "0", "-S", "1"])
    classifier.build_classifier(filtered_data)
    print("Done! (believe it or not)")
    serialization.write_all(output_file, [classifier, discretize])
    print("Model and filter saved to ", output_file)

    evaluation = Evaluation(data)  # initialize with priors
    evaluation.crossvalidate_model(classifier, filtered_data, 10,
                                   Random(42))  # 10-fold CV
    print(evaluation.summary())
    print("pctCorrect: " + str(evaluation.percent_correct))
    print("incorrect: " + str(evaluation.incorrect))
def affective_vectorizer(tweets, filename):
    '''
    Vectorizes the tweets and saves the vectors as csv.
    :param tweets: list of tweets
    :param filename: name of the saved file
    '''
    jvm.start(packages=True)
    install_package('AffectiveTweets')

    data = dataset.create_instances_from_lists([[t] for t in tweets])

    tweet_filter = Filter(
        classname=
        'weka.filters.unsupervised.attribute.TweetToLexiconFeatureVector',
        options=[
            '-F', '-D', '-R', '-A', '-T', '-L', '-N', '-P', '-J', '-H', '-Q',
            '-stemmer', 'weka.core.stemmers.NullStemmer', '-stopwords-handler',
            'weka.core.tokenizers.TweetNLPTokenizer', '-I', '1', '-U',
            '-tokenizer', 'weka.core.tokenizers.TweetNLPTokenizer'
        ])
    tweet_filter.inputformat(data)
    filtered_data = tweet_filter.filter(data)

    converters.save_any_file(filtered_data, 'data/affect-vectors/' + filename)

    jvm.stop()
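
# Usage sketch: affective_vectorizer() starts and stops the JVM itself, so a
# caller only supplies the tweets. The list and filename here are made up,
# and the data/affect-vectors/ directory is assumed to exist.
tweets = ["I love this!", "This is terrible..."]
affective_vectorizer(tweets, "sample.csv")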
Example #4
def Feature_Selection(infile):
    directory = os.getcwd() + '/'
    csvpath = directory + infile

    jvm.start(packages=True, max_heap_size="4g")
    print "\n\n"
    print "Loaded file: ", infile
    csvloader = Loader(classname="weka.core.converters.CSVLoader")
    csvdata = csvloader.load_file(csvpath)

    remover = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                     options=["-R", "1"])
    remover.inputformat(csvdata)
    filtered_data = remover.filter(csvdata)
    filtered_data.class_is_last()

    search = ASSearch(classname="weka.attributeSelection.BestFirst",
                      options=["-D", "1", "-N", "5"])
    evaluator = ASEvaluation(classname="weka.attributeSelection.CfsSubsetEval",
                             options=["-P", "1", "-E", "1"])
    attribs = AttributeSelection()
    attribs.search(search)
    attribs.evaluator(evaluator)
    attribs.select_attributes(filtered_data)
    print "Summary of Attribute Selection: "
    print attribs.results_string
    jvm.stop()
    return
Example #5
def run(dataset_path):
    start = time.time()

    ### load a dataset ###
    train_data = model.load_dataset_weka(dataset_path)
    to_nominal_class_filter = Filter(
        classname="weka.filters.unsupervised.attribute.NumericToNominal",
        options=["-R", "last"])
    to_nominal_class_filter.inputformat(train_data)
    filtered_data = to_nominal_class_filter.filter(train_data)

    ###  Naive Bayes ### Choose what you want
    classifier = Classifier("weka.classifiers.bayes.NaiveBayesMultinomial")
    # classifier = Classifier("weka.classifiers.bayes.NaiveBayes")
    # classifier.build_classifier(train_data)  # not needed: cross-validation trains its own copies
    evaluation = Evaluation(filtered_data)
    evaluation.crossvalidate_model(classifier, filtered_data, 10, Random(42))
    # print(evaluation.summary())
    # print(evaluation.class_details())
    # print(evaluation.matrix())

    # ###  Naive Bayes ###
    # mlp = Classifier("weka.classifiers.bayes.NaiveBayes")
    # mlp.build_classifier(train_file_5EMO)

    print(time.time() - start)
Example #6
def supFilters(data, fType, ops):

	filt = Filter(classname="weka.filters.supervised." + fType, options = ops)
	filt.inputformat(data)     # let the filter know about the type of data to filter
	filtered = filt.filter(data)   # filter the data
	
	return filtered
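
# Usage sketch for supFilters(): fType is the part of the classname after
# "weka.filters.supervised." -- e.g. SMOTE (a separate Weka package in 3.8+)
# or supervised discretization. Both calls assume `data` was loaded and its
# class attribute set.
balanced = supFilters(data, "instance.SMOTE", ["-P", "100"])
discrete = supFilters(data, "attribute.Discretize", ["-R", "first-last"])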
Example #7
def discretize_data(input_data):
    discretize = Filter(
        classname="weka.filters.unsupervised.attribute.Discretize",
        options=["-B", "10"])
    discretize.inputformat(input_data)
    filtered_data = discretize.filter(input_data)
    return filtered_data
def main():
    """
    Just runs some example code.
    """

    # load a dataset
    iris = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris)
    loader = Loader("weka.core.converters.ArffLoader")
    data = loader.load_file(iris)

    # remove class attribute
    helper.print_info("Removing class attribute")
    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                    options=["-R", "last"])
    remove.inputformat(data)
    filtered = remove.filter(data)

    # use MultiFilter
    helper.print_info("Use MultiFilter")
    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                    options=["-R", "first"])
    std = Filter(classname="weka.filters.unsupervised.attribute.Standardize")
    multi = MultiFilter()
    multi.filters = [remove, std]
    multi.inputformat(data)
    filtered_multi = multi.filter(data)

    # output datasets
    helper.print_title("Input")
    print(data)
    helper.print_title("Output")
    print(filtered)
    helper.print_title("Output (MultiFilter)")
    print(filtered_multi)

    # load text dataset
    text = helper.get_data_dir(
    ) + os.sep + "reutersTop10Randomized_1perc_shortened.arff"
    helper.print_info("Loading dataset: " + text)
    loader = Loader("weka.core.converters.ArffLoader")
    data = loader.load_file(text)
    data.class_is_last()

    # apply StringToWordVector
    stemmer = Stemmer(classname="weka.core.stemmers.IteratedLovinsStemmer")
    stopwords = Stopwords(classname="weka.core.stopwords.Rainbow")
    tokenizer = Tokenizer(classname="weka.core.tokenizers.WordTokenizer")
    s2wv = StringToWordVector(options=["-W", "10", "-L", "-C"])
    s2wv.stemmer = stemmer
    s2wv.stopwords = stopwords
    s2wv.tokenizer = tokenizer
    s2wv.inputformat(data)
    filtered = s2wv.filter(data)

    helper.print_title("Input (StringToWordVector)")
    print(data)
    helper.print_title("Output (StringToWordVector)")
    print(filtered)
Example #9
def smote(data, percentage):
    sampler = Filter(
        classname='weka.filters.supervised.instance.SMOTE',
        options=["-C", "0", "-K", "5", "-P",
                 str(percentage), "-S", "1"])
    sampler.inputformat(data)
    newData = sampler.filter(data)
    return newData
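
# Usage sketch, assuming an ARFF file whose last attribute is a nominal class;
# SMOTE ships as a Weka package in 3.8+, so the JVM needs package support.
import weka.core.jvm as jvm
from weka.core.converters import Loader

jvm.start(packages=True)
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file("imbalanced.arff")  # hypothetical file
data.class_is_last()
oversampled = smote(data, 100)  # create 100% additional minority-class instances
jvm.stop()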
Example #10
def discretize(data, index, file):
    discretizer = Filter(
        classname='weka.filters.supervised.attribute.Discretize',
        options=["-R", str(index), "-precision", "6"])
    discretizer.inputformat(data)
    newData = discretizer.filter(data)
    discretizer.serialize(file)
    return newData
Example #11
def remove(data, indices, file):
    cmdIndex = ','.join(indices)
    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                    options=["-R", cmdIndex])
    remove.inputformat(data)
    newData = remove.filter(data)
    remove.serialize(file)
    return newData
Example #12
def main():
    """
    Just runs some example code.
    """

    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    data = loader.load_file(iris_file)

    # remove class attribute
    data.delete_last_attribute()

    # build a clusterer and output model
    helper.print_title("Training SimpleKMeans clusterer")
    clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "3"])
    clusterer.build_clusterer(data)
    print(clusterer)
    helper.print_info("Evaluating on data")
    evaluation = ClusterEvaluation()
    evaluation.set_model(clusterer)
    evaluation.test_model(data)
    print("# clusters: " + str(evaluation.num_clusters))
    print("log likelihood: " + str(evaluation.log_likelihood))
    print("cluster assignments:\n" + str(evaluation.cluster_assignments))
    plc.plot_cluster_assignments(evaluation, data, inst_no=True)

    # using a filtered clusterer
    helper.print_title("Filtered clusterer")
    loader = Loader("weka.core.converters.ArffLoader")
    data = loader.load_file(iris_file)
    clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "3"])
    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "last"])
    fclusterer = FilteredClusterer()
    fclusterer.clusterer = clusterer
    fclusterer.filter = remove
    fclusterer.build_clusterer(data)
    print(fclusterer)

    # load a dataset incrementally and build clusterer incrementally
    helper.print_title("Incremental clusterer")
    loader = Loader("weka.core.converters.ArffLoader")
    iris_inc = loader.load_file(iris_file, incremental=True)
    clusterer = Clusterer("weka.clusterers.Cobweb")
    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "last"])
    remove.inputformat(iris_inc)
    iris_filtered = remove.outputformat()
    clusterer.build_clusterer(iris_filtered)
    for inst in loader:
        remove.input(inst)
        inst_filtered = remove.output()
        clusterer.update_clusterer(inst_filtered)
    clusterer.update_finished()
    print(clusterer.to_commandline())
    print(clusterer)
    print(clusterer.graph)
    plg.plot_dot_graph(clusterer.graph)
Example #14
def stringToNominal(data, indices, file):
    cmdIndex = ','.join(indices)
    stn = Filter(
        classname="weka.filters.unsupervised.attribute.StringToNominal",
        options=["-R", cmdIndex])
    stn.inputformat(data)
    newData = stn.filter(data)
    stn.serialize(file)
    return newData
Example #15
def undersample(data, percentage):
    if (percentage >= 100):
        return None
    sampler = Filter(classname='weka.filters.supervised.instance.Resample',
                     options=["-B", "1.0", "-S", "1", "-Z",
                              str(percentage)])
    sampler.inputformat(data)
    newData = sampler.filter(data)
    return newData
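
# Usage sketch: -Z sets the output size as a percentage of the input and
# -B 1.0 biases the sample towards a uniform class distribution, so values
# of 100 or more are rejected as not being an undersample. `data` is assumed
# to be a loaded dataset with its class attribute set.
half = undersample(data, 50)      # roughly half the instances, rebalanced
nothing = undersample(data, 120)  # returns None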
Example #16
    def exposed_evaluate(self, X, d, task, i_model, i_evl):
        data = np.reshape(eval(X), [d, -1], order='C')
        if task == 'regression':
            if i_model == 'LR':
                data = converters.ndarray_to_instances(data, relation='tmp')
                data.class_is_last()
                model = Classifier(
                    classname='weka.classifiers.functions.LinearRegression')
                evl = Evaluation(data)
                evl.crossvalidate_model(model, data, 5, Random(0))
            elif i_model == 'RF':
                data = converters.ndarray_to_instances(data, relation='tmp')
                data.class_is_last()
                model = Classifier(
                    classname='weka.classifiers.trees.RandomForest')
                evl = Evaluation(data)
                evl.crossvalidate_model(model, data, 5, Random(0))
            if i_evl == 'mae':
                r_mae = evl.mean_absolute_error
                return r_mae
            elif i_evl == 'mse':
                r_mse = evl.mean_square_error
                return r_mse
            elif i_evl == '1-rae':
                r_one_minus_rae = 1 - evl.relative_absolute_error / 100
                del evl, model, data
                return r_one_minus_rae

        elif task == 'classification':
            le = LabelEncoder()
            data[:, -1] = le.fit_transform(data[:, -1])
            if i_model == 'RF':
                dataRaw = converters.ndarray_to_instances(data, relation='tmp')
                weka_filter = Filter(
                    classname=
                    "weka.filters.unsupervised.attribute.NumericToNominal",
                    options=["-R", "last"])
                weka_filter.inputformat(dataRaw)
                data = weka_filter.filter(dataRaw)
                data.class_is_last()
                model = Classifier(
                    classname='weka.classifiers.trees.RandomForest')
                evl = Evaluation(data)
                evl.crossvalidate_model(model, data, 5, Random(0))
            elif i_model == 'LR':
                model = LogisticRegression(multi_class='ovr')
            elif i_model == 'SVM':
                model = svm.SVC()
            if i_evl == 'f_score':
                fscore = evl.weighted_f_measure
                del evl, model, data, dataRaw
                if not (fscore >= 0.01 and fscore < 1.01):
                    fscore = 0.01
                return fscore
Example #17
def merge_classes(data, idx_to_merge):
    """
    :param data: The data file to filter
    :param idx_to_merge: String representation of class indices to merge 
    :return: filtered data
    """
    merge_filter = Filter(
        classname="weka.filters.unsupervised.attribute.MergeManyValues",
        options=["-C", "last", "-R", idx_to_merge, "-unset-class-temporarily"])
    merge_filter.inputformat(data)
    filtered_data = merge_filter.filter(data)
    return filtered_data
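
# Usage sketch: -R takes the label indices to merge within the class attribute,
# and -unset-class-temporarily lets the filter modify the class attribute.
# The "1,2" range is a hypothetical example merging the first two labels.
merged = merge_classes(data, "1,2")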
Example #18
	def remove_correct_classified(self, invert = False):
		options = [
			'-W', self.classifier.to_commandline(),
			'-C', str(self.class_index),  # class index
	#		'-F', '0',    # folds
	#		'-T', '0.1',  # threshold for numeric classes
			'-I', '0',    # max iterations
		]
		if not invert:
			options.append('-V')  # -V inverts the filter: remove the correctly classified
		classname = "weka.filters.unsupervised.instance.RemoveMisclassified"
		remove = Filter(classname=classname, options=options)
		remove.inputformat(self.data)
		self.data = remove.filter(self.data)
Example #19
    def filter_data(self, data):
        print("Filtering Data..\n")
        flter = Filter(
            classname="weka.filters.supervised.attribute.AttributeSelection")
        aseval = ASEvaluation(
            classname="weka.attributeSelection.CfsSubsetEval",
            options=["-P", "1", "-E", "1"])
        assearch = ASSearch(classname="weka.attributeSelection.BestFirst",
                            options=["-D", "1", "-N", "5"])
        flter.set_property("evaluator", aseval.jobject)
        flter.set_property("search", assearch.jobject)
        flter.inputformat(data)
        filtered = flter.filter(data)
        return filtered
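
# Usage sketch (hedged): assuming `obj` is an instance of the class defining
# filter_data() and `data` a loaded dataset with its class attribute set, the
# meta-filter returns the data reduced to the CFS-selected attribute subset.
reduced = obj.filter_data(data)
print(reduced.num_attributes, "attributes kept by CfsSubsetEval + BestFirst")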
def unsupervised_discretize(data):
    """
    Function for discretization of data. Function uses weka implementation
    weka.filters.unsupervised.attribute.Discretize.

    :param data: weka arff data
    :return: weka arff data
    """
    args, _sufix = unsupervised_discretize_parser()

    filt = Filter(classname='weka.filters.unsupervised.attribute.Discretize',
                  options=args_to_weka_options(args, _sufix))
    filt.inputformat(data)
    return filt.filter(data)
    def _get_training_dataset(self, X, y):
        # convert to numpy array
        if isinstance(X, pd.DataFrame):
            X = X.values
        elif isinstance(X, list):
            X = np.array(X)
        elif not isinstance(X, np.ndarray):
            raise Exception("Incompatible data type: {}".format(type(X)))
        if isinstance(y, pd.Series):
            y = y.values
        elif isinstance(y, list):
            y = np.array(y)
        elif not isinstance(y, np.ndarray):
            raise Exception("Incompatible data type: {}".format(type(y)))

        if y.dtype == "O":
            for i in range(0, len(y)):
                try:
                    y[i] = y[i].encode()
                except AttributeError:  # value has no encode(), e.g. already bytes
                    pass
        dataset = create_instances_from_matrices(
            X, y, name="generated from matrices")  # generate dataset

        # convert label to nominal
        try:
            y.astype(float)
            self._label_type = np.float64
            nominal = Filter(
                classname=
                "weka.filters.unsupervised.attribute.NumericToNominal",
                options=["-R", "last"])
        except ValueError:
            self._label_type = str
            nominal = Filter(
                classname="weka.filters.unsupervised.attribute.StringToNominal",
                options=["-R", "last"])
        nominal.inputformat(dataset)
        dataset = nominal.filter(dataset)

        # sort labels
        sorter = Filter(
            classname="weka.filters.unsupervised.attribute.SortLabels")
        sorter.inputformat(dataset)
        dataset = sorter.filter(dataset)

        dataset.class_is_last()  # indicate class label

        return dataset
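
# Usage sketch, assuming `clf` is an instance of the wrapper class that
# defines _get_training_dataset(); the arrays are made-up toy data.
import numpy as np

X = np.random.rand(10, 3)
y = np.array(["a", "b"] * 5)
dataset = clf._get_training_dataset(X, y)
print(dataset.num_instances, "instances,", dataset.num_attributes, "attributes")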
def use_filter(data):
    """
    Uses the AttributeSelection filter for attribute selection.
    :param data: the dataset to use
    :type data: Instances
    """
    print("\n2. Filter")
    flter = Filter(classname="weka.filters.supervised.attribute.AttributeSelection")
    aseval = ASEvaluation(classname="weka.attributeSelection.CfsSubsetEval")
    assearch = ASSearch(classname="weka.attributeSelection.GreedyStepwise", options=["-B"])
    flter.set_property("evaluator", aseval.jobject)
    flter.set_property("search", assearch.jobject)
    flter.inputformat(data)
    filtered = flter.filter(data)
    print(str(filtered))
Example #24
def obtainSVM(file):
    data = converters.load_any_file(folderPathOfArffFiles + file + ".arff")
    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                    options=["-R", "1-2"])
    remove.inputformat(data)
    data = remove.filter(data)
    data.class_is_last()

    classifier = Classifier(classname="weka.classifiers.functions.LibSVM")
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(classifier, data, kFold, Random(42))

    info = evaluation.class_details()
    roc_area = float(info[406:411])

    return roc_area
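
# The float(info[406:411]) slice above is brittle: it pulls the ROC column
# out of Weka's formatted class-details string at a fixed offset. Evaluation
# also exposes the value numerically, so a safer variant for a given class
# index would be:
roc_area = evaluation.area_under_roc(0)  # AUC for the first class label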
Example #25
    def __init__(self, index = 0, inference = "ExactInference", ghostAgents = None):
        BustersAgent.__init__(self, index, inference, ghostAgents)
        self.previousDistances = [0, 0, 0, 0]
        jvm.start(max_heap_size="512m")
        self.loader = Loader(classname="weka.core.converters.ArffLoader")
        self.data = self.loader.load_file("data/game_toCluster.arff")
        self.data.delete_last_attribute()
        self.clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "10", "-S", "4", "-I", "500"])
        self.clusterer.build_clusterer(self.data)
        self.inst = ""
        self.data = self.loader.load_file("data/game_toCluster.arff")
        addCluster = Filter(classname="weka.filters.unsupervised.attribute.AddCluster", options=["-W", "weka.clusterers.SimpleKMeans -N 10 -S 4 -I 500", "-I", "last"])
        addCluster.inputformat(self.data)
        filtered = addCluster.filter(self.data)
        self.f = open('data/addCluster.arff', 'w+')
        self.f.write(str(filtered))
        self.clustered_data = self.classifyData('data/addCluster.arff')
Example #26
    def load(path, db):
        nominals = [
            49,  # dev_global_mem_cache_type
            52,  # dev_host_unified_memory
            54,  # dev_local_mem_type
            56,  # dev_type
            57,  # dev_vendor
        ]
        nominal_indices = ",".join([str(index) for index in nominals])
        force_nominal = ["-N", nominal_indices]

        # Load data from CSV.
        dataset = Dataset.load_csv(path, options=force_nominal)
        dataset.__class__ = Dataset

        # Set class index and database connection.
        dataset.class_index = -1
        dataset.db = db

        # Create string->nominal type attribute filter, ignoring the first
        # attribute (scenario ID), since we're not classifying with it.
        string_to_nominal = WekaFilter(
            classname=("weka.filters.unsupervised."
                       "attribute.StringToNominal"),
            options=["-R", "2-last"],
        )
        string_to_nominal.inputformat(dataset.instances)

        # Create filtered dataset, and swap data around.
        filtered = string_to_nominal.filter(dataset.instances)

        # Create nominal->binary type attribute filter, ignoring the
        # first attribute (scenario ID), since we're not classifying with it.
        n2b = WekaFilter(
            classname="weka.filters.unsupervised.attribute.NominalToBinary",
            options=["-R", "2-last"],
        )
        n2b.inputformat(filtered)

        dataset.instances = n2b.filter(filtered)

        return dataset
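
# The string->nominal then nominal->binary chain above could also be wrapped
# in a single MultiFilter; a sketch under that assumption:
from weka.filters import Filter, MultiFilter

multi = MultiFilter()
multi.filters = [
    Filter(classname="weka.filters.unsupervised.attribute.StringToNominal",
           options=["-R", "2-last"]),
    Filter(classname="weka.filters.unsupervised.attribute.NominalToBinary",
           options=["-R", "2-last"]),
]
multi.inputformat(dataset.instances)
dataset.instances = multi.filter(dataset.instances)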
Example #27
def get_rule_covering_inst(classifier, data, inst_idx):
    """
    Finds the rule in a learned JRIP model that covers an instance
    :param classifier: trained JRIP model
    :param data: weka dataset
    :param inst_idx: instance ID to find corresponding rule of
    """
    merge_filter = Filter(
        classname="weka.filters.supervised.attribute.ClassOrder",
        options=["-C", "0"])
    merge_filter.inputformat(data)
    ordered_data = merge_filter.filter(data)

    rset = classifier.jwrapper.getRuleset()
    for i in range(rset.size()):
        r = rset.get(i)
        if r.covers(data.get_instance(inst_idx).jobject):
            print("Instance is covered by current rule:",
                  str(r.toString(ordered_data.class_attribute.jobject)))
            break
Example #28
def runBayes(file, bound):
    loader = Loader(classname="weka.core.converters.CSVLoader")
    data = loader.load_file(file)
    data.class_is_first()

    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                    options=["-R", bound])
    cls = Classifier(classname="weka.classifiers.bayes.NaiveBayes")

    remove.inputformat(data)
    filtered = remove.filter(data)

    evl = Evaluation(filtered)
    evl.crossvalidate_model(cls, filtered, 10, Random(1))

    print(evl.percent_correct)
    #print(evl.summary())
    result = evl.class_details()
    print(result)
    return result
Example #29
def make_partition(data, attributes, part='normal'):

    if part == 'normal':
        value = 'last'
    elif part == 'anomalous':
        value = 'first'
    else:
        raise ValueError("part must be 'normal' or 'anomalous'")

    keep_normal = Filter(
        classname='weka.filters.unsupervised.instance.RemoveWithValues',
        options=['-C', 'last', '-L', value])
    keep_normal.inputformat(data)
    data_normal = keep_normal.filter(data)

    remove = Filter(classname='weka.filters.unsupervised.attribute.Remove',
                    options=['-R', 'last'])
    remove.inputformat(data)
    data_normal = remove.filter(data_normal)

    N = data_normal.num_instances

    return data_normal, N
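
# Usage sketch for make_partition(): it assumes the class is the last
# attribute, with the anomalous label first and the normal label last.
# The `attributes` parameter is unused by the helper, so None is passed.
normal_data, n_normal = make_partition(data, None, part='normal')
anomalies, n_anomalous = make_partition(data, None, part='anomalous')
print(n_normal, "normal instances,", n_anomalous, "anomalous")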
Example #30
def weka_bayesnet(filearffpath='data/datatobayes.arff'):
    """Simple calling of the Bayesian network from python.
    """
    #Preparing the data
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(filearffpath)
    #data = loader.load_file('data/Full.arff')
    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                    options=["-R", "first"])
    remove.inputformat(data)
    filtered = data  # remove.filter(data) intentionally left disabled

    #Classifier test
    from weka.classifiers import Classifier, Evaluation
    from weka.core.classes import Random
    filtered.class_is_last()
    classifier = Classifier(classname="weka.classifiers.bayes.BayesNet",
                            options=['-D'])  #
    evaluation = Evaluation(filtered)
    evaluation.crossvalidate_model(classifier, filtered, 10, Random(42))
    return evaluation.area_under_roc(class_index=0)  #ROC, no std of kfold
Example #31
    def load(path, db):
        nominals = [
            49,  # dev_double_fp_config
            50,  # dev_endian_little
            51,  # dev_execution_capabilities
            52,  # dev_extensions
            54,  # dev_global_mem_cache_type
            57,  # dev_host_unified_memory
            63,  # dev_image_support
            65,  # dev_local_mem_type
            96,  # dev_queue_properties
            97,  # dev_single_fp_config
            98,  # dev_type
            100,  # dev_vendor_id
        ]
        nominal_indices = ",".join([str(index) for index in nominals])
        force_nominal = ["-N", nominal_indices]

        # Load data from CSV.
        dataset = Dataset.load_csv(path, options=force_nominal)
        dataset.__class__ = Dataset

        # Set class index and database connection.
        dataset.class_index = -1
        dataset.db = db

        # Create string->nominal type attribute filter, ignoring the first
        # attribute (scenario ID), since we're not classifying with it.
        string_to_nominal = WekaFilter(
            classname=("weka.filters.unsupervised."
                       "attribute.StringToNominal"),
            options=["-R", "2-last"],
        )
        string_to_nominal.inputformat(dataset.instances)

        # Create filtered dataset, and swap data around.
        filtered = string_to_nominal.filter(dataset.instances)
        dataset.instances = filtered

        return dataset
Example #32
    def get_weka_breast_cancer(self):
        split_ratio = 0.2

        loader = Loader(classname="weka.core.converters.CSVLoader")
        loader.options = ['-F', ',']
        dataset = loader.load_file(
            os.path.join(DATASET_DIR, 'uci-20070111-breast-cancer.csv'))
        dataset.class_is_last()
        remove = Filter(
            classname="weka.filters.unsupervised.instance.RemovePercentage",
            options=["-P", str(split_ratio * 100)])
        remove.inputformat(dataset)
        train_set = remove.filter(dataset)
        remove = Filter(
            classname="weka.filters.unsupervised.instance.RemovePercentage",
            options=["-P", str(split_ratio * 100), "-V"])
        remove.inputformat(dataset)
        test_set = remove.filter(dataset)

        labels = dataset.class_attribute.values

        return train_set, test_set, labels
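
# The RemovePercentage idiom above generalizes to any split ratio: -P removes
# the first percentage of instances (leaving a training set) and -V inverts
# the selection (leaving the complementary test set). A reusable sketch,
# assuming Filter is imported from weka.filters:
def percentage_split(dataset, percent):
    classname = "weka.filters.unsupervised.instance.RemovePercentage"
    train_filter = Filter(classname=classname, options=["-P", str(percent)])
    train_filter.inputformat(dataset)
    train_set = train_filter.filter(dataset)
    test_filter = Filter(classname=classname, options=["-P", str(percent), "-V"])
    test_filter.inputformat(dataset)
    test_set = test_filter.filter(dataset)
    return train_set, test_set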
Example #33
    def load(path, db):
        nominals = [
            49,  # dev_double_fp_config
            50,  # dev_endian_little
            51,  # dev_execution_capabilities
            52,  # dev_extensions
            54,  # dev_global_mem_cache_type
            57,  # dev_host_unified_memory
            63,  # dev_image_support
            65,  # dev_local_mem_type
            96,  # dev_queue_properties
            97,  # dev_single_fp_config
            98,  # dev_type
            100, # dev_vendor_id
        ]
        nominal_indices = ",".join([str(index) for index in nominals])
        force_nominal = ["-N", nominal_indices]

        # Load data from CSV.
        dataset = Dataset.load_csv(path, options=force_nominal)
        dataset.__class__ = Dataset

        # Set class index and database connection.
        dataset.class_index = -1
        dataset.db = db

        # Create string->nominal type attribute filter, ignoring the first
        # attribute (scenario ID), since we're not classifying with it.
        string_to_nominal = WekaFilter(classname=("weka.filters.unsupervised."
                                                  "attribute.StringToNominal"),
                                       options=["-R", "2-last"])
        string_to_nominal.inputformat(dataset.instances)

        # Create filtered dataset, and swap data around.
        filtered = string_to_nominal.filter(dataset.instances)
        dataset.instances = filtered

        return dataset
Example #34
    def load(path, db):
        nominals = [
            49,  # dev_global_mem_cache_type
            52,  # dev_host_unified_memory
            54,  # dev_local_mem_type
            56,  # dev_type
            57,  # dev_vendor
        ]
        nominal_indices = ",".join([str(index) for index in nominals])
        force_nominal = ["-N", nominal_indices]

        # Load data from CSV.
        dataset = Dataset.load_csv(path, options=force_nominal)
        dataset.__class__ = Dataset

        # Set class index and database connection.
        dataset.class_index = -1
        dataset.db = db

        # Create string->nominal type attribute filter, ignoring the first
        # attribute (scenario ID), since we're not classifying with it.
        string_to_nominal = WekaFilter(classname=("weka.filters.unsupervised."
                                                  "attribute.StringToNominal"),
                                       options=["-R", "2-last"])
        string_to_nominal.inputformat(dataset.instances)

        # Create filtered dataset, and swap data around.
        filtered = string_to_nominal.filter(dataset.instances)

        # Create nominal->binary type attribute filter, ignoring the
        # first attribute (scenario ID), since we're not classifying with it.
        n2b = WekaFilter(classname="weka.filters.unsupervised.attribute.NominalToBinary",
                         options=["-R", "2-last"])
        n2b.inputformat(filtered)

        dataset.instances = n2b.filter(filtered)

        return dataset
Example #35
def obtainBayesNet(file):
    #The path of the arff extension file must be put.
    data = converters.load_any_file(folderPathOfArffFiles + file + ".arff")

    #In the case of this specific data set, the first two attributes were removed since they
    #   represent the name and ranking which are unique values that would affect the classification.
    #   Depending on the data set, certain attributes must be removed.
    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                    options=["-R", "1-2"])
    remove.inputformat(data)
    data = remove.filter(data)
    #It is specified that the class value is the last attribute.
    data.class_is_last()

    #Define the classifier to be used.
    classifier = Classifier(classname="weka.classifiers.bayes.BayesNet")
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(classifier, data, kFold, Random(42))

    #The ROC-AUC is extracted from the string that is received from Weka.
    info = evaluation.class_details()
    roc_area = float(info[406:411])

    return roc_area
Example #36
	def select_missclassified(self):
		remove = Filter(classname="weka.filters.supervised.attribute.AddClassification", options=['-classification' ,'-error' ,'-W' ,self.base_classifier.to_commandline()])
		remove.inputformat(self.data)
		self.data = remove.filter(self.data)

		remove = Filter(classname="weka.filters.unsupervised.instance.RemoveWithValues", options=['-S','0.0','-C','last','-L','last','-V'])
		remove.inputformat(self.data)
		self.data = remove.filter(self.data)  # keep only the instances flagged as misclassified

		remove = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=['-R',str(self.data.num_attributes-2)+',last'])
		remove.inputformat(self.data)
		self.data = remove.filter(self.data)
def build_and_classify(classifier, classifier_name, approach_name, infile, percentage='10'):
    """
    Creates model and classifies against input data. Returns accuracy statistics
    """
    # set seed so results are consistent
    random.seed('iot')

    # load data
    loader = Loader(classname='weka.core.converters.CSVLoader')
    data = loader.load_file(infile)
    data.class_is_last()

    # convert all numeric attributes to nominal
    to_nominal = Filter(classname='weka.filters.unsupervised.attribute.NumericToNominal',
                        options=['-R', 'first-last'])
    to_nominal.inputformat(data)
    data = to_nominal.filter(data)

    # randomize data with constant seed
    randomize = Filter(classname='weka.filters.unsupervised.instance.Randomize',
                       options=['-S', '42'])
    randomize.inputformat(data)

    data = randomize.filter(data)

    # create training set and testing set
    train_percent_filter = Filter(classname='weka.filters.unsupervised.instance.RemovePercentage',
                                  options=['-P', percentage, '-V'])
    train_percent_filter.inputformat(data)

    train = train_percent_filter.filter(data)
    test = data

    # build and test classifier
    classifier.build_classifier(train)
    evaluation = Evaluation(train)
    evaluation.test_model(classifier, test)

    # return results as array
    results = [
        approach_name,
        classifier_name,
        percentage,
        evaluation.percent_correct,
        evaluation.weighted_f_measure
    ]
    return results
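
# Usage sketch for build_and_classify(), assuming a CSV whose last column is
# the class; the classifier, labels and file name are placeholders.
import weka.core.jvm as jvm
from weka.classifiers import Classifier

jvm.start()
j48 = Classifier(classname="weka.classifiers.trees.J48")
results = build_and_classify(j48, "J48", "baseline", "traffic.csv")
print(results)
jvm.stop()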
def testing():
    logging.disable("weka")

    print "PROSES KLASIFIKASI\n------------------"

    jvm.start()

    pruning = 0
    while pruning < 2:

        persen_train = 0
        while persen_train < 4:

            fitur_hapus = 15
            while fitur_hapus >= 0:

                list_akurasi = []
                list_recall = []
                list_presisi = []
                list_fmeasure = []
                list_roc = []
                count = 0

                nama = "hasilTest/"
                if(pruning == 0):
                    nama += "unpruning"
                    if(persen_train == 0):
                        nama += "40"
                    elif(persen_train == 1):
                        nama += "50"
                    elif(persen_train == 2):
                        nama += "60"
                    else:
                        nama += "70"
                else:
                    nama += "pruning"
                    if(persen_train == 0):
                        nama += "40"
                    elif(persen_train == 1):
                        nama += "50"
                    elif(persen_train == 2):
                        nama += "60"
                    else:
                        nama += "70"

                if(fitur_hapus > 0):
                    nama += "removeF" + str(fitur_hapus) + ".txt"
                else:
                    nama += "normal.txt"

                f = open(nama, "w")

                if(pruning == 0):
                    nama = "unpruning"
                    print("Without pruning")
                    f.write("Results of C4.5 decision tree without pruning (unpruned)\n")
                    if(persen_train == 0):
                        nama += "40"
                        f.write("With a training set of 40%\n")
                    elif(persen_train == 1):
                        nama += "50"
                        f.write("With a training set of 50%\n")
                    elif(persen_train == 2):
                        nama += "60"
                        f.write("With a training set of 60%\n")
                    else:
                        nama += "70"
                        f.write("With a training set of 70%\n")
                else:
                    nama = "pruning"
                    print("With pruning")
                    f.write("Results of C4.5 decision tree with pruning\n")
                    if(persen_train == 0):
                        nama += "40"
                        f.write("With a training set of 40%\n")
                    elif(persen_train == 1):
                        nama += "50"
                        f.write("With a training set of 50%\n")
                    elif(persen_train == 2):
                        nama += "60"
                        f.write("With a training set of 60%\n")
                    else:
                        nama += "70"
                        f.write("With a training set of 70%\n")

                if(fitur_hapus > 0):
                    f.write("Using remove on feature " + str(fitur_hapus) + "\n\n")
                else:
                    f.write("\n")

                f.write("No. Accuracy Recall Precision F-Measure ROC\n")

                if persen_train == 0:
                    print("40% training data")
                elif persen_train == 1:
                    print("50% training data")
                elif persen_train == 2:
                    print("60% training data")
                else:
                    print("70% training data")

                print("Removed feature:", fitur_hapus)
                print("\nNo.\tAccuracy\tRecall\tPrecision\tF-Measure\tROC")
                while count < 100:
                    loader = Loader(classname = "weka.core.converters.ArffLoader")
                    data = loader.load_file("hasil.arff")
                    data.class_is_last()

                    if(fitur_hapus > 0):
                        remove = Filter(classname = "weka.filters.unsupervised.attribute.Remove", options = ["-R", str(fitur_hapus)])
                        remove.inputformat(data)
                        data_baru = remove.filter(data)
                        data_baru.class_is_last()
                    else:
                        data_baru = loader.load_file("hasil.arff")
                        data_baru.class_is_last()

                    randomize = Filter(classname = "weka.filters.unsupervised.instance.Randomize", options = ["-S", str(int(time.time()))])
                    randomize.inputformat(data_baru)
                    data_random = randomize.filter(data_baru)
                    data_random.class_is_last()

                    if(pruning == 0):
                        classifier = Classifier(classname = "weka.classifiers.trees.J48", options = ["-U"])
                    else:
                        classifier = Classifier(classname = "weka.classifiers.trees.J48", options = ["-C", "0.25"])

                    evaluation = Evaluation(data_random)
                    if(persen_train == 0):
                        evaluation.evaluate_train_test_split(classifier, data_random, percentage = 40)
                    elif(persen_train == 1):
                        evaluation.evaluate_train_test_split(classifier, data_random, percentage = 50)
                    elif(persen_train == 2):
                        evaluation.evaluate_train_test_split(classifier, data_random, percentage = 60)
                    else:
                        evaluation.evaluate_train_test_split(classifier, data_random, percentage = 70)

                    f.write(str(count + 1) + ". " + str(evaluation.weighted_true_positive_rate) + " " + str(evaluation.weighted_recall) + " " + str(evaluation.weighted_precision) + " " + str(evaluation.weighted_f_measure) + " " + str(evaluation.weighted_area_under_roc) + "\n")
                    print(count + 1, evaluation.weighted_true_positive_rate, evaluation.weighted_recall, evaluation.weighted_precision, evaluation.weighted_f_measure, evaluation.weighted_area_under_roc)

                    list_akurasi.append(evaluation.weighted_true_positive_rate)
                    list_recall.append(evaluation.weighted_recall)
                    list_presisi.append(evaluation.weighted_precision)
                    list_fmeasure.append(evaluation.weighted_f_measure)
                    list_roc.append(evaluation.weighted_area_under_roc)

                    count += 1
                    time.sleep(1)

                list_akurasi.sort()
                list_recall.sort()
                list_presisi.sort()
                list_fmeasure.sort()
                list_roc.sort()

                f.write( ""  + "\n")
                f.write( "Rata-Rata"  + "\n")
                f.write( "Akurasi:" + str(sum(list_akurasi) / 100.0)  + "\n")
                f.write( "Recall:" + str(sum(list_recall) / 100.0)  + "\n")
                f.write( "Presisi:" + str(sum(list_presisi) / 100.0)  + "\n")
                f.write( "F-Measure:" + str(sum(list_fmeasure) / 100.0)  + "\n")
                f.write( "ROC:" + str(sum(list_roc) / 100.0)  + "\n")
                f.write( ""  + "\n")
                f.write( "Max"  + "\n")
                f.write( "Akurasi:" + str(list_akurasi[-1] ) + "\n")
                f.write( "Recall:" + str(list_recall[-1] ) + "\n")
                f.write( "Presisi:" + str(list_presisi[-1] ) + "\n")
                f.write( "F-Measure:" + str(list_fmeasure[-1] ) + "\n")
                f.write( "ROC:" + str(list_roc[-1] ) + "\n")
                f.write( ""  + "\n")
                f.write( "Min"  + "\n")
                f.write( "Akurasi:" + str(list_akurasi[0] ) + "\n")
                f.write( "Recall:" + str(list_recall[0] ) + "\n")
                f.write( "Presisi:" + str(list_presisi[0] ) + "\n")
                f.write( "F-Measure:" + str(list_fmeasure[0] ) + "\n")
                f.write( "ROC:" + str(list_roc[0] ) + "\n")
                f.write( ""  + "\n")

                print ""
                print "Rata-Rata"
                print "Akurasi:", sum(list_akurasi) / 100.0
                print "Recall:", sum(list_recall) / 100.0
                print "Presisi:", sum(list_presisi) / 100.0
                print "F-Measure:", sum(list_fmeasure) / 100.0
                print "ROC:", sum(list_roc) / 100.0
                print ""
                print "Max"
                print "Akurasi:", list_akurasi[-1]
                print "Recall:", list_recall[-1]
                print "Presisi:", list_presisi[-1]
                print "F-Measure:", list_fmeasure[-1]
                print "ROC:", list_roc[-1]
                print ""
                print "Min"
                print "Akurasi:", list_akurasi[0]
                print "Recall:", list_recall[0]
                print "Presisi:", list_presisi[0]
                print "F-Measure:", list_fmeasure[0]
                print "ROC:", list_roc[0]
                print ""

                f.close()
                fitur_hapus -= 1

            persen_train += 1

        pruning += 1

    jvm.stop()
Example #39
# load diabetes
loader = Loader(classname="weka.core.converters.ArffLoader")
fname = data_dir + os.sep + "diabetes.arff"
print("\nLoading dataset: " + fname + "\n")
data = loader.load_file(fname)
data.class_is_last()

# simulate the 10 train/test pairs of cross-validation
evl = Evaluation(data)
for i in range(1, 11):
    # create train set
    remove = Filter(
        classname="weka.filters.supervised.instance.StratifiedRemoveFolds",
        options=["-N", "10", "-F", str(i), "-S", "1", "-V"])
    remove.inputformat(data)
    train = remove.filter(data)

    # create test set
    remove = Filter(
        classname="weka.filters.supervised.instance.StratifiedRemoveFolds",
        options=["-N", "10", "-F", str(i), "-S", "1"])
    remove.inputformat(data)
    test = remove.filter(data)

    cls = Classifier(classname="weka.classifiers.trees.J48")
    cls.build_classifier(train)
    evl.test_model(cls, test)

print("Simulated CV accuracy: %0.1f%%" % evl.percent_correct)
def main():
    """
    Just runs some example code.
    """

    # load a dataset
    data_file = helper.get_data_dir() + os.sep + "vote.arff"
    helper.print_info("Loading dataset: " + data_file)
    loader = Loader("weka.core.converters.ArffLoader")
    data = loader.load_file(data_file)
    data.class_is_last()

    # classifier
    classifier = Classifier(classname="weka.classifiers.trees.J48")

    # randomize data
    folds = 10
    seed = 1
    rnd = Random(seed)
    rand_data = Instances.copy_instances(data)
    rand_data.randomize(rnd)
    if rand_data.class_attribute.is_nominal:
        rand_data.stratify(folds)

    # perform cross-validation and add predictions
    predicted_data = None
    evaluation = Evaluation(rand_data)
    for i in range(folds):
        train = rand_data.train_cv(folds, i)
        # the above code is used by the StratifiedRemoveFolds filter,
        # the following code is used by the Explorer/Experimenter
        # train = rand_data.train_cv(folds, i, rnd)
        test = rand_data.test_cv(folds, i)

        # build and evaluate classifier
        cls = Classifier.make_copy(classifier)
        cls.build_classifier(train)
        evaluation.test_model(cls, test)

        # add predictions
        addcls = Filter(
            classname="weka.filters.supervised.attribute.AddClassification",
            options=["-classification", "-distribution", "-error"])
        # setting the java object directly avoids issues with correct quoting in the option array
        addcls.set_property("classifier", Classifier.make_copy(classifier))
        addcls.inputformat(train)
        addcls.filter(train)  # trains the classifier
        pred = addcls.filter(test)
        if predicted_data is None:
            predicted_data = Instances.template_instances(pred, 0)
        for n in range(pred.num_instances):
            predicted_data.add_instance(pred.get_instance(n))

    print("")
    print("=== Setup ===")
    print("Classifier: " + classifier.to_commandline())
    print("Dataset: " + data.relationname)
    print("Folds: " + str(folds))
    print("Seed: " + str(seed))
    print("")
    print(evaluation.summary("=== " + str(folds) + " -fold Cross-Validation ==="))
    print("")
    print(predicted_data)
Example #41
def run_classifier(path, prot, sel, cols, prot_vals, beta):
        
    DIs = dict()
    jvm.start()

    for i in range(len(cols)-1):
        loader = Loader(classname="weka.core.converters.CSVLoader")
        data = loader.load_file(path)
    
        # remove selected attribute from the data
        # NOTE: options are ONE indexed, not ZERO indexed
        remove = Filter(classname="weka.filters.unsupervised.attribute.Remove", \
                        options=["-R", str(sel[2]+1)])
        remove.inputformat(data)
        data = remove.filter(data)

        # if running for only one attribue, remove all others (except protected)
        if i > 0:
            for j in range(1, prot[2]+1):
                if i != j:
                    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove", \
                                    options=["-R", ("1" if i>j else "2")])
                    remove.inputformat(data)
                    data = remove.filter(data)

        # set prot attribute as Class attribute
        data.class_is_last()
        
        # run classifier
        cls = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
        cls.build_classifier(data)
    
        # count the number of each combination
        pos_and_pred = 0.0
        pos_and_not_pred = 0.0
        neg_and_pred = 0.0
        neg_and_not_pred = 0.0
        for ind, inst in enumerate(data):
            if cls.classify_instance(inst):
                if prot_vals[ind] == prot[1]:
                    pos_and_pred += 1
                else:
                    neg_and_pred += 1
            else:
                if prot_vals[ind] == prot[1]:
                    pos_and_not_pred += 1
                else:
                    neg_and_not_pred += 1

        # calculate DI
        BER = ((pos_and_not_pred / (pos_and_pred + pos_and_not_pred)) + \
               (neg_and_pred / (neg_and_pred + neg_and_not_pred))) * 0.5
        if BER > 0.5:
            BER = 1 - BER
        DI = 1 - ((1 - 2 * BER) / (beta + 1 - 2 * BER))

        if i == 0: # consider changing this to a 'code word' instead of 'all'
            DIs["all"] = DI
        else:
            DIs[cols[i-1]] = DI

    jvm.stop()

    return DIs
Example #42
import os
import weka.core.jvm as jvm  # needed for jvm.start() below
from weka.core.converters import Loader
from weka.clusterers import Clusterer, ClusterEvaluation
from weka.filters import Filter
import weka.plot.clusterers as plc

# data_dir is assumed to be defined earlier in the source script

jvm.start()

# load iris
fname = data_dir + os.sep + "iris.arff"
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file(fname)

# remove class attribute
flt = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "last"])
flt.inputformat(data)
filtered = flt.filter(data)

# build KMeans
print("\n--> SimpleKMeans\n")
cl = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "3"])
cl.build_clusterer(filtered)
evl = ClusterEvaluation()
evl.set_model(cl)
evl.test_model(filtered)
print(evl.cluster_results)
plc.plot_cluster_assignments(evl, data, atts=[], inst_no=True, wait=True)

# use AddCluster filter
print("\n--> AddCluster filter\n")
flt = Filter(classname="weka.filters.unsupervised.attribute.AddCluster",
Example #43
fname = data_dir + os.sep + "glass.arff"
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file(fname)
data.class_is_last()

# cross-validate J48
cls = Classifier(classname="weka.classifiers.trees.J48")
evl = Evaluation(data)
evl.crossvalidate_model(cls, data, 10, Random(1))
print("All attributes: %0.0f%%" % evl.percent_correct)

# remove attributes (1) and cross-validate J48
atts = "RI|Mg|Type"
flt = Filter(classname="weka.filters.unsupervised.attribute.RemoveByName", options=["-E", "(" + atts + ")", "-V"])
flt.inputformat(data)
filtered = flt.filter(data)
cls = Classifier(classname="weka.classifiers.trees.J48")
evl = Evaluation(filtered)
evl.crossvalidate_model(cls, filtered, 10, Random(1))
print(atts + ": %0.0f%%" % evl.percent_correct)

# remove attributes (2) and cross-validate J48
atts = "RI|Na|Mg|Ca|Ba|Type"
flt = Filter(classname="weka.filters.unsupervised.attribute.RemoveByName", options=["-E", "(" + atts + ")", "-V"])
flt.inputformat(data)
filtered = flt.filter(data)
cls = Classifier(classname="weka.classifiers.trees.J48")
evl = Evaluation(filtered)
evl.crossvalidate_model(cls, filtered, 10, Random(1))
print(atts + ": %0.0f%%" % evl.percent_correct)
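
# RemoveByName matches attribute names against a regular expression and -V
# inverts the match, so the calls above keep only the listed attributes.
# A reusable sketch of the same idiom:
def keep_by_name(data, pattern):
    flt = Filter(classname="weka.filters.unsupervised.attribute.RemoveByName",
                 options=["-E", "(" + pattern + ")", "-V"])
    flt.inputformat(data)
    return flt.filter(data)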
Example #44
    def runner(self, cdat, heap_size = 16384, seed = None, verbose = True):
        self.set_status(Pipeline.RUNNING)

        self.logs.append('Initializing Pipeline')

        para = self.config

        self.logs.append('Reading Pipeline Configuration')

        head = ''
        name = get_rand_uuid_str()

        self.logs.append('Reading Input File')

        for i, stage in enumerate(self.stages):
            if stage.code in ('dat.fle', 'prp.bgc', 'prp.nrm', 'prp.pmc', 'prp.sum'):
                self.stages[i].status = Pipeline.RUNNING
            if stage.code ==  'dat.fle':
                head    = os.path.abspath(stage.value.path)
                name, _ = os.path.splitext(stage.value.name)

        self.logs.append('Parsing to ARFF')

        path = os.path.join(head, '{name}.arff'.format(name = name))
        # This breaks for reasons unknown, so Config.schema is used instead.
        # cdat.toARFF(path, express_config = para.Preprocess.schema, verbose = verbose)

        for i, stage in enumerate(self.stages):
            if stage.code in ('dat.fle', 'prp.bgc', 'prp.nrm', 'prp.pmc', 'prp.sum'):
                self.stages[i].status = Pipeline.COMPLETE

        self.logs.append('Saved ARFF at {path}'.format(path = path))
        self.logs.append('Splitting to Training and Testing Sets')

        JVM.start(max_heap_size = '{size}m'.format(size = heap_size))

        load = Loader(classname = 'weka.core.converters.ArffLoader')
        # data = load.load_file(path)
        # save =  Saver(classname = 'weka.core.converters.ArffSaver')
        data = load.load_file(os.path.join(head, 'iris.arff')) # For Debugging Purposes Only
        data.class_is_last() # For Debugging Purposes Only
        # data.class_index = cdat.iclss

        for i, stage in enumerate(self.stages):
            if stage.code == 'prp.kcv':
                self.stages[i].status = Pipeline.RUNNING

        self.logs.append('Splitting Training Set')

        # TODO - Check if this seed is worth it.
        seed = assign_if_none(seed, random.randint(0, 1000))
        opts = ['-S', str(seed), '-N', str(para.Preprocess.FOLDS)]
        wobj = Filter(classname = 'weka.filters.supervised.instance.StratifiedRemoveFolds', options = opts + ['-V'])
        wobj.inputformat(data)

        tran = wobj.filter(data)

        self.logs.append('Splitting Testing Set')

        wobj.options = opts
        test = wobj.filter(data)

        for i, stage in enumerate(self.stages):
            if stage.code == 'prp.kcv':
                self.stages[i].status = Pipeline.COMPLETE

        self.logs.append('Performing Feature Selection')

        feat = [ ]
        for comb in para.FEATURE_SELECTION:
            if comb.USE:
                for i, stage in enumerate(self.stages):
                    if stage.code == 'ats':
                        search    = stage.value.search.name
                        evaluator = stage.value.evaluator.name

                        if search == comb.Search.NAME and evaluator == comb.Evaluator.NAME:
                            self.stages[i].status = Pipeline.RUNNING

                srch = ASSearch(
                    classname = 'weka.attributeSelection.{classname}'.format(classname = comb.Search.NAME),
                    options   = assign_if_none(comb.Search.OPTIONS, [ ])
                )
                ewal = ASEvaluation(
                    classname = 'weka.attributeSelection.{classname}'.format(classname = comb.Evaluator.NAME),
                    options   = assign_if_none(comb.Evaluator.OPTIONS, [ ])
                )

                attr = AttributeSelection()
                attr.search(srch)
                attr.evaluator(ewal)
                attr.select_attributes(tran)
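                # note: the returned indices also include the class attribute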

                meta = addict.Dict()
                meta.search    = comb.Search.NAME
                meta.evaluator = comb.Evaluator.NAME
                meta.features  = [tran.attribute(index).name for index in attr.selected_attributes]

                feat.append(meta)

                for i, stage in enumerate(self.stages):
                    if stage.code == 'ats':
                        search    = stage.value.search.name
                        evaluator = stage.value.evaluator.name

                        if search == comb.Search.NAME and evaluator == comb.Evaluator.NAME:
                            self.stages[i].status = Pipeline.COMPLETE

        models = [ ]
        for model in para.MODEL:
            if model.USE:
                summary         = addict.Dict()

                self.logs.append('Modelling {model}'.format(model = model.LABEL))

                summary.label   = model.LABEL
                summary.name    = model.NAME
                summary.options = assign_if_none(model.OPTIONS, [ ])

                for i, stage in enumerate(self.stages):
                    if stage.code == 'lrn' and stage.value.name == model.NAME:
                        self.stages[i].status = Pipeline.RUNNING

                # the number of class labels is identical for every instance, so a
                # single lookup replaces the original per-instance loop
                iclass = list(range(data.class_attribute.num_values))

                options    = assign_if_none(model.OPTIONS, [ ])
                classifier = Classifier(classname = 'weka.classifiers.{classname}'.format(classname = model.NAME), options = options)
                classifier.build_classifier(tran)
        
                serializer.write(os.path.join(head, '{name}.{classname}.model'.format(
                    name      = name,
                    classname = model.NAME
                )), classifier)

                self.logs.append('Testing model {model}'.format(model = model.LABEL))

                evaluation       = Evaluation(tran)
                evaluation.test_model(classifier, test)

                summary.summary  = evaluation.summary()

                frame  = pd.DataFrame(data = evaluation.confusion_matrix)
                axes   = sns.heatmap(frame, cbar = False, annot = True)
                b64str = get_b64_plot(axes)
                
                summary.confusion_matrix = addict.Dict({
                    'value': evaluation.confusion_matrix.tolist(),
                     'plot': b64str
                })

                self.logs.append('Plotting Learning Curve for {model}'.format(model = model.LABEL))

                buffer = io.BytesIO()
                plot_classifier_errors(evaluation.predictions, tran, test, outfile = buffer, wait = False)
                b64str = buffer_to_b64(buffer)

                summary.learning_curve   = b64str

                buffer = io.BytesIO()
                plot_roc(evaluation, class_index = iclass, outfile = buffer, wait = False)
                b64str = buffer_to_b64(buffer)

                summary.roc_curve        = b64str

                buffer = io.BytesIO()
                plot_prc(evaluation, class_index = iclass, outfile = buffer, wait = False)
                b64str = buffer_to_b64(buffer)

                summary.prc_curve        = b64str

                if classifier.graph:
                    summary.graph = classifier.graph

                # per-instance predictions are computed here but not stored anywhere
                for i, instance in enumerate(test):
                    prediction = classifier.classify_instance(instance)

                for i, stage in enumerate(self.stages):
                    if stage.code == 'lrn' and stage.value.name == model.NAME:
                        self.stages[i].status = Pipeline.COMPLETE

                models.append(summary)

        self.gist.models = models

        JVM.stop()

        JSON.write(os.path.join(head, '{name}.cgist'.format(name = name)), self.gist)

        self.logs.append('Pipeline Complete')

        self.set_status(Pipeline.COMPLETE)
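The train/test split in this pipeline hinges on running StratifiedRemoveFolds twice with inverted selection; a self-contained sketch of that pattern (file name, seed and fold count are assumptions):

import weka.core.jvm as jvm
from weka.core.converters import Loader
from weka.filters import Filter

jvm.start()

loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file("iris.arff")  # assumption: any ARFF file with a class attribute
data.class_is_last()

opts = ["-S", "1", "-N", "10"]  # seed 1, 10 folds
# "-V" inverts the selection: every fold except the held-out one -> training set
split = Filter(classname="weka.filters.supervised.instance.StratifiedRemoveFolds", options=opts + ["-V"])
split.inputformat(data)
train = split.filter(data)

# without "-V" only the held-out fold is kept -> testing set
split.options = opts
split.inputformat(data)
test = split.filter(data)

print(train.num_instances, test.num_instances)

jvm.stop()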
Exemple #45
0
	def merge_nominal_attributes(self, significance=0.01):
		merge = Filter(classname="weka.filters.supervised.attribute.MergeNominalValues", options=['-L', str(significance), '-R', 'first-last'])
		merge.inputformat(self.data)
		self.data = merge.filter(self.data)
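A hedged usage sketch for the method above, assuming ds is an instance of the surrounding class with ds.data holding a loaded Instances object:

# hypothetical: merge statistically indistinguishable nominal values at the 5% level
ds.merge_nominal_attributes(significance=0.05)
print(ds.data.num_attributes)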
Exemple #46
0
def main():
    """
    Just runs some example code.
    """

    # load a dataset
    iris = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris)
    loader = Loader("weka.core.converters.ArffLoader")
    data = loader.load_file(iris)

    # remove class attribute
    helper.print_info("Removing class attribute")
    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "last"])
    remove.inputformat(data)
    filtered = remove.filter(data)

    # use MultiFilter
    helper.print_info("Use MultiFilter")
    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "first"])
    std = Filter(classname="weka.filters.unsupervised.attribute.Standardize")
    multi = MultiFilter()
    multi.filters = [remove, std]
    multi.inputformat(data)
    filtered_multi = multi.filter(data)

    # output datasets
    helper.print_title("Input")
    print(data)
    helper.print_title("Output")
    print(filtered)
    helper.print_title("Output (MultiFilter)")
    print(filtered_multi)

    # load text dataset
    text = helper.get_data_dir() + os.sep + "reutersTop10Randomized_1perc_shortened.arff"
    helper.print_info("Loading dataset: " + text)
    loader = Loader("weka.core.converters.ArffLoader")
    data = loader.load_file(text)
    data.class_is_last()

    # apply StringToWordVector
    stemmer = Stemmer(classname="weka.core.stemmers.IteratedLovinsStemmer")
    stopwords = Stopwords(classname="weka.core.stopwords.Rainbow")
    tokenizer = Tokenizer(classname="weka.core.tokenizers.WordTokenizer")
    s2wv = StringToWordVector(options=["-W", "10", "-L", "-C"])
    s2wv.stemmer = stemmer
    s2wv.stopwords = stopwords
    s2wv.tokenizer = tokenizer
    s2wv.inputformat(data)
    filtered = s2wv.filter(data)

    helper.print_title("Input (StringToWordVector)")
    print(data)
    helper.print_title("Output (StringToWordVector)")
    print(filtered)

    # partial classname
    helper.print_title("Creating filter from partial classname")
    clsname = ".Standardize"
    f = Filter(classname=clsname)
    print(clsname + " --> " + f.classname)

    # source code
    helper.print_info("Generate source code")
    labor = helper.get_data_dir() + os.sep + "labor.arff"
    helper.print_info("Loading dataset: " + labor)
    loader = Loader("weka.core.converters.ArffLoader")
    data = loader.load_file(labor)
    replace = Filter(classname="weka.filters.unsupervised.attribute.ReplaceMissingValues")
    replace.inputformat(data)
    replace.filter(data)
    print(replace.to_source("MyReplaceMissingValues", data))
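
    # sketch (assumption): since to_source returns the generated Java class as a
    # string, it can be written straight to a .java file
    with open("MyReplaceMissingValues.java", "w") as f:
        f.write(replace.to_source("MyReplaceMissingValues", data))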
Exemple #47
0
    def remove_attributes(self, *attributes):
        indices = [self.attribute_index(x) for x in attributes]
        remove = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                        options=["-R", ','.join(str(x + 1) for x in indices)])
        remove.inputformat(self.instances)
        self.instances = remove.filter(self.instances)
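A hedged usage sketch, assuming ds is an instance of the surrounding class and ds.instances holds the iris data:

# hypothetical: drop two attributes by name; attribute_index returns 0-based
# positions while Remove expects 1-based ones, hence the x + 1 above
ds.remove_attributes("sepallength", "sepalwidth")
print(ds.instances.num_attributes)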
Exemple #48
0
import os

import weka.core.jvm as jvm
from weka.core.converters import Loader, Saver
from weka.core.dataset import Instances
from weka.filters import Filter

# assumption: data_dir points at the directory containing the sample ARFF files
data_dir = "."

jvm.start()

# load weather.nominal
fname = data_dir + os.sep + "weather.nominal.arff"
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file(fname)

# output header
print(Instances.template_instances(data))

# remove attribute no 3
print("\nRemove attribute no 3")
fltr = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "3"])
fltr.inputformat(data)
filtered = fltr.filter(data)

# output header
print(Instances.template_instances(filtered))

# save modified dataset
saver = Saver(classname="weka.core.converters.ArffSaver")
saver.save_file(filtered, data_dir + os.sep + "weather.nominal-filtered.arff")
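
# sketch (assumption): CSVSaver exposes the same save_file API as ArffSaver,
# so a CSV export of the filtered data only needs a different converter classname
csv_saver = Saver(classname="weka.core.converters.CSVSaver")
csv_saver.save_file(filtered, data_dir + os.sep + "weather.nominal-filtered.csv")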

jvm.stop()

Exemple #49
0
import os

import weka.core.jvm as jvm
from weka.core.classes import Random
from weka.core.converters import Loader
from weka.classifiers import Classifier, Evaluation, PredictionOutput
from weka.filters import Filter

# assumption: data_dir points at the directory containing the sample ARFF files
data_dir = "."

jvm.start()

# load diabetes
loader = Loader(classname="weka.core.converters.ArffLoader")
fname = data_dir + os.sep + "diabetes.arff"
print("\nLoading dataset: " + fname + "\n")
data = loader.load_file(fname)
# we'll set the class attribute after filtering

# apply NominalToBinary filter and set class attribute
fltr = Filter("weka.filters.unsupervised.attribute.NominalToBinary")
fltr.inputformat(data)
filtered = fltr.filter(data)
filtered.class_is_last()

# cross-validate LinearRegression on filtered data, display model
cls = Classifier(classname="weka.classifiers.functions.LinearRegression")
pout = PredictionOutput(classname="weka.classifiers.evaluation.output.prediction.PlainText")
evl = Evaluation(filtered)
evl.crossvalidate_model(cls, filtered, 10, Random(1), pout)
print("10-fold cross-validation:\n" + evl.summary())
print("Predictions:\n\n" + str(pout))
cls.build_classifier(filtered)
print("Model:\n\n" + str(cls))

# use AddClassification filter with LinearRegression on filtered data
print("Applying AddClassification to filtered data:\n")