def emlimitateUnusedFeature(self, trainData, testData = None):
        trainData.set_class_index(trainData.num_attributes() - 1)   # set class attribute
        featureIndex = -1       
        filteredTrainData = trainData
        filteredTestData = testData
        

        attribute_index = 0

        while attribute_index < filteredTrainData.num_attributes() - 1:
            sampleCoverage = 0
            #print attribute_index
            # check value for current feature in each instance
            for instance_index in range(0, filteredTrainData.num_instances()):
                instance = filteredTrainData.get_instance(instance_index)
                value = instance.get_value(attribute_index)
                
                if value > 0:
                    sampleCoverage += 1
            if sampleCoverage == 0:
                # the Remove filter's "-R" option uses 1-based attribute indices
                remove = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", str(attribute_index+1)])
                remove.set_inputformat(filteredTrainData)
                filteredTrainData = remove.filter(filteredTrainData)
                if filteredTestData:
                    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", str(attribute_index+1)])
                    remove.set_inputformat(filteredTestData)
                    filteredTestData = remove.filter(filteredTestData)
            else:
                attribute_index += 1

        return [filteredTrainData, filteredTestData]
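
A minimal driver for the method above (a sketch: FeatureHelper stands in for whatever class owns the method, and it uses the same legacy python-weka-wrapper 0.x API as the snippet itself):

import weka.core.jvm as jvm
from weka.core.converters import Loader

jvm.start()
loader = Loader(classname="weka.core.converters.ArffLoader")
train = loader.load_file("train.arff")  # hypothetical paths
test = loader.load_file("test.arff")
helper = FeatureHelper()  # hypothetical owner of emlimitateUnusedFeature
train, test = helper.emlimitateUnusedFeature(train, test)
jvm.stop()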
Example #2
def run(dataset_path):
    start = time.time()

    ### load a dataset ###
    train_data = model.load_dataset_weka(dataset_path)
    to_nominal_class_filter = Filter(
        classname="weka.filters.unsupervised.attribute.NumericToNominal",
        options=["-R", "last"])
    to_nominal_class_filter.inputformat(train_data)

    ### Naive Bayes ### choose the classifier you want
    classifier = Classifier("weka.classifiers.bayes.NaiveBayesMultinomial")
    # classifier = Classifier("weka.classifiers.bayes.NaiveBayes")
    # classifier.build_classifier(train_data)
    filtered_train = to_nominal_class_filter.filter(train_data)
    evaluation = Evaluation(filtered_train)
    evaluation.crossvalidate_model(classifier, filtered_train, 10, Random(42))
    # print(evaluation.summary())
    # print(evaluation.class_details())
    # print(evaluation.matrix())

    # mlp = Classifier("weka.classifiers.bayes.NaiveBayes")
    # mlp.build_classifier(train_file_5EMO)

    print(time.time() - start)
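
To run it (a sketch; the dataset path is a placeholder, and model.load_dataset_weka plus the Filter/Classifier/Evaluation/Random imports must come from the surrounding project):

import weka.core.jvm as jvm

jvm.start(packages=True)
run("path/to/dataset.arff")
jvm.stop()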
Example #3
    def createTwoDatasets(self,
                          wholeDataPath,
                          trainingDataPercentage,
                          trainingPath,
                          testingPath,
                          shuffleSeed=43):
        wholeData = self.load_Arff(wholeDataPath)
        randomize = Filter(
            classname="weka.filters.unsupervised.instance.Randomize",
            options=["-S", str(shuffleSeed)])
        randomize.set_inputformat(wholeData)
        wholeData = randomize.filter(wholeData)

        removePercentage = Filter(
            classname="weka.filters.unsupervised.instance.RemovePercentage",
            options=["-P", str(trainingDataPercentage), "-V"])
        removePercentage.set_inputformat(wholeData)
        trainingData = removePercentage.filter(wholeData)
        print "instances:" + str(trainingData.num_instances())

        removePercentage = Filter(
            classname="weka.filters.unsupervised.instance.RemovePercentage",
            options=["-P", str(trainingDataPercentage)])
        removePercentage.set_inputformat(wholeData)
        testingData = removePercentage.filter(wholeData)

        print "instances:" + str(testingData.num_instances())

        self.save_Arff(trainingData, trainingPath)
        self.save_Arff(testingData, testingPath)
Example #4
	def select_missclassified(self):
		remove = Filter(classname="weka.filters.supervised.attribute.AddClassification", options=['-classification' ,'-error' ,'-W' ,self.base_classifier.to_commandline()])
		remove.inputformat(self.data)
		self.data = remove.filter(self.data)

		remove = Filter(classname="weka.filters.unsupervised.instance.RemoveWithValues", options=['-S','0.0','-C','last','-L','last','-V'])
		remove.inputformat(self.data)

		remove = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=['-R',str(self.data.num_attributes-2)+',last'])
		remove.inputformat(self.data)
		self.data = remove.filter(self.data)
    def _get_training_dataset(self, X, y):
        # convert to numpy array
        if isinstance(X, pd.DataFrame):
            X = X.values
        elif isinstance(X, list):
            X = np.array(X)
        elif not isinstance(X, np.ndarray):
            raise Exception("Incompatible data type: {}".format(type(X)))
        if isinstance(y, pd.Series):
            y = y.values
        elif isinstance(y, list):
            y = np.array(y)
        elif not isinstance(y, np.ndarray):
            raise Exception("Incompatible data type: {}".format(type(y)))

        if y.dtype == "O":
            for i in range(0, len(y)):
                try:
                    y[i] = y[i].encode()
                except AttributeError:
                    pass
        dataset = create_instances_from_matrices(
            X, y, name="generated from matrices")  # generate dataset

        # convert label to nominal
        try:
            y.astype(float)
            self._label_type = np.float64
            nominal = Filter(
                classname=
                "weka.filters.unsupervised.attribute.NumericToNominal",
                options=["-R", "last"])
        except ValueError:
            self._label_type = str
            nominal = Filter(
                classname="weka.filters.unsupervised.attribute.StringToNominal",
                options=["-R", "last"])
        nominal.inputformat(dataset)
        dataset = nominal.filter(dataset)

        # sort labels
        sorter = Filter(
            classname="weka.filters.unsupervised.attribute.SortLabels")
        sorter.inputformat(dataset)
        dataset = sorter.filter(dataset)

        dataset.class_is_last()  # indicate class label

        return dataset
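
A quick usage sketch (assuming a running JVM and that clf is an instance of the surrounding wrapper class; create_instances_from_matrices comes from weka.core.dataset):

import numpy as np

X = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
y = np.array([0.0, 1.0, 0.0])
dataset = clf._get_training_dataset(X, y)  # Instances with a sorted nominal class as the last attribute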
Example #6
 def _pre_process_to_classification(self, dataset):   
     filter_data = Filter(classname = 'weka.filters.unsupervised.attribute.MathExpression', 
                          options = ['-unset-class-temporarily', '-E', "ifelse ( A>0, 1, 0 )", 
                                     '-V', '-R', 'last'])
     
     filter_data.set_inputformat(dataset)
     filtered = filter_data.filter(dataset)
     
     discretize_data = Filter(classname = 'weka.filters.unsupervised.attribute.NumericToNominal', 
                          options = ['-R', 'last'])
     
     discretize_data.set_inputformat(filtered)
     discretized = discretize_data.filter(filtered)
     
     return discretized
Example #7
def discretize_data(input_data):
    discretize = Filter(
        classname="weka.filters.unsupervised.attribute.Discretize",
        options=["-B", "10"])
    discretize.inputformat(input_data)
    filtered_data = discretize.filter(input_data)
    return filtered_data
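
Usage sketch (the input file is a placeholder; "-B 10" above yields up to 10 equal-width bins per numeric attribute):

import weka.core.jvm as jvm
from weka.core.converters import load_any_file

jvm.start()
data = load_any_file("iris.arff")
binned = discretize_data(data)
print(binned.num_attributes)
jvm.stop()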
Example #8
def supFilters(data, fType, ops):

	filt = Filter(classname="weka.filters.supervised." + fType, options = ops)
	filt.inputformat(data)     # let the filter know about the type of data to filter
	filtered = filt.filter(data)   # filter the data
	
	return filtered
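
For instance, the helper above could run Weka's supervised discretizer (a sketch; data is any loaded Instances object whose class attribute is already set, as supervised filters require):

disc = supFilters(data, "attribute.Discretize", ["-R", "first-last"])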
def affective_vectorizer(tweets, filename):
    '''
    Vectorizes the tweets and saves the vectors as csv.
    :param tweets: list of tweets
    :param filename: name of the saved file
    '''
    jvm.start(packages=True)
    install_package('AffectiveTweets')

    data = dataset.create_instances_from_lists([[t] for t in tweets])

    tweet_filter = Filter(
        classname='weka.filters.unsupervised.attribute.TweetToLexiconFeatureVector',
        options=[
            '-F', '-D', '-R', '-A', '-T', '-L', '-N', '-P', '-J', '-H', '-Q',
            '-stemmer', 'weka.core.stemmers.NullStemmer', '-stopwords-handler',
            'weka.core.tokenizers.TweetNLPTokenizer', '-I', '1', '-U',
            '-tokenizer', 'weka.core.tokenizers.TweetNLPTokenizer'
        ])
    tweet_filter.inputformat(data)
    filtered_data = tweet_filter.filter(data)

    converters.save_any_file(filtered_data, 'data/affect-vectors/' + filename)

    jvm.stop()
Example #10
def runSMO(file, bound):
    loader = Loader(classname="weka.core.converters.CSVLoader")
    data = loader.load_file(file)
    data.class_is_first()

    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                    options=["-R", bound])

    cls = KernelClassifier(
        classname="weka.classifiers.functions.SMO",
        options=["-C", "1.0", "-L", "0.001", "-P", "1.0E-12", "-N", "0"])
    kernel = Kernel(
        classname="weka.classifiers.functions.supportVector.PolyKernel",
        options=["-C", "250007", "-E", "1.0"])
    cls.kernel = kernel
    pout = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText")

    remove.inputformat(data)
    filtered = remove.filter(data)

    evl = Evaluation(filtered)
    evl.crossvalidate_model(cls, filtered, 10, Random(1), pout)

    #print(pout.buffer_content())

    print(evl.percent_correct)
    #print(evl.summary())

    result = evl.class_details()
    print(result)
    return result
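
Called like so (a sketch; the CSV path is a placeholder and bound is the 1-based attribute range handed to the Remove filter's "-R" option):

import weka.core.jvm as jvm

jvm.start()
result = runSMO("measurements.csv", "2-4")
jvm.stop()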
Example #11
def create_model(input_file, output_file):
    # Load data
    data = converters.load_any_file(input_file)
    data.class_is_last()  # set class attribute

    # filter data
    print_title("Filtering Data")
    discretize = Filter(
        classname="weka.filters.unsupervised.attribute.Discretize",
        options=["-B", "10", "-M", "-1.0", "-R", "first-last"])
    discretize.inputformat(
        data)  # let the filter know about the type of data to filter
    filtered_data = discretize.filter(data)
    print("Done! (believe it or not)")

    print_title("Build Classifier")
    classifier = Classifier(classname="weka.classifiers.trees.RandomForest",
                            options=["-I", "100", "-K", "0", "-S", "1"])
    classifier.build_classifier(filtered_data)
    print("Done! (believe it or not)")
    serialization.write_all(output_file, [classifier, discretize])
    print("Model and filter saved to ", output_file)

    evaluation = Evaluation(data)  # initialize with priors
    evaluation.crossvalidate_model(classifier, filtered_data, 10,
                                   Random(42))  # 10-fold CV
    print(evaluation.summary())
    print("pctCorrect: " + str(evaluation.percent_correct))
    print("incorrect: " + str(evaluation.incorrect))
Example #12
def Feature_Selection(infile):
    directory = os.getcwd() + '/'
    csvpath = directory + infile

    jvm.start(packages=True, max_heap_size="4g")
    print "\n\n"
    print "Loaded file: ", infile
    csvloader = Loader(classname="weka.core.converters.CSVLoader")
    csvdata = csvloader.load_file(csvpath)

    remover = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                     options=["-R", "1"])
    remover.inputformat(csvdata)
    filtered_data = remover.filter(csvdata)
    filtered_data.class_is_last()

    search = ASSearch(classname="weka.attributeSelection.BestFirst",
                      options=["-D", "1", "-N", "5"])
    evaluator = ASEvaluation(classname="weka.attributeSelection.CfsSubsetEval",
                             options=["-P", "1", "-E", "1"])
    attribs = AttributeSelection()
    attribs.search(search)
    attribs.evaluator(evaluator)
    attribs.select_attributes(filtered_data)
    print "Summary of Attribute Selection: "
    print attribs.results_string
    jvm.stop()
    return
def build_and_classify(classifier, classifier_name, approach_name, infile, percentage='10'):
    """
    Creates model and classifies against input data. Returns accuracy statistics
    """
    # set seed so results are consistent
    random.seed('iot')

    # load data
    loader = Loader(classname='weka.core.converters.CSVLoader')
    data = loader.load_file(infile)
    data.class_is_last()

    # convert all numeric attributes to nominal
    to_nominal = Filter(classname='weka.filters.unsupervised.attribute.NumericToNominal',
                        options=['-R', 'first-last'])
    to_nominal.inputformat(data)
    data = to_nominal.filter(data)

    # randomize data with constant seed
    randomize = Filter(classname='weka.filters.unsupervised.instance.Randomize',
                       options=['-S', '42'])
    randomize.inputformat(data)

    data = randomize.filter(data)

    # create training set and testing set
    train_percent_filter = Filter(classname='weka.filters.unsupervised.instance.RemovePercentage',
                                  options=['-P', percentage, '-V'])
    train_percent_filter.inputformat(data)

    train = train_percent_filter.filter(data)
    test = data

    # build and test classifier
    classifier.build_classifier(train)
    evaluation = Evaluation(train)
    evaluation.test_model(classifier, test)

    # return results as array
    results = [
        approach_name,
        classifier_name,
        percentage,
        evaluation.percent_correct,
        evaluation.weighted_f_measure
    ]
    return results
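
Example call (a sketch; the classifier comes from python-weka-wrapper and the CSV path is a placeholder):

import weka.core.jvm as jvm
from weka.classifiers import Classifier

jvm.start()
j48 = Classifier(classname='weka.classifiers.trees.J48')
stats = build_and_classify(j48, 'J48', 'baseline', 'devices.csv', percentage='20')
print(stats)
jvm.stop()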
def main():
    """
    Just runs some example code.
    """

    # load a dataset
    iris = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris)
    loader = Loader("weka.core.converters.ArffLoader")
    data = loader.load_file(iris)

    # remove class attribute
    helper.print_info("Removing class attribute")
    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                    options=["-R", "last"])
    remove.inputformat(data)
    filtered = remove.filter(data)

    # use MultiFilter
    helper.print_info("Use MultiFilter")
    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                    options=["-R", "first"])
    std = Filter(classname="weka.filters.unsupervised.attribute.Standardize")
    multi = MultiFilter()
    multi.filters = [remove, std]
    multi.inputformat(data)
    filtered_multi = multi.filter(data)

    # output datasets
    helper.print_title("Input")
    print(data)
    helper.print_title("Output")
    print(filtered)
    helper.print_title("Output (MultiFilter)")
    print(filtered_multi)

    # load text dataset
    text = helper.get_data_dir() + os.sep + "reutersTop10Randomized_1perc_shortened.arff"
    helper.print_info("Loading dataset: " + text)
    loader = Loader("weka.core.converters.ArffLoader")
    data = loader.load_file(text)
    data.class_is_last()

    # apply StringToWordVector
    stemmer = Stemmer(classname="weka.core.stemmers.IteratedLovinsStemmer")
    stopwords = Stopwords(classname="weka.core.stopwords.Rainbow")
    tokenizer = Tokenizer(classname="weka.core.tokenizers.WordTokenizer")
    s2wv = StringToWordVector(options=["-W", "10", "-L", "-C"])
    s2wv.stemmer = stemmer
    s2wv.stopwords = stopwords
    s2wv.tokenizer = tokenizer
    s2wv.inputformat(data)
    filtered = s2wv.filter(data)

    helper.print_title("Input (StringToWordVector)")
    print(data)
    helper.print_title("Output (StringToWordVector)")
    print(filtered)
Example #15
def discretize(data, index, file):
    discretizer = Filter(
        classname='weka.filters.supervised.attribute.Discretize',
        options=["-R", str(index), "-precision", "6"])
    discretizer.inputformat(data)
    newData = discretizer.filter(data)
    discretizer.serialize(file)
    return newData
Example #16
def remove(data, indices, file):
    cmdIndex = ','.join(indices)
    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                    options=["-R", cmdIndex])
    remove.inputformat(data)
    newData = remove.filter(data)
    remove.serialize(file)
    return newData
Example #17
def smote(data, percentage):
    sampler = Filter(
        classname='weka.filters.supervised.instance.SMOTE',
        options=["-C", "0", "-K", "5", "-P",
                 str(percentage), "-S", "1"])
    sampler.inputformat(data)
    newData = sampler.filter(data)
    return newData
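
Usage note (a sketch): SMOTE's "-P" is the percentage of synthetic minority-class instances to create, so the call below roughly doubles the minority class; the SMOTE package must be installed through Weka's package manager.

balanced = smote(data, 100)  # data: loaded Instances with a nominal class attribute set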
Example #18
def undersample(data, percentage):
    if percentage >= 100:
        return None
    sampler = Filter(classname='weka.filters.supervised.instance.Resample',
                     options=["-B", "1.0", "-S", "1", "-Z",
                              str(percentage)])
    sampler.inputformat(data)
    newData = sampler.filter(data)
    return newData
Example #19
def stringToNominal(data, indices, file):
    cmdIndex = ','.join(indices)
    stn = Filter(
        classname="weka.filters.unsupervised.attribute.StringToNominal",
        options=["-R", cmdIndex])
    stn.inputformat(data)
    newData = stn.filter(data)
    stn.serialize(file)
    return newData
Example #20
    def attributeSelector(self, data, selectNum):
        attributeSelector = Filter(classname="weka.filters.supervised.attribute.AttributeSelection",\
                         options=["-S", "weka.attributeSelection.Ranker -T -1.7976931348623157E308 -N " + str(selectNum),\
                                   "-E", "weka.attributeSelection.InfoGainAttributeEval"])

        attributeSelector.set_inputformat(data)
        data = attributeSelector.filter(data)

        return data
    def filterUnusedFeatureFromList(self, data, unusedFunctionList):
        filteredData = data

        for attribute in unusedFunctionList:
            remove = Filter(classname="weka.filters.unsupervised.attribute.RemoveByName", options=["-E", "^" + attribute + ".*$"])
            remove.set_inputformat(filteredData)
            filteredData = remove.filter(filteredData)

        return filteredData      
Example #24
    def load(path, db):
        nominals = [
            49,  # dev_global_mem_cache_type
            52,  # dev_host_unified_memory
            54,  # dev_local_mem_type
            56,  # dev_type
            57,  # dev_vendor
        ]
        nominal_indices = ",".join([str(index) for index in nominals])
        force_nominal = ["-N", nominal_indices]

        # Load data from CSV.
        dataset = Dataset.load_csv(path, options=force_nominal)
        dataset.__class__ = Dataset

        # Set class index and database connection.
        dataset.class_index = -1
        dataset.db = db

        # Create string->nominal type attribute filter, ignoring the first
        # attribute (scenario ID), since we're not classifying with it.
        string_to_nominal = WekaFilter(
            classname=("weka.filters.unsupervised."
                       "attribute.StringToNominal"),
            options=["-R", "2-last"],
        )
        string_to_nominal.inputformat(dataset.instances)

        # Create filtered dataset, and swap data around.
        filtered = string_to_nominal.filter(dataset.instances)

        # Create nominal->binary type attribute filter, ignoring the
        # first attribute (scenario ID), since we're not classifying with it.
        n2b = WekaFilter(
            classname="weka.filters.unsupervised.attribute.NominalToBinary",
            options=["-R", "2-last"],
        )
        n2b.inputformat(filtered)

        dataset.instances = n2b.filter(filtered)

        return dataset
 def getSetDataBySetIndex(self, data, index):
     # cut feature set out
     featureTable = FeatureTable()
     startIndexList = featureTable.getEachSetStartIndex()
     
     start = startIndexList[index]
     end = startIndexList[index+1] - 1
     remove = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-V", "-R", str(start) + "-" + str(end) + ",last"])
     remove.set_inputformat(data)
     filteredData = remove.filter(data)
     return filteredData
Example #26
    def exposed_evaluate(self, X, d, task, i_model, i_evl):
        data = np.reshape(eval(X), [d, -1], order='C')
        if task == 'regression':
            if i_model == 'LR':
                data = converters.ndarray_to_instances(data, relation='tmp')
                data.class_is_last()
                model = Classifier(
                    classname='weka.classifiers.functions.LinearRegression')
                evl = Evaluation(data)
                evl.crossvalidate_model(model, data, 5, Random(0))
            elif i_model == 'RF':
                data = converters.ndarray_to_instances(data, relation='tmp')
                data.class_is_last()
                model = Classifier(
                    classname='weka.classifiers.trees.RandomForest')
                evl = Evaluation(data)
                evl.crossvalidate_model(model, data, 5, Random(0))
            if i_evl == 'mae':
                r_mae = evl.mean_absolute_error
                return r_mae
            elif i_evl == 'mse':
                r_mse = evl.mean_square_error
                return r_mse
            elif i_evl == '1-rae':
                r_one_minus_rae = 1 - evl.relative_absolute_error / 100
                del evl, model, data
                return r_one_minus_rae

        elif task == 'classification':
            le = LabelEncoder()
            data[:, -1] = le.fit_transform(data[:, -1])
            if i_model == 'RF':
                dataRaw = converters.ndarray_to_instances(data, relation='tmp')
                weka_filter = Filter(
                    classname=
                    "weka.filters.unsupervised.attribute.NumericToNominal",
                    options=["-R", "last"])
                weka_filter.inputformat(dataRaw)
                data = weka_filter.filter(dataRaw)
                data.class_is_last()
                model = Classifier(
                    classname='weka.classifiers.trees.RandomForest')
                evl = Evaluation(data)
                evl.crossvalidate_model(model, data, 5, Random(0))
            elif i_model == 'LR':
                model = LogisticRegression(multi_class='ovr')
            elif i_model == 'SVM':
                model = svm.SVC()
            if i_evl == 'f_score':
                fscore = evl.weighted_f_measure
                del evl, model, data, dataRaw
                if not (fscore >= 0.01 and fscore < 1.01):
                    fscore = 0.01
                return fscore
Example #28
def make_partition(data, attributes, part='normal'):

    if part == 'normal':
        value = 'last'
    elif part == 'anomalous':
        value = 'first'
    else:
        raise ValueError("part must be 'normal' or 'anomalous'")

    keep_normal = Filter(
        classname='weka.filters.unsupervised.instance.RemoveWithValues',
        options=['-C', 'last', '-L', value])
    keep_normal.inputformat(data)
    data_normal = keep_normal.filter(data)

    remove = Filter(classname='weka.filters.unsupervised.attribute.Remove',
                    options=['-R', 'last'])
    remove.inputformat(data)
    data_normal = remove.filter(data_normal)

    N = data_normal.num_instances

    return data_normal, N
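
Usage sketch (assuming the last attribute is a binary normal/anomalous class label, as the RemoveWithValues options imply; note that the attributes parameter is never used by the body):

normal_data, n_normal = make_partition(data, None, part='normal')
anomalous_data, n_anomalous = make_partition(data, None, part='anomalous')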
Example #29
def merge_classes(data, idx_to_merge):
    """
    :param data: The data file to filter
    :param idx_to_merge: String representation of class indices to merge 
    :return: filtered data
    """
    merge_filter = Filter(
        classname="weka.filters.unsupervised.attribute.MergeManyValues",
        options=["-C", "last", "-R", idx_to_merge, "-unset-class-temporarily"])
    merge_filter.inputformat(data)
    filtered_data = merge_filter.filter(data)
    return filtered_data
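
For example, merging the first two labels of the class attribute into a single value (a sketch; MergeManyValues' "-R" option takes a 1-based range of label indices):

merged = merge_classes(data, "1,2")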
Example #30
    def filterOutUnnecessaryAPIAndEvaluateOurApproach(self, ourApproahFile,
                                                      apiFile, indexInTable,
                                                      methodName,
                                                      databaseTable,
                                                      csvFilePath):
        outputStr = methodName + ","
        resultList = []
        # Get whole feature set of our approach
        filteredData = self.load_Arff(ourApproahFile)
        # Use this function to get selected API feature and save the unselected api in a list
        filterOutList = self.attribueSelectionBasedOnRankingInDatabase(
            apiFile, indexInTable, databaseTable, "")[1]

        # Remove unselected API
        for functionName in filterOutList:
            functionName = functionName.split("(")[0] + "\(\)"
            functionName = functionName.replace('$', '\$')
            remove = Filter(
                classname="weka.filters.unsupervised.attribute.RemoveByName",
                options=["-E", "^" + functionName + ".*$"])
            remove.set_inputformat(filteredData)
            filteredData = remove.filter(filteredData)
        featureNum = filteredData.num_attributes() - 1
        print "featureNum: " + str(featureNum)
        if csvFilePath != "":
            self.writeTenScaledTitleManual(featureNum, csvFilePath)
            #print "i:" + str(i)
            #print "functionName:" + functionName
            #print "featureNum: " + str(filteredData.num_attributes() - 1)
        for attributeStr in filteredData.attributes():
            print(attributeStr)
        # Run ten scaled generation and evaluation
        step = 10
        while step < featureNum:
            roundData = self.attributeSelector(filteredData, step)
            classifier = self.algorithmPicker(roundData, indexInTable)
            evaluation = self.evaluation(classifier, roundData)
            #print(self.algorithmTable[indexInTable] + ": " + "{:.2f}".format(evaluation.percent_correct()) + ", Feature select number:" + str(roundData.num_attributes() - 1) + "/" + str(featureNum))
            resultList.append("{:.2f}".format(evaluation.percent_correct()))
            #csvFile.write("{:.2f}".format(evaluation.percent_correct()) +",")
            step += 10

        classifier = self.algorithmPicker(filteredData, indexInTable)
        evaluation = self.evaluation(classifier, filteredData)
        #print(self.algorithmTable[indexInTable] + ": " + "{:.2f}".format(evaluation.percent_correct()) + ", Feature select number:" + str(filteredData.num_attributes() - 1) + "/" + str(featureNum))
        resultList.append("{:.2f}".format(evaluation.percent_correct()))

        # Write out to CSV file
        for item in resultList:
            outputStr += item + ","
        outputStr = outputStr[0:-1] + "\n"
        self.writeToPath(csvFilePath, outputStr)
Example #31
    def get_weka_breast_cancer(self):
        split_ratio = 0.2

        loader = Loader(classname="weka.core.converters.CSVLoader")
        loader.options = ['-F', ',']
        dataset = loader.load_file(
            os.path.join(DATASET_DIR, 'uci-20070111-breast-cancer.csv'))
        dataset.class_is_last()
        remove = Filter(
            classname="weka.filters.unsupervised.instance.RemovePercentage",
            options=["-P", str(split_ratio * 100)])
        remove.inputformat(dataset)
        train_set = remove.filter(dataset)
        remove = Filter(
            classname="weka.filters.unsupervised.instance.RemovePercentage",
            options=["-P", str(split_ratio * 100), "-V"])
        remove.inputformat(dataset)
        test_set = remove.filter(dataset)

        labels = dataset.class_attribute.values

        return train_set, test_set, labels
Example #32
	def remove_correct_classified(self, invert = False):
		options=[
			'-W', self.classifier.to_commandline(), 
			'-C', str(self.class_index), #classindex
	#		'-F','0', # folds
	#		'-T','0.1', #threshold by numeric classes
			'-I','0', # max iterations
			'-V' if not invert else '' 
		] # invert
		classname = "weka.filters.unsupervised.instance.RemoveMisclassified"
		remove = Filter(classname=classname, options=options)
		remove.inputformat(self.data)
		self.data = remove.filter(self.data)
def unsupervised_discretize(data):
    """
    Function for discretization of data. Function uses weka implementation
    weka.filters.unsupervised.attribute.Discretize.

    :param data: weka arff data
    :return: weka arff data
    """
    args, _sufix = unsupervised_discretize_parser()

    filt = Filter(classname='weka.filters.unsupervised.attribute.Discretize',
                  options=args_to_weka_options(args, _sufix))
    filt.inputformat(data)
    return filt.filter(data)
Example #37
 def filter_data(self, data):
     print("Filtering Data..\n")
     flter = Filter(
         classname="weka.filters.supervised.attribute.AttributeSelection")
     aseval = ASEvaluation(
         classname="weka.attributeSelection.CfsSubsetEval",
         options=["-P", "1", "-E", "1"])
     assearch = ASSearch(classname="weka.attributeSelection.BestFirst",
                         options=["-D", "1", "-N", "5"])
     flter.set_property("evaluator", aseval.jobject)
     flter.set_property("search", assearch.jobject)
     flter.inputformat(data)
     filtered = flter.filter(data)
     return filtered
def main():
    jvm.start()
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file("train_sorted.arff")
    numofStores = 1115

    for storeNum in range(0, numofStores):

        tempData = data
        removeUpper = Filter(
            classname="weka.filters.unsupervised.instance.RemoveWithValues",
            options=[
                "-S",
                str(storeNum + 2) + ".0", "-C", "first", "-L", "first-last",
                "-V"
            ])
        removeUpper.inputformat(data)
        tempData = removeUpper.filter(data)

        removeLower = Filter(
            classname="weka.filters.unsupervised.instance.RemoveWithValues",
            options=[
                "-S",
                str(storeNum + 1) + ".0", "-C", "first", "-L", "first-last"
            ])
        removeLower.inputformat(tempData)
        tempData = removeLower.filter(tempData)

        #removing the storeID attribute
        tempData.delete_first_attribute()

        saver = Saver(classname="weka.core.converters.ArffSaver")
        saver.save_file(tempData, "stores/store" + str(storeNum + 1) + ".arff")
        print('Saved Store' + str(storeNum + 1))

    jvm.stop()
def use_filter(data):
    """
    Uses the AttributeSelection filter for attribute selection.
    :param data: the dataset to use
    :type data: Instances
    """
    print("\n2. Filter")
    flter = Filter(classname="weka.filters.supervised.attribute.AttributeSelection")
    aseval = ASEvaluation(classname="weka.attributeSelection.CfsSubsetEval")
    assearch = ASSearch(classname="weka.attributeSelection.GreedyStepwise", options=["-B"])
    flter.set_property("evaluator", aseval.jobject)
    flter.set_property("search", assearch.jobject)
    flter.inputformat(data)
    filtered = flter.filter(data)
    print(str(filtered))
Example #41
def obtainSVM(file):
    data = converters.load_any_file(folderPathOfArffFiles + file + ".arff")
    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                    options=["-R", "1-2"])
    remove.inputformat(data)
    data = remove.filter(data)
    data.class_is_last()

    classifier = Classifier(classname="weka.classifiers.functions.LibSVM")
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(classifier, data, kFold, Random(42))

    info = evaluation.class_details()
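    # fragile: slices the ROC-area figure out of the formatted class-details string at fixed offsets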
    roc_area = float(info[406:411])

    return roc_area
Example #43
    def load(path, db):
        nominals = [
            49,  # dev_double_fp_config
            50,  # dev_endian_little
            51,  # dev_execution_capabilities
            52,  # dev_extensions
            54,  # dev_global_mem_cache_type
            57,  # dev_host_unified_memory
            63,  # dev_image_support
            65,  # dev_local_mem_type
            96,  # dev_queue_properties
            97,  # dev_single_fp_config
            98,  # dev_type
            100, # dev_vendor_id
        ]
        nominal_indices = ",".join([str(index) for index in nominals])
        force_nominal = ["-N", nominal_indices]

        # Load data from CSV.
        dataset = Dataset.load_csv(path, options=force_nominal)
        dataset.__class__ = Dataset

        # Set class index and database connection.
        dataset.class_index = -1
        dataset.db = db

        # Create string->nominal type attribute filter, ignoring the first
        # attribute (scenario ID), since we're not classifying with it.
        string_to_nominal = WekaFilter(classname=("weka.filters.unsupervised."
                                                  "attribute.StringToNominal"),
                                       options=["-R", "2-last"])
        string_to_nominal.inputformat(dataset.instances)

        # Create filtered dataset, and swap data around.
        filtered = string_to_nominal.filter(dataset.instances)
        dataset.instances = filtered

        return dataset
Example #44
from weka.clusterers import Clusterer, ClusterEvaluation
from weka.filters import Filter
import weka.plot.clusterers as plc

jvm.start()

# load iris
fname = data_dir + os.sep + "iris.arff"
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file(fname)

# remove class attribute
flt = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "last"])
flt.set_inputformat(data)
filtered = flt.filter(data)

# build KMeans
print("\n--> SimpleKMeans\n")
cl = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "3"])
cl.build_clusterer(filtered)
evl = ClusterEvaluation()
evl.set_model(cl)
evl.test_model(filtered)
print(evl.get_cluster_results())
plc.plot_cluster_assignments(evl, data, atts=[], inst_no=True, wait=True)

# use AddCluster filter
print("\n--> AddCluster filter\n")
flt = Filter(classname="weka.filters.unsupervised.attribute.AddCluster",
             options=["-W", "weka.clusterers.SimpleKMeans -N 3"])
Example #45
 def _normalize_dataset(self, dataset):
     normalize_data = Filter(classname = 'weka.filters.unsupervised.attribute.Normalize', 
                          options = [])
     normalize_data.set_inputformat(dataset)
     normalized = normalize_data.filter(dataset)
     return normalized
Example #46
    def runner(self, cdat, heap_size = 16384, seed = None, verbose = True):
        self.set_status(Pipeline.RUNNING)

        self.logs.append('Initializing Pipeline')

        para = self.config

        self.logs.append('Reading Pipeline Configuration')

        head = ''
        name = get_rand_uuid_str()

        self.logs.append('Reading Input File')

        for i, stage in enumerate(self.stages):
            if stage.code in ('dat.fle', 'prp.bgc', 'prp.nrm', 'prp.pmc', 'prp.sum'):
                self.stages[i].status = Pipeline.RUNNING
            if stage.code ==  'dat.fle':
                head    = os.path.abspath(stage.value.path)
                name, _ = os.path.splitext(stage.value.name)

        self.logs.append('Parsing to ARFF')

        path = os.path.join(head, '{name}.arff'.format(name = name))
        # This line triggers a bug (cause unknown); using Config.schema instead.
        # cdat.toARFF(path, express_config = para.Preprocess.schema, verbose = verbose)

        for i, stage in enumerate(self.stages):
            if stage.code in ('dat.fle', 'prp.bgc', 'prp.nrm', 'prp.pmc', 'prp.sum'):
                self.stages[i].status = Pipeline.COMPLETE

        self.logs.append('Saved ARFF at {path}'.format(path = path))
        self.logs.append('Splitting to Training and Testing Sets')

        JVM.start(max_heap_size = '{size}m'.format(size = heap_size))

        load = Loader(classname = 'weka.core.converters.ArffLoader')
        # data = load.load_file(path)
        # save =  Saver(classname = 'weka.core.converters.ArffSaver')
        data = load.load_file(os.path.join(head, 'iris.arff')) # For Debugging Purposes Only
        data.class_is_last() # For Debugging Purposes Only
        # data.class_index = cdat.iclss

        for i, stage in enumerate(self.stages):
            if stage.code == 'prp.kcv':
                self.stages[i].status = Pipeline.RUNNING

        self.logs.append('Splitting Training Set')

        # TODO - Check if this seed is worth it.
        seed = assign_if_none(seed, random.randint(0, 1000))
        opts = ['-S', str(seed), '-N', str(para.Preprocess.FOLDS)]
        wobj = Filter(classname = 'weka.filters.supervised.instance.StratifiedRemoveFolds', options = opts + ['-V'])
        wobj.inputformat(data)

        tran = wobj.filter(data)

        self.logs.append('Splitting Testing Set')

        wobj.options = opts
        test = wobj.filter(data)

        for i, stage in enumerate(self.stages):
            if stage.code == 'prp.kcv':
                self.stages[i].status = Pipeline.COMPLETE

        self.logs.append('Performing Feature Selection')

        feat = [ ]
        for comb in para.FEATURE_SELECTION:
            if comb.USE:
                for i, stage in enumerate(self.stages):
                    if stage.code == 'ats':
                        search    = stage.value.search.name
                        evaluator = stage.value.evaluator.name

                        if search == comb.Search.NAME and evaluator == comb.Evaluator.NAME:
                            self.stages[i].status = Pipeline.RUNNING

                srch = ASSearch(
                    classname = 'weka.attributeSelection.{classname}'.format(classname = comb.Search.NAME),
                    options   = assign_if_none(comb.Search.OPTIONS, [ ])
                )
                ewal = ASEvaluation(
                    classname = 'weka.attributeSelection.{classname}'.format(classname = comb.Evaluator.NAME),
                    options   = assign_if_none(comb.Evaluator.OPTIONS, [ ])
                )

                attr = AttributeSelection()
                attr.search(srch)
                attr.evaluator(ewal)
                attr.select_attributes(tran)

                meta = addict.Dict()
                meta.search    = comb.Search.NAME
                meta.evaluator = comb.Evaluator.NAME
                meta.features  = [tran.attribute(index).name for index in attr.selected_attributes]

                feat.append(meta)

                for i, stage in enumerate(self.stages):
                    if stage.code == 'ats':
                        search    = stage.value.search.name
                        evaluator = stage.value.evaluator.name

                        if search == comb.Search.NAME and evaluator == comb.Evaluator.NAME:
                            self.stages[i].status = Pipeline.COMPLETE

        models = [ ]
        for model in para.MODEL:
            if model.USE:
                summary         = addict.Dict()

                self.logs.append('Modelling {model}'.format(model = model.LABEL))

                summary.label   = model.LABEL
                summary.name    = model.NAME
                summary.options = assign_if_none(model.OPTIONS, [ ])

                for i, stage in enumerate(self.stages):
                    if stage.code == 'lrn' and stage.value.name == model.NAME:
                        self.stages[i].status = Pipeline.RUNNING

                for i, instance in enumerate(data):
                    iclass = list(range(instance.num_classes))
                
                options    = assign_if_none(model.OPTIONS, [ ])
                classifier = Classifier(classname = 'weka.classifiers.{classname}'.format(classname = model.NAME), options = options)
                classifier.build_classifier(tran)
        
                serializer.write(os.path.join(head, '{name}.{classname}.model'.format(
                        name = name,
                    classname = model.NAME
                )), classifier)

                self.logs.append('Testing model {model}'.format(model = model.LABEL))

                evaluation       = Evaluation(tran)
                evaluation.test_model(classifier, test)

                summary.summary  = evaluation.summary()

                frame  = pd.DataFrame(data = evaluation.confusion_matrix)
                axes   = sns.heatmap(frame, cbar = False, annot = True)
                b64str = get_b64_plot(axes)
                
                summary.confusion_matrix = addict.Dict({
                    'value': evaluation.confusion_matrix.tolist(),
                     'plot': b64str
                })

                self.logs.append('Plotting Learning Curve for {model}'.format(model = model.LABEL))

                buffer = io.BytesIO()
                plot_classifier_errors(evaluation.predictions, tran, test, outfile = buffer, wait = False)
                b64str = buffer_to_b64(buffer)

                summary.learning_curve   = b64str

                buffer = io.BytesIO()
                plot_roc(evaluation, class_index = iclass, outfile = buffer, wait = False)
                b64str = buffer_to_b64(buffer)

                summary.roc_curve        = b64str

                buffer = io.BytesIO()
                plot_prc(evaluation, class_index = iclass, outfile = buffer, wait = False)
                b64str = buffer_to_b64(buffer)

                summary.prc_curve        = b64str

                if classifier.graph:
                    summary.graph = classifier.graph

                for i, instance in enumerate(test):
                    prediction = classifier.classify_instance(instance)

                for i, stage in enumerate(self.stages):
                    if stage.code == 'lrn' and stage.value.name == model.NAME:
                        self.stages[i].status = Pipeline.COMPLETE

                models.append(summary)

        self.gist.models = models

        JVM.stop()

        JSON.write(os.path.join(head, '{name}.cgist'.format(name = name)), self.gist)

        self.logs.append('Pipeline Complete')

        self.set_status(Pipeline.COMPLETE)
def main():
    """
    Just runs some example code.
    """

    # load a dataset
    data_file = helper.get_data_dir() + os.sep + "vote.arff"
    helper.print_info("Loading dataset: " + data_file)
    loader = Loader("weka.core.converters.ArffLoader")
    data = loader.load_file(data_file)
    data.class_is_last()

    # classifier
    classifier = Classifier(classname="weka.classifiers.trees.J48")

    # randomize data
    folds = 10
    seed = 1
    rnd = Random(seed)
    rand_data = Instances.copy_instances(data)
    rand_data.randomize(rnd)
    if rand_data.class_attribute.is_nominal:
        rand_data.stratify(folds)

    # perform cross-validation and add predictions
    predicted_data = None
    evaluation = Evaluation(rand_data)
    for i in range(folds):
        train = rand_data.train_cv(folds, i)
        # the above code is used by the StratifiedRemoveFolds filter,
        # the following code is used by the Explorer/Experimenter
        # train = rand_data.train_cv(folds, i, rnd)
        test = rand_data.test_cv(folds, i)

        # build and evaluate classifier
        cls = Classifier.make_copy(classifier)
        cls.build_classifier(train)
        evaluation.test_model(cls, test)

        # add predictions
        addcls = Filter(
            classname="weka.filters.supervised.attribute.AddClassification",
            options=["-classification", "-distribution", "-error"])
        # setting the java object directory avoids issues with correct quoting in option array
        addcls.set_property("classifier", Classifier.make_copy(classifier))
        addcls.inputformat(train)
        addcls.filter(train)  # trains the classifier
        pred = addcls.filter(test)
        if predicted_data is None:
            predicted_data = Instances.template_instances(pred, 0)
        for n in range(pred.num_instances):
            predicted_data.add_instance(pred.get_instance(n))

    print("")
    print("=== Setup ===")
    print("Classifier: " + classifier.to_commandline())
    print("Dataset: " + data.relationname)
    print("Folds: " + str(folds))
    print("Seed: " + str(seed))
    print("")
    print(evaluation.summary("=== " + str(folds) + " -fold Cross-Validation ==="))
    print("")
    print(predicted_data)
Example #48
from weka.core.converters import Loader, Saver
from weka.core.dataset import Instances
from weka.filters import Filter

jvm.start()

# load weather.nominal
fname = data_dir + os.sep + "weather.nominal.arff"
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file(fname)

# output header
print(Instances.template_instances(data))

# remove attribute no 3
print("\nRemove attribute no 3")
fltr = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "3"])
fltr.set_inputformat(data)
filtered = fltr.filter(data)

# output header
print(Instances.template_instances(filtered))

# save modified dataset
saver = Saver(classname="weka.core.converters.ArffSaver")
saver.save_file(filtered, data_dir + os.sep + "weather.nominal-filtered.arff")

jvm.stop()

Example #49
	def merge_nominal_attributes(self, significance=0.01):
		remove = Filter(classname="weka.filters.supervised.attribute.MergeNominalValues", options=['-L',str(significance),'-R','first-last'])
		remove.inputformat(self.data)
		self.data = remove.filter(self.data)
    def getTenScaledResultsRankedByInfo(self, trainingData, indexInTable, csvFilePath, testingData = None):
        dbmgr = permissionMappingManager(databasePath)
        featureNum = trainingData.num_attributes() - 1
        
        attributeIn = trainingData.attributes()
        attributeList = []
        for item in attributeIn:
            functionName = str(item).split(" ")[1]
            functionName = functionName.split("(")[0] + "\(\)"
            functionName = functionName.replace('$','\$')
            #print functionName
            attributeList.append(functionName)
        
        
        outputStr = ""
        outputStr += "InfomationGain" + ","
        resultList = []
        bestAccuracy = 0
        bestTrainData = 0
        bestTestData = 0
        
        #for index in range(0, len(attributeList)-1):
        #    attributeList[index] = attributeList[index].split(" ")[1]
        #    print attributeList[index]
        

        csvFile = open(csvFilePath, "a")
        csvFile.write(self.algorithmTable[indexInTable]+",") 
        
        step = 10 
        while step < featureNum:
            # pick top features
            filteredTrainData = self.attributeSelector(trainingData, step)
            
            
            # check top feature informations
            APIList = []  
            for item in filteredTrainData.attributes():
                #print str(item)
                functionName = str(item).split(" ")[1]
                #functionName = functionName.split("_")[0][1:] 
                APIList.append(functionName)
                
            numberOfInstance = self.getNumOfInstance(trainingData)
            
            
                
            # Get those features that it doesn't pick
            filteredList = []
            attributeIn = filteredTrainData.attributes()
            for item in attributeIn:
                functionName = str(item).split(" ")[1]
                functionName = functionName.split("(")[0] + "\(\)"
                functionName = functionName.replace('$','\$')
                filteredList.append(functionName)

            items = self.getItemsNotInTheList(attributeList, filteredList)
            #print len(items)
            #for item in items:
            #    print item
            # Re-filter the training data and keep the testing data synchronized

            filteredTrainData = trainingData
            filterTestingData = testingData
            for attribute in items:                
                remove = Filter(classname="weka.filters.unsupervised.attribute.RemoveByName", options=["-E", "^" + attribute + ".*$"])

                remove.set_inputformat(filteredTrainData)
                filteredTrainData = remove.filter(filteredTrainData)
                if filterTestingData:
                    remove.set_inputformat(filterTestingData)
                    filterTestingData = remove.filter(filterTestingData)
                #print attribute
                #print str(filteredTrainData.num_attributes() - 1)

            # Build classifier and evaluate it   
            classifier = self.algorithmPicker(filteredTrainData, indexInTable)    
            evaluation = self.evaluation(classifier, filteredTrainData, filterTestingData)
            #print(self.algorithmTable[indexInTable] + ": " + "{:.2f}".format(evaluation.percent_correct()) + ", Feature select number:" + str(filteredTrainData.num_attributes() - 1) + "/" + str(featureNum))
            resultList.append("{:.2f}".format(evaluation.percent_correct()))
            
            #Save best data and accuracy
            if evaluation.percent_correct() > bestAccuracy:
                bestAccuracy = evaluation.percent_correct()
                bestTrainData = filteredTrainData
                if testingData:
                    bestTestData = filterTestingData
                #bestEvaluation = evaluation
            step += 10
            
        
        
        classifier = self.algorithmPicker(trainingData, indexInTable)
        evaluation = self.evaluation(classifier, trainingData, testingData)
        #print(self.algorithmTable[indexInTable] + ": " + "{:.2f}".format(evaluation.percent_correct()) + ", Feature select number:" + str(trainingData.num_attributes() - 1) + "/" + str(featureNum))
        resultList.append("{:.2f}".format(evaluation.percent_correct()))
        
        # Save best data and accuracy (here the full feature set won, so keep the
        # unfiltered data rather than the loop's last filtered subset)
        if evaluation.percent_correct() > bestAccuracy:
            bestAccuracy = evaluation.percent_correct()
            bestTrainData = trainingData
            if testingData:
                bestTestData = testingData
            #bestEvaluation = evaluation
        
        for item in resultList:
            outputStr += item +","
        outputStr = outputStr[0:-1] + "\n"
        self.writeToPath(csvFilePath, outputStr)
        return [bestAccuracy, bestTrainData, bestTestData, resultList]
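    # Note: the attributeSelector helper used above is not shown in this snippet. A
    # plausible sketch with python-weka-wrapper's AttributeSelection, ranking by
    # information gain and keeping the top `num` attributes (method name and options
    # here are assumptions):
    #
    #   from weka.attribute_selection import ASSearch, ASEvaluation, AttributeSelection
    #
    #   def attributeSelector(self, data, num):
    #       search = ASSearch(classname="weka.attributeSelection.Ranker",
    #                         options=["-N", str(num)])
    #       evaluator = ASEvaluation(classname="weka.attributeSelection.InfoGainAttributeEval")
    #       attsel = AttributeSelection()
    #       attsel.search(search)
    #       attsel.evaluator(evaluator)
    #       attsel.select_attributes(data)
    #       return attsel.reduce_dimensionality(data)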
def testing():
    logging.disable("weka")

    print "PROSES KLASIFIKASI\n------------------"

    jvm.start()

    pruning = 0
    while pruning < 2:

        persen_train = 0
        while persen_train < 4:

            fitur_hapus = 15
            while fitur_hapus >= 0:

                list_akurasi = []
                list_recall = []
                list_presisi = []
                list_fmeasure = []
                list_roc = []
                count = 0

                nama = "hasilTest/"
                if(pruning == 0):
                    nama += "unpruning"
                    if(persen_train == 0):
                        nama += "40"
                    elif(persen_train == 1):
                        nama += "50"
                    elif(persen_train == 2):
                        nama += "60"
                    else:
                        nama += "70"
                else:
                    nama += "pruning"
                    if(persen_train == 0):
                        nama += "40"
                    elif(persen_train == 1):
                        nama += "50"
                    elif(persen_train == 2):
                        nama += "60"
                    else:
                        nama += "70"

                if(fitur_hapus > 0):
                    nama += "removeF" + str(fitur_hapus) + ".txt"
                else:
                    nama += "normal.txt"

                f = open(nama, "w")

                if(pruning == 0):
                    nama = "unpruning"
                    print "Without pruning"
                    f.write("C4.5 decision tree results without pruning (unpruned)\n")
                else:
                    nama = "pruning"
                    print "With pruning"
                    f.write("C4.5 decision tree results with pruning\n")
                nama += persen_map[persen_train]
                f.write("With a training set of " + persen_map[persen_train] + "%\n")

                if(fitur_hapus > 0):
                    f.write("With feature " + str(fitur_hapus) + " removed\n\n")
                else:
                    f.write("\n")

                f.write("No. Accuracy Recall Precision F-Measure ROC\n")

                print persen_map[persen_train] + "% training data"

                print "Feature removed:", fitur_hapus
                print "\nNo.\tAccuracy\tRecall\tPrecision\tF-Measure\tROC"
                while count < 100:
                    loader = Loader(classname = "weka.core.converters.ArffLoader")
                    data = loader.load_file("hasil.arff")
                    data.class_is_last()

                    if(fitur_hapus > 0):
                        remove = Filter(classname = "weka.filters.unsupervised.attribute.Remove", options = ["-R", str(fitur_hapus)])
                        remove.inputformat(data)
                        data_baru = remove.filter(data)
                        data_baru.class_is_last()
                    else:
                        data_baru = loader.load_file("hasil.arff")
                        data_baru.class_is_last()

                    randomize = Filter(classname = "weka.filters.unsupervised.instance.Randomize", options = ["-S", str(int(time.time()))])
                    randomize.inputformat(data_baru)
                    data_random = randomize.filter(data_baru)
                    data_random.class_is_last()

                    if(pruning == 0):
                        classifier = Classifier(classname = "weka.classifiers.trees.J48", options = ["-U"])
                    else:
                        classifier = Classifier(classname = "weka.classifiers.trees.J48", options = ["-C", "0.25"])

                    evaluation = Evaluation(data_random)
                    evaluation.evaluate_train_test_split(classifier, data_random, percentage = int(persen_map[persen_train]))

                    f.write("%d. %s %s %s %s %s\n" % (count + 1, evaluation.weighted_true_positive_rate, evaluation.weighted_recall, evaluation.weighted_precision, evaluation.weighted_f_measure, evaluation.weighted_area_under_roc))
                    print count + 1, evaluation.weighted_true_positive_rate, evaluation.weighted_recall, evaluation.weighted_precision, evaluation.weighted_f_measure, evaluation.weighted_area_under_roc

                    list_akurasi.append(evaluation.weighted_true_positive_rate)
                    list_recall.append(evaluation.weighted_recall)
                    list_presisi.append(evaluation.weighted_precision)
                    list_fmeasure.append(evaluation.weighted_f_measure)
                    list_roc.append(evaluation.weighted_area_under_roc)

                    count += 1
                    time.sleep(1)  # ensure the next time-based Randomize seed differs

                list_akurasi.sort()
                list_recall.sort()
                list_presisi.sort()
                list_fmeasure.sort()
                list_roc.sort()

                f.write( ""  + "\n")
                f.write( "Rata-Rata"  + "\n")
                f.write( "Akurasi:" + str(sum(list_akurasi) / 100.0)  + "\n")
                f.write( "Recall:" + str(sum(list_recall) / 100.0)  + "\n")
                f.write( "Presisi:" + str(sum(list_presisi) / 100.0)  + "\n")
                f.write( "F-Measure:" + str(sum(list_fmeasure) / 100.0)  + "\n")
                f.write( "ROC:" + str(sum(list_roc) / 100.0)  + "\n")
                f.write( ""  + "\n")
                f.write( "Max"  + "\n")
                f.write( "Akurasi:" + str(list_akurasi[-1] ) + "\n")
                f.write( "Recall:" + str(list_recall[-1] ) + "\n")
                f.write( "Presisi:" + str(list_presisi[-1] ) + "\n")
                f.write( "F-Measure:" + str(list_fmeasure[-1] ) + "\n")
                f.write( "ROC:" + str(list_roc[-1] ) + "\n")
                f.write( ""  + "\n")
                f.write( "Min"  + "\n")
                f.write( "Akurasi:" + str(list_akurasi[0] ) + "\n")
                f.write( "Recall:" + str(list_recall[0] ) + "\n")
                f.write( "Presisi:" + str(list_presisi[0] ) + "\n")
                f.write( "F-Measure:" + str(list_fmeasure[0] ) + "\n")
                f.write( "ROC:" + str(list_roc[0] ) + "\n")
                f.write( ""  + "\n")

                print ""
                print "Rata-Rata"
                print "Akurasi:", sum(list_akurasi) / 100.0
                print "Recall:", sum(list_recall) / 100.0
                print "Presisi:", sum(list_presisi) / 100.0
                print "F-Measure:", sum(list_fmeasure) / 100.0
                print "ROC:", sum(list_roc) / 100.0
                print ""
                print "Max"
                print "Akurasi:", list_akurasi[-1]
                print "Recall:", list_recall[-1]
                print "Presisi:", list_presisi[-1]
                print "F-Measure:", list_fmeasure[-1]
                print "ROC:", list_roc[-1]
                print ""
                print "Min"
                print "Akurasi:", list_akurasi[0]
                print "Recall:", list_recall[0]
                print "Presisi:", list_presisi[0]
                print "F-Measure:", list_fmeasure[0]
                print "ROC:", list_roc[0]
                print ""

                f.close()
                fitur_hapus -= 1

            persen_train += 1

        pruning += 1

    jvm.stop()
Exemple #52
0
def run_classifier(path, prot, sel, cols, prot_vals, beta):
        
    DIs = dict()
    jvm.start()

    for i in range(len(cols)-1):
        loader = Loader(classname="weka.core.converters.CSVLoader")
        data = loader.load_file(path)
    
        # remove selected attribute from the data
        # NOTE: options are ONE indexed, not ZERO indexed
        remove = Filter(classname="weka.filters.unsupervised.attribute.Remove", \
                        options=["-R", str(sel[2]+1)])
        remove.inputformat(data)
        data = remove.filter(data)

        # if running for only one attribute, remove all others (except protected)
        if i > 0:
            for j in range(1, prot[2]+1):
                if i != j:
                    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove", \
                                    options=["-R", ("1" if i>j else "2")])
                    remove.inputformat(data)
                    data = remove.filter(data)

        # set prot attribute as Class attribute
        data.class_is_last()
        
        # run classifier
        cls = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
        cls.build_classifier(data)
    
        # count each (protected value, predicted label) combination
        pos_and_pred = 0.0
        pos_and_not_pred = 0.0
        neg_and_pred = 0.0
        neg_and_not_pred = 0.0
        for ind, inst in enumerate(data):
            if cls.classify_instance(inst):
                if prot_vals[ind] == prot[1]:
                    pos_and_pred += 1
                else:
                    neg_and_pred += 1
            else:
                if prot_vals[ind] == prot[1]:
                    pos_and_not_pred += 1
                else:
                    neg_and_not_pred += 1

        # calculate disparate impact (DI) from the balanced error rate (BER)
        BER = ((pos_and_not_pred / (pos_and_pred + pos_and_not_pred)) + \
               (neg_and_pred / (neg_and_pred + neg_and_not_pred))) * 0.5
        if BER > 0.5:
            BER = 1 - BER
        DI = 1 - ((1 - 2 * BER) / (beta + 1 - 2 * BER))

        if i == 0: # consider changing this to a 'code word' instead of 'all'
            DIs["all"] = DI
        else:
            DIs[cols[i-1]] = DI

    jvm.stop()

    return DIs
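# Worked illustration of the BER/DI computation above (hypothetical counts):
# with pos_and_pred=40, pos_and_not_pred=10, neg_and_pred=20, neg_and_not_pred=30,
#   BER = (10/50 + 20/50) * 0.5 = 0.3
# and with beta = 0.5,
#   DI  = 1 - ((1 - 2*0.3) / (0.5 + 1 - 2*0.3)) = 1 - (0.4 / 0.9) ≈ 0.556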
Exemple #53
0
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file(fname)
data.set_class_index(data.num_attributes() - 1)

# plot
pld.scatter_plot(
    data, data.get_attribute_by_name("petalwidth").get_index(),
    data.get_attribute_by_name("petallength").get_index(),
    wait=False)

# add classifier errors to dataset
addcls = Filter(
    classname="weka.filters.supervised.attribute.AddClassification",
    options=["-W", "weka.classifiers.trees.J48", "-classification", "-error"])
addcls.set_inputformat(data)
filtered = addcls.filter(data)
print(filtered)

# build J48
cls = Classifier(classname="weka.classifiers.trees.J48")
cls.build_classifier(data)
evl = Evaluation(data)
evl.test_model(cls, data)

# plot classifier errors
plc.plot_classifier_errors(evl.predictions(), wait=True)

jvm.stop()

Exemple #54
0
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file(fname)
data.class_is_last()

# cross-validate J48
cls = Classifier(classname="weka.classifiers.trees.J48")
evl = Evaluation(data)
evl.crossvalidate_model(cls, data, 10, Random(1))
print("All attributes: %0.0f%%" % evl.percent_correct)

# remove attributes (1) and cross-validate J48
atts = "RI|Mg|Type"
flt = Filter(classname="weka.filters.unsupervised.attribute.RemoveByName", options=["-E", "(" + atts + ")", "-V"])
flt.inputformat(data)
filtered = flt.filter(data)
cls = Classifier(classname="weka.classifiers.trees.J48")
evl = Evaluation(filtered)
evl.crossvalidate_model(cls, filtered, 10, Random(1))
print(atts + ": %0.0f%%" % evl.percent_correct)
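# note: the -V flag inverts RemoveByName's matching, so the attributes listed in
# `atts` are the ones kept rather than removed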

# remove attributes (2) and cross-validate J48
atts = "RI|Na|Mg|Ca|Ba|Type"
flt = Filter(classname="weka.filters.unsupervised.attribute.RemoveByName", options=["-E", "(" + atts + ")", "-V"])
flt.inputformat(data)
filtered = flt.filter(data)
cls = Classifier(classname="weka.classifiers.trees.J48")
evl = Evaluation(filtered)
evl.crossvalidate_model(cls, filtered, 10, Random(1))
print(atts + ": %0.0f%%" % evl.percent_correct)
def main():
    """
    Just runs some example code.
    """

    # load a dataset
    iris = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris)
    loader = Loader("weka.core.converters.ArffLoader")
    data = loader.load_file(iris)

    # remove class attribute
    helper.print_info("Removing class attribute")
    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "last"])
    remove.inputformat(data)
    filtered = remove.filter(data)

    # use MultiFilter
    helper.print_info("Use MultiFilter")
    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "first"])
    std = Filter(classname="weka.filters.unsupervised.attribute.Standardize")
    multi = MultiFilter()
    multi.filters = [remove, std]
    multi.inputformat(data)
    filtered_multi = multi.filter(data)

    # output datasets
    helper.print_title("Input")
    print(data)
    helper.print_title("Output")
    print(filtered)
    helper.print_title("Output (MultiFilter)")
    print(filtered_multi)

    # load text dataset
    text = helper.get_data_dir() + os.sep + "reutersTop10Randomized_1perc_shortened.arff"
    helper.print_info("Loading dataset: " + text)
    loader = Loader("weka.core.converters.ArffLoader")
    data = loader.load_file(text)
    data.class_is_last()

    # apply StringToWordVector
    stemmer = Stemmer(classname="weka.core.stemmers.IteratedLovinsStemmer")
    stopwords = Stopwords(classname="weka.core.stopwords.Rainbow")
    tokenizer = Tokenizer(classname="weka.core.tokenizers.WordTokenizer")
    s2wv = StringToWordVector(options=["-W", "10", "-L", "-C"])
    s2wv.stemmer = stemmer
    s2wv.stopwords = stopwords
    s2wv.tokenizer = tokenizer
    s2wv.inputformat(data)
    filtered = s2wv.filter(data)

    helper.print_title("Input (StringToWordVector)")
    print(data)
    helper.print_title("Output (StringToWordVector)")
    print(filtered)

    # partial classname
    helper.print_title("Creating filter from partial classname")
    clsname = ".Standardize"
    f = Filter(classname=clsname)
    print(clsname + " --> " + f.classname)

    # source code
    helper.print_info("Generate source code")
    bolts = helper.get_data_dir() + os.sep + "labor.arff"
    helper.print_info("Loading dataset: " + bolts)
    loader = Loader("weka.core.converters.ArffLoader")
    data = loader.load_file(bolts)
    replace = Filter(classname="weka.filters.unsupervised.attribute.ReplaceMissingValues")
    replace.inputformat(data)
    replace.filter(data)
    print(replace.to_source("MyReplaceMissingValues", data))
Exemple #56
0
# close csvfile
csvfile.close()

# start JVM
jvm.start()

# load CSV file
loader = Loader(classname="weka.core.converters.CSVLoader", options=["-E", '"', "-F", ","])
data = loader.load_file(csvfilename)
#print(data)

# convert class to nominal
wfilter = Filter(classname="weka.filters.unsupervised.attribute.StringToNominal", options=["-R", "last"])
wfilter.set_inputformat(data)
data = wfilter.filter(data)

# convert content to string
wfilter = Filter(classname="weka.filters.unsupervised.attribute.NominalToString", options=["-C", "first"])
wfilter.set_inputformat(data)
data = wfilter.filter(data)

# set class attribute
data.set_class_index(data.num_attributes() - 1)

# generate baseline
zeror = Classifier(classname="weka.classifiers.rules.ZeroR")
evaluation = Evaluation(data)
evaluation.crossvalidate_model(zeror, data, 10, Random(1))
print("\nBaseline:\n" + evaluation.to_summary())
Exemple #57
0
# load diabetes
loader = Loader(classname="weka.core.converters.ArffLoader")
fname = data_dir + os.sep + "diabetes.arff"
print("\nLoading dataset: " + fname + "\n")
data = loader.load_file(fname)
data.class_is_last()

# simulate the 10 train/test pairs of cross-validation
evl = Evaluation(data)
for i in range(1, 11):  # folds are 1-based
    # create train set
    remove = Filter(
        classname="weka.filters.supervised.instance.StratifiedRemoveFolds",
        options=["-N", "10", "-F", str(i), "-S", "1", "-V"])
    remove.inputformat(data)
    train = remove.filter(data)

    # create test set
    remove = Filter(
        classname="weka.filters.supervised.instance.StratifiedRemoveFolds",
        options=["-N", "10", "-F", str(i), "-S", "1"])
    remove.inputformat(data)
    test = remove.filter(data)

    cls = Classifier(classname="weka.classifiers.trees.J48")
    cls.build_classifier(train)
    evl.test_model(cls, test)

print("Simulated CV accuracy: %0.1f%%" % evl.percent_correct)

# perform actual cross-validation
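# (the snippet ends here; a minimal sketch of that last step, mirroring the J48 /
# 10-fold / seed-1 setup used above)
evl = Evaluation(data)
evl.crossvalidate_model(Classifier(classname="weka.classifiers.trees.J48"), data, 10, Random(1))
print("Actual CV accuracy: %0.1f%%" % evl.percent_correct)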
Exemple #58
0
    def remove_attributes(self, *attributes):
        indices = [self.attribute_index(x) for x in attributes]
        remove = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                        options=["-R", ','.join(str(x + 1) for x in indices)])  # Remove is 1-based
        remove.inputformat(self.instances)
        self.instances = remove.filter(self.instances)
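    # Hypothetical usage, assuming a wrapper object `ds` exposing this method along
    # with attribute_index() and an `instances` dataset:
    #
    #   ds.remove_attributes("sepallength", "sepalwidth")
    #   print(ds.instances.num_attributes())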
Exemple #59
0
from weka.classifiers import Classifier, Evaluation, PredictionOutput
from weka.filters import Filter

jvm.start()

# load diabetes
loader = Loader(classname="weka.core.converters.ArffLoader")
fname = data_dir + os.sep + "diabetes.arff"
print("\nLoading dataset: " + fname + "\n")
data = loader.load_file(fname)
# we'll set the class attribute after filtering

# apply NominalToBinary filter and set class attribute
fltr = Filter("weka.filters.unsupervised.attribute.NominalToBinary")
fltr.inputformat(data)
filtered = fltr.filter(data)
filtered.class_is_last()

# cross-validate LinearRegression on filtered data, display model
cls = Classifier(classname="weka.classifiers.functions.LinearRegression")
pout = PredictionOutput(classname="weka.classifiers.evaluation.output.prediction.PlainText")
evl = Evaluation(filtered)
evl.crossvalidate_model(cls, filtered, 10, Random(1), pout)
print("10-fold cross-validation:\n" + evl.summary())
print("Predictions:\n\n" + str(pout))
cls.build_classifier(filtered)
print("Model:\n\n" + str(cls))

# use AddClassification filter with LinearRegression on filtered data
print("Applying AddClassification to filtered data:\n")
fltr = Filter(
    def attribueSelectionBasedOnRankingInDatabase(self, trainingData, indexInTable, databaseTable, csvFilePath, testingData = None):     
        featureNum = trainingData.num_attributes() - 1
        outputStr = ""
        outputStr += databaseTable+","

        # read the ranked feature list from the database table
        featureList3 = []
        wholefeatureList = []
        dbmgr = permissionMappingManager(databasePath)

        for row in dbmgr.query("select * from " + databaseTable):
            featureList3.append(row[0])
            wholefeatureList.append(row[0])
        #featureList3.reverse()
        
        bestRemainFilterList = []
        resultList = []
        digit = len(featureList3) % 10  # leftover features beyond a multiple of ten; removed first

        bestAccuracy = 0
        bestTrainingData = None
        bestTestingData = None
        bestEvaluation = None
        
        classifier = self.algorithmPicker(trainingData, indexInTable)
        evaluation = self.evaluation(classifier, trainingData, testingData)
        if evaluation.percent_correct() >= bestAccuracy:
            bestAccuracy = evaluation.percent_correct()
            bestTrainingData = trainingData
            bestTestingData = testingData
            bestRemainFilterList = list(featureList3)
            bestEvaluation = evaluation
            
        print(self.algorithmTable[indexInTable] + ": " + "{:.2f}".format(evaluation.percent_correct()) + ", Feature select number:" + str(trainingData.num_attributes() - 1) + "/" + str(featureNum))
        resultList.append("{:.2f}".format(evaluation.percent_correct()))
        
        if digit > 0:
            for i in range(0, digit):
                functionName = featureList3.pop().split("(")[0] + "\\(\\)"  # escape () for the RemoveByName regex
                functionName = functionName.replace('$', '\\$')
                #print "functionName:" + functionName
                remove = Filter(classname="weka.filters.unsupervised.attribute.RemoveByName", options=["-E", "^" + functionName + ".*$"])
                remove.set_inputformat(trainingData)
                trainingData = remove.filter(trainingData)
                if testingData:
                    remove.set_inputformat(testingData)
                    testingData = remove.filter(testingData)
                
                #print "i:" + str(i)
                #print "functionName:" + functionName
                #print "featureNum: " + str(filteredData.num_attributes() - 1)
            #for attributeStr in trainingData.attributes():
            #    print(attributeStr)
            #self.printFunctionInfo(trainingData, trainingData.num_instances())
            
            classifier = self.algorithmPicker(trainingData, indexInTable)
            evaluation = self.evaluation(classifier, trainingData, testingData)
            if evaluation.percent_correct() >= bestAccuracy:
                bestAccuracy = evaluation.percent_correct()
                bestTrainingData = trainingData
                bestTestingData = testingData
                bestRemainFilterList = list(featureList3)
                bestEvaluation = evaluation
                
            print(self.algorithmTable[indexInTable] + ": " + "{:.2f}".format(evaluation.percent_correct()) + ", Feature select number:" + str(trainingData.num_attributes() - 1) + "/" + str(featureNum))
            resultList.append("{:.2f}".format(evaluation.percent_correct()))
            
        while trainingData.num_attributes() - 1 > 10:
            for i in range(0,10):
                functionName = featureList3.pop().split("(")[0] + "\\(\\)"
                functionName = functionName.replace('$', '\\$')
                #print "functionName:" + functionName
                remove = Filter(classname="weka.filters.unsupervised.attribute.RemoveByName", options=["-E", "^" + functionName + ".*$"])
                remove.set_inputformat(trainingData)
                trainingData = remove.filter(trainingData)
                if testingData:
                    remove.set_inputformat(testingData)
                    testingData = remove.filter(testingData)
                #print functionName
                #print "featureNum: " + str(filteredData.num_attributes() - 1)
                
            #for attributeStr in trainingData.attributes():
            #    print(attributeStr)
            
            classifier = self.algorithmPicker(trainingData, indexInTable)
            evaluation = self.evaluation(classifier, trainingData, testingData)
            if evaluation.percent_correct() >= bestAccuracy:
                
                bestAccuracy = evaluation.percent_correct()
                bestTrainingData = trainingData
                bestTestingData = testingData
                bestRemainFilterList = list(featureList3)
                bestEvaluation = evaluation
                #print "update feature number:" + str(len(bestRemainFilterList))
                
            print(self.algorithmTable[indexInTable] + ": " + "{:.2f}".format(evaluation.percent_correct()) + ", Feature select number:" + str(trainingData.num_attributes() - 1) + "/" + str(featureNum))
            resultList.append("{:.2f}".format(evaluation.percent_correct()))

        resultList.reverse()
        
        filteredFeatureList = []
        #print "bestRemainFilterList number:" + str(len(bestRemainFilterList))
        #print "wholefeatureList number:" + str(len(wholefeatureList))
        for item in wholefeatureList:
            if item not in bestRemainFilterList:
                filteredFeatureList.append(item)

        #print "update filteredFeatureList number:" + str(len(filteredFeatureList))
        for item in resultList:
            outputStr += item +","
        outputStr = outputStr[0:-1] + "\n"
        
        print(outputStr)
        self.writeToPath(csvFilePath, outputStr)
        accuracyStr = "{:.2f}".format(bestAccuracy)
        #print filteredFeatureList
        return [bestEvaluation, bestTrainingData, bestTestingData, resultList]