def learn(self, classAttr, unpruned=False, minNumObj=2):
    self.instances = self._getInstances(classAttr)
    j48 = J48()
    j48.setUnpruned(unpruned)
    j48.setMinNumObj(minNumObj)
    #j48.setConfidenceFactor(1.0)
    j48.buildClassifier(self.instances)
    self.classifier = j48
    print j48
def learn(self, classAttr, unpruned=False, minNumObj=2):
    self.instances = self._getInstances(classAttr)
    j48 = J48()
    j48.setUnpruned(unpruned)
    j48.setMinNumObj(minNumObj)
    classifier = Bagging()   # RandomForest() could be used here instead
    classifier.setClassifier(j48)
    classifier.buildClassifier(self.instances)
    self.classifier = classifier
    print classifier
def learn(self, classAttr, unpruned=False, minNumObj=2):
    self.instances = self._getInstances(classAttr)
    tree = J48()   # DecisionStump() is another common weak learner
    classifier = AdaBoostM1()
    #classifier.setDebug(True)
    classifier.setClassifier(tree)
    #classifier.setNumIterations(50)
    #tree.setConfidenceFactor(1.0)
    classifier.buildClassifier(self.instances)
    self.classifier = classifier
    print "numIterations", classifier.getNumIterations()
    print classifier
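# The three learn() variants above (plain J48, bagged J48, boosted J48) assume
# the Weka classes are already imported into the Jython namespace. A minimal
# sketch of those imports plus hypothetical usage; 'Learner' is a stand-in for
# whatever class defines learn() and _getInstances() in the original code:
from weka.classifiers.trees import J48, DecisionStump
from weka.classifiers.meta import Bagging, AdaBoostM1

learner = Learner("data/iris.arff")                   # hypothetical wrapper class
learner.learn("class", unpruned=False, minNumObj=5)
print learner.classifier                              # prints the trained model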
# load data file
print "Loading data..."
file = FileReader(sys.argv[1])
data = Instances(file)

# set the class index - the index of the dependent variable
data.setClassIndex(data.numAttributes() - 1)

# create the model
evaluation = Evaluation(data)
output = PlainText()                  # plain text output for predictions
output.setHeader(data)
buffer = StringBuffer()               # buffer to hold the predictions
output.setBuffer(buffer)
attRange = Range()                    # no additional attributes in the output
outputDistribution = Boolean(False)   # we don't want the class distribution
j48 = J48()
j48.buildClassifier(data)             # only a trained classifier can be evaluated
evaluation.evaluateModel(j48, data, [output, attRange, outputDistribution])

# print out the built model
print "--> Generated model:\n"
print j48
print "--> Evaluation:\n"
print evaluation.toSummaryString()
print "--> Predictions:\n"
print buffer
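# Evaluating on the training data, as above, gives an optimistic error
# estimate. For a held-out estimate, Weka's Evaluation.crossValidateModel can
# replace the evaluateModel call; a sketch, with an arbitrary fold count and
# seed (Random is java.util.Random):
from java.util import Random

evaluation = Evaluation(data)
# crossValidateModel trains and tests a fresh copy of the classifier per fold
evaluation.crossValidateModel(J48(), data, 10, Random(1),
                              [output, attRange, outputDistribution])
print evaluation.toSummaryString()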
# check commandline parameters
if (not (len(sys.argv) == 2)):
    print "Usage: supervised.py <ARFF-file>"
    sys.exit()

# load data file
print "Loading data..."
file = FileReader(sys.argv[1])
data = Instances(file)

# set the class index - the index of the dependent variable
data.setClassIndex(data.numAttributes() - 1)

# define the algorithms to be used
algo_list = [(NaiveBayes(), 'NaiveBayes'), (BayesNet(), 'BayesNet'),
             (J48(), 'J48'), (JRip(), 'JRip'), (KStar(), 'KStar'),
             (RandomForest(), 'RandomForest'), (AdaBoostM1(), 'AdaBoostM1'),
             (MultilayerPerceptron(), 'MultilayerPerceptron'),
             (LibSVM(), 'LibSVM')]
algo_dict = dict([(x[1], x[0]) for x in algo_list])
algo_keys = ['NaiveBayes', 'J48', 'BayesNet', 'JRip', 'RandomForest', 'KStar',
             'AdaBoostM1', 'LibSVM', 'MultilayerPerceptron']

# example: set the kernel type on LibSVM; the default is 2 (RBF)
#algo = algo_dict['LibSVM']
#tag = SelectedTag("1", algo.TAGS_KERNELTYPE)   # 0 = linear, 1 = polynomial, 2 = radial basis function, 3 = sigmoid
#algo.setKernelType(tag)

# train classifiers, but filter out the name column first
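# The excerpt breaks off before the filtering step it announces. One way to
# drop a name column in Weka is the unsupervised Remove filter; a sketch,
# assuming the name column is the first attribute (the index "1" is an
# assumption):
from weka.filters import Filter
from weka.filters.unsupervised.attribute import Remove

remove = Remove()
remove.setAttributeIndices("1")   # 1-based attribute range to delete
remove.setInputFormat(data)       # must be called after setting options
filtered = Filter.useFilter(data, remove)
filtered.setClassIndex(filtered.numAttributes() - 1)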
bufsize = 0
datafile = "data/plot/" + str(os.path.splitext(os.path.basename(__file__))[0]) + "_" + \
           str(os.path.splitext(os.path.basename(sys.argv[1]))[0]) + "_rmse.csv"
file = open(datafile, 'w', bufsize)
file.write("cf,rmse\n")
logfile = "logs/" + str(os.path.splitext(os.path.basename(__file__))[0]) + "_" + \
          str(os.path.splitext(os.path.basename(sys.argv[1]))[0]) + "_tunable.log"
log = open(logfile, 'w', bufsize)   # open general log file

# loop over different confidence-factor values using the full dataset
data.setClassIndex(data.numAttributes() - 1)
for num in [x * 0.05 for x in range(0, 10)]:
    log.write("---------------------------------\nCF: " + str(num) + "\n")
    algo = J48()
    algo.setConfidenceFactor(num)   # must be set *before* buildClassifier
    x = time.time()
    algo.buildClassifier(data)
    log.write("Time to build classifier: " + str(time.time() - x) + "\n")
    evaluation = Evaluation(data)
    output = PlainText()                  # plain text output for predictions
    output.setHeader(data)
    buffer = StringBuffer()               # buffer to hold the predictions
    output.setBuffer(buffer)
    attRange = Range()                    # no additional attributes in the output
    outputDistribution = Boolean(False)   # we don't want the class distribution
    x = time.time()
    evaluation.evaluateModel(algo, data, [output, attRange, outputDistribution])
    #evaluation.crossValidateModel(algo, data, 10, rand, [output, attRange, outputDistribution])
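# The loop is cut off before it writes to the cf,rmse CSV it opened above. A
# sketch of how each row could be emitted from the Evaluation object
# (rootMeanSquaredError() is Weka's RMSE accessor; the evaluate-time log line
# mirrors the build-time logging and is an assumption):
    log.write("Time to evaluate classifier: " + str(time.time() - x) + "\n")
    log.write(evaluation.toSummaryString())
    file.write(str(num) + "," + str(evaluation.rootMeanSquaredError()) + "\n")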
filelimit.write("instances,pctincorrecttest,pctincorrecttrain\n")
logfile = "logs/" + classifiername + "_" + dataname + crossvalidate + ".log"
log = open(logfile, 'w', bufsize)   # open general log file
timefilename = "data/plot/" + classifiername + "_" + dataname + crossvalidate + "_traintime.csv"
timefile = open(timefilename, 'w', bufsize)
timefile.write("instances,timetest,timetrain\n")

for num in range(int(p['adaboost.initial']), fulltrainset.numInstances(),
                 fulltrainset.numInstances() / int(p['adaboost.numdatapoints'])):
    trainset = Instances(fulltrainset, 0, num)   # training set = first num instances
    trainset.setClassIndex(trainset.numAttributes() - 1)
    log.write("---------------------------------\nTraining Set Size: " + str(trainset.numInstances()) +
              ", Test Set Size: " + str(testset.numInstances()) +
              ", Full data set size: " + str(fulltrainset.numInstances()) + "\n")
    filelimit.write(str(num))
    timefile.write(str(num))
    for dataset in [testset, fulltrainset]:
        algo = AdaBoostM1()
        weaklearner = J48()
        algo.setClassifier(weaklearner)
        algo.setNumIterations(int(p['adaboost.iterations']))
        x = time.time()
        algo.buildClassifier(trainset)
        timefile.write("," + str(time.time() - x))   # record training time immediately
        evaluation = Evaluation(trainset)
        output = PlainText()                  # plain text output for predictions
        output.setHeader(trainset)
        buffer = StringBuffer()               # buffer to hold the predictions
        output.setBuffer(buffer)
        attRange = Range()                    # no additional attributes in the output
        outputDistribution = Boolean(False)   # we don't want the class distribution
        x = time.time()
        if (int(crossvalidate)):
            evaluation.crossValidateModel(algo, dataset, 10, rand, [output, attRange, outputDistribution])
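# Only the cross-validation branch survives in this excerpt. A sketch of the
# non-cross-validated branch and the per-iteration CSV writes; feeding
# pctIncorrect() into the pctincorrect* columns is an assumption suggested by
# the header written above:
        else:
            evaluation.evaluateModel(algo, dataset, [output, attRange, outputDistribution])
        filelimit.write("," + str(evaluation.pctIncorrect()))
    filelimit.write("\n")
    timefile.write("\n")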
# check commandline parameters
if (not (len(sys.argv) == 2)):
    print "Usage: supervised.py <ARFF-file>"
    sys.exit()

# load data file
print "Loading data..."
file = FileReader(sys.argv[1])
data = Instances(file)

# set the class index - the index of the dependent variable
data.setClassIndex(data.numAttributes() - 1)

# define the algorithms to be used
algo_list = [(NaiveBayes(), 'NaiveBayes'), (BayesNet(), 'BayesNet'),
             (J48(), 'J48'), (JRip(), 'JRip'), (KStar(), 'KStar'),
             (RandomForest(), 'RandomForest'), (AdaBoostM1(), 'AdaBoostM1'),
             (MultilayerPerceptron(), 'MultilayerPerceptron'),
             (LibSVM(), 'LibSVM')]
algo_dict = dict([(x[1], x[0]) for x in algo_list])
algo_keys = ['NaiveBayes', 'J48', 'BayesNet', 'JRip', 'RandomForest', 'KStar',
             'AdaBoostM1', 'LibSVM', 'MultilayerPerceptron']

# example: set the kernel type on LibSVM; the default is 2 (RBF)
algo = algo_dict['LibSVM']
tag = SelectedTag("1", algo.TAGS_KERNELTYPE)   # 0 = linear, 1 = polynomial, 2 = radial basis function, 3 = sigmoid
algo.setKernelType(tag)

# train classifiers
print "Training classifiers..."
for key in algo_keys:
    algo = algo_dict[key]
    algo.buildClassifier(data)
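# The loop above only trains each model. A sketch of how the trained list
# could then be scored, reusing the cross-validation pattern from the earlier
# snippets (the fold count and seed are arbitrary choices):
from java.util import Random

print "Evaluating classifiers..."
for key in algo_keys:
    evaluation = Evaluation(data)
    evaluation.crossValidateModel(algo_dict[key], data, 10, Random(1), [])
    print key, "pctCorrect:", evaluation.pctCorrect()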