def readFeature(num_features,type,select_feature,numtrees):
    """Train an RF classifier on a train CSV and score it on the matching test CSV.

    The two file names are formed from the module-level ``resultFile`` prefix
    followed by ``type``, ``num_features`` and ``select_feature`` (joined with
    underscores) and the suffix ``_train.csv`` / ``_test.csv``.  All three
    arguments are concatenated with strings, so they must be strings.  In
    both files the last attribute is used as the class attribute.

    ``numtrees`` is passed to ``RF.setNumTrees`` (RF is presumably Weka's
    RandomForest imported under that alias -- confirm against the imports).

    The evaluation summary, per-class details and confusion matrix are
    printed.  Returns a list of nine values, each requested for class
    index 1: precision, recall, F-measure, Matthews correlation coefficient,
    true positives, false positives, true negatives, false negatives and
    area under ROC.
    """
    # Both splits share everything up to the final suffix.
    stem = resultFile + '_' + type + '_' + num_features + '_' + select_feature
    train_path = stem + '_train.csv'
    test_path = stem + '_test.csv'

    # Load the training split and fit the forest on it.
    csv_loader = CSVLoader()
    csv_loader.setSource(File(train_path))
    train_set = csv_loader.getDataSet()
    train_set.setClassIndex(train_set.numAttributes() - 1)

    forest = RF()
    forest.setNumTrees(numtrees)
    forest.buildClassifier(train_set)

    # Re-point the same loader at the test split.
    csv_loader.setSource(File(test_path))
    test_set = Instances(csv_loader.getDataSet())
    test_set.setClassIndex(test_set.numAttributes() - 1)

    # Prediction sink handed to evaluateModel; its buffer is not read here.
    predictions = StringBuffer()
    sink = PlainText()
    sink.setHeader(test_set)
    sink.setBuffer(predictions)
    attribute_range = Range()          # attributes to output
    with_distribution = Boolean(True)

    evaluator = Evaluation(train_set)
    evaluator.evaluateModel(
        forest, test_set, [sink, attribute_range, with_distribution])

    print(evaluator.toSummaryString())
    print(evaluator.toClassDetailsString())
    print(evaluator.toMatrixString())

    target = 1  # class index every metric below is requested for
    return [
        evaluator.precision(target),
        evaluator.recall(target),
        evaluator.fMeasure(target),
        evaluator.matthewsCorrelationCoefficient(target),
        evaluator.numTruePositives(target),
        evaluator.numFalsePositives(target),
        evaluator.numTrueNegatives(target),
        evaluator.numFalseNegatives(target),
        evaluator.areaUnderROC(target),
    ]
def readCross(num,type,numtrees):
    """Run 10-fold cross-validation of an RF classifier on one CSV file.

    The file name is the module-level ``resultFile`` prefix followed by
    ``type`` and ``num`` (joined with underscores) and the suffix
    ``_all.csv``; both arguments are concatenated with strings, so they must
    be strings.  The last attribute is used as the class attribute and
    ``numtrees`` is passed to ``RF.setNumTrees``.  Folds are drawn with
    ``Random(1)``.

    The evaluation summary, per-class details and confusion matrix are
    printed.  Returns a list of nine weighted metrics: precision, recall,
    F-measure, Matthews correlation, false negative rate, false positive
    rate, true positive rate, true negative rate and area under ROC.
    """
    csv_path = resultFile + '_' + type + '_' + num + '_all.csv'

    csv_loader = CSVLoader()
    csv_loader.setSource(File(csv_path))
    dataset = csv_loader.getDataSet()
    dataset.setClassIndex(dataset.numAttributes() - 1)

    forest = RF()
    forest.setNumTrees(numtrees)

    # Prediction sink handed to crossValidateModel; its buffer is not read here.
    predictions = StringBuffer()
    sink = PlainText()
    sink.setHeader(dataset)
    sink.setBuffer(predictions)
    sink.setOutputDistribution(True)
    attribute_range = Range()          # attributes to output
    with_distributions = Boolean(True)

    evaluator = Evaluation(dataset)
    evaluator.crossValidateModel(
        forest, dataset, 10, Random(1),
        [sink, attribute_range, with_distributions])

    print(evaluator.toSummaryString())
    print(evaluator.toClassDetailsString())
    print(evaluator.toMatrixString())

    return [
        evaluator.weightedPrecision(),
        evaluator.weightedRecall(),
        evaluator.weightedFMeasure(),
        evaluator.weightedMatthewsCorrelation(),
        evaluator.weightedFalseNegativeRate(),
        evaluator.weightedFalsePositiveRate(),
        evaluator.weightedTruePositiveRate(),
        evaluator.weightedTrueNegativeRate(),
        evaluator.weightedAreaUnderROC(),
    ]
def evaluate_dataset(classifier,data):
    """Evaluate ``classifier`` on ``data`` and return the Evaluation object.

    ``data`` serves both as the header used to construct the Evaluation and
    as the set of instances being scored.  A PlainText prediction output
    backed by a fresh StringBuffer is passed to ``evaluateModel``; the buffer
    itself is not returned.
    """
    result = Evaluation(data)

    # Prediction sink required by evaluateModel's extra arguments.
    sink = PlainText()
    sink.setHeader(data)
    sink.setBuffer(StringBuffer())

    result.evaluateModel(classifier, data, [sink])
    return result
# Sweep the J48 confidence factor over the full dataset, logging build and
# evaluation times and writing one "<CF>,<root mean squared error>" row per
# setting to `file`.
# Relies on `data`, `file` and `bufsize` being defined earlier in the script.
logfile = "logs/" + str(os.path.splitext(os.path.basename(__file__))[0]) + "_" + \
    str(os.path.splitext(os.path.basename(sys.argv[1]))[0]) + "_tunable.log"
log = open(logfile, 'w', bufsize)  # open general log file

# loop for different confidence factors using the full dataset
data.setClassIndex(data.numAttributes() - 1)
# The sweep starts at 0.05 rather than 0.0.
# NOTE(review): J48 appears to require 0 < C < 1 (confirm against the J48
# documentation). The original 0.0 point only "worked" because the factor
# was applied after the tree had already been built and was therefore ignored.
for num in [x * 0.05 for x in range(1, 10)]:
    log.write("---------------------------------\nCF: " + str(num) + "\n")
    algo = J48()
    # The confidence factor must be set BEFORE buildClassifier; setting it
    # afterwards leaves the already-built tree untouched, so every iteration
    # would evaluate the same default-pruned model.
    algo.setConfidenceFactor(num)
    x = time.time()
    algo.buildClassifier(data)
    log.write("Time to build classifier: " + str(time.time() - x) + "\n")
    evaluation = Evaluation(data)
    output = PlainText()  # plain text output for predictions
    output.setHeader(data)
    buffer = StringBuffer()  # buffer to use
    output.setBuffer(buffer)
    attRange = Range()  # no additional attributes output
    outputDistribution = Boolean(False)  # we don't want distribution
    x = time.time()
    # Evaluated on the same data the tree was trained on.
    evaluation.evaluateModel(algo, data, [output, attRange, outputDistribution])
    log.write("Time to evaluate model: " + str(time.time() - x) + "\n")
    log.write(evaluation.toSummaryString())
    file.write(str(num) + "," + str(evaluation.rootMeanSquaredError()) + "\n")
    # create graph
    graphfilename = "image/" + str(os.path.splitext(os.path.basename(__file__))[0]) + "_" + \
        str(os.path.splitext(os.path.basename(sys.argv[1]))[0]) + "_" + str(num) + ".dot"
    # NOTE(review): the handle is opened but never written or closed in the
    # visible code; the snippet looks truncated here -- confirm the remainder.
    graphfile = open(graphfilename, 'wb')
# Train a J48 tree on the ARFF file named on the command line, evaluate it
# on that same data, and print the model followed by the evaluation summary.

# check commandline parameters
if len(sys.argv) != 2:
    print("Usage: UsingJ48Ext.py <ARFF-file>")
    sys.exit(1)  # non-zero status so callers can detect the usage error

# load data file
print("Loading data...")
file = FileReader(sys.argv[1])
try:
    data = Instances(file)
finally:
    # The reader is only needed while the instances are parsed; close it so
    # the file handle is not leaked.
    file.close()

# set the class Index - the index of the dependent variable
data.setClassIndex(data.numAttributes() - 1)

# create the model
evaluation = Evaluation(data)
output = PlainText()  # plain text output for predictions
output.setHeader(data)
buffer = StringBuffer()  # buffer to use
output.setBuffer(buffer)
attRange = Range()  # no additional attributes output
outputDistribution = Boolean(False)  # we don't want distribution
j48 = J48()
j48.buildClassifier(data)  # only a trained classifier can be evaluated
evaluation.evaluateModel(j48, data, [output, attRange, outputDistribution])

# print out the built model
print("--> Generated model:\n")
print(j48)

print("--> Evaluation:\n")
print(evaluation.toSummaryString())
# Learning-curve run: for growing prefixes of `fulltrainset`, train a J48 tree
# and record its percentage of incorrectly classified instances on `testset`
# and on `fulltrainset`, one CSV row per training-set size.
# Relies on `datafilelimit`, `bufsize`, `classifiername`, `dataname`,
# `crossvalidate`, `p`, `fulltrainset`, `testset` and `rand` being defined
# earlier in the script.
filelimit = open(datafilelimit, 'w', bufsize)
try:
    filelimit.write("instances,pctincorrecttest,pctincorrecttrain\n")
    logfile = "logs/" + classifiername + "_" + dataname + crossvalidate + ".log"
    log = open(logfile, 'w', bufsize)  # open general log file

    # Clamp the step to at least 1: when the dataset has fewer instances than
    # requested data points the integer division yields 0, which range()
    # rejects with a ValueError.
    step = max(1, fulltrainset.numInstances() // int(p['j48.numdatapoints']))
    for num in range(int(p['j48.initial']), fulltrainset.numInstances(), step):
        filelimit.write(str(num))
        trainset = Instances(fulltrainset, 0, num)  # create training set
        trainset.setClassIndex(trainset.numAttributes() - 1)
        log.write("---------------------------------\nTraining Set Size: " + str(trainset.numInstances()) + ", Test Set Size: " + str(testset.numInstances()) + ", Full data set size: " + str(fulltrainset.numInstances()) + "\n")

        # Build once per training-set size; the model does not depend on which
        # dataset is evaluated next.  The confidence factor must be set BEFORE
        # buildClassifier, otherwise the configured 'j48.C' has no effect on
        # the tree used by evaluateModel.
        algo = J48()
        algo.setConfidenceFactor(float(p['j48.C']))
        algo.buildClassifier(trainset)

        # Order matches the CSV header: test set first, then full training set.
        for dataset in [testset, fulltrainset]:
            evaluation = Evaluation(trainset)
            output = PlainText()  # plain text output for predictions
            output.setHeader(trainset)
            buffer = StringBuffer()  # buffer to use
            output.setBuffer(buffer)
            attRange = Range()  # no additional attributes output
            outputDistribution = Boolean(False)  # we don't want distribution
            x = time.time()
            if int(crossvalidate):
                evaluation.crossValidateModel(algo, dataset, 10, rand, [output, attRange, outputDistribution])
            else:
                evaluation.evaluateModel(algo, dataset, [output, attRange, outputDistribution])
            log.write("Time to evaluate model: " + str(time.time() - x) + "\n")
            log.write(evaluation.toSummaryString())
            filelimit.write("," + str(evaluation.pctIncorrect()))
        filelimit.write("\n")
finally:
    # Always flush and release the results file, even if a run fails midway.
    filelimit.close()