def runClassifierAlgo(algo, class_index, training_filename, test_filename, do_model, do_eval, do_predict):
    """ Evaluate classifier algorithm <algo> and return its results as strings.

        If <test_filename>
            Run classifier algorithm <algo> on training data in <training_filename> to build a model
            then test on data in <test_filename> (equivalent of Weka "Supplied test set")
        else
            do a 4 fold cross-validation (random seed 1) of classifier algorithm <algo>
            on data in <training_filename>. <algo> itself is not built in this case and
            no per-instance predictions are collected.

        <class_index> is the column containing the dependent variable

        The evaluation is only run when <do_eval> or <do_predict> is true.
        <do_model>, <do_eval> and <do_predict> also select what is printed when the
        module-level <verbose> flag is set.

        Returns a dict of strings:
            'model'   : str(algo)
            'eval'    : the evaluation summary, or '' when no evaluation was run
            'predict' : the predictions buffer, '' when nothing was written to it

        http://weka.wikispaces.com/Generating+classifier+evaluation+output+manually
        http://weka.sourceforge.net/doc.dev/weka/classifiers/Evaluation.html
    """
    def _load_instances(path):
        # Instances(reader) reads the data set in the constructor, so the
        # reader can be released as soon as the constructor returns.
        reader = FileReader(path)
        try:
            return Instances(reader)
        finally:
            reader.close()

    # Single-argument print keeps the console output identical to the
    # original comma-separated print statement.
    print(' runClassifierAlgo: training_filename=  %s , test_filename= %s'
          % (training_filename, test_filename))
    misc.checkExists(training_filename)

    training_data = _load_instances(training_filename)
    if test_filename:
        test_data = _load_instances(test_filename)
    else:
        test_data = training_data

    # set the class Index - the index of the dependent variable
    training_data.setClassIndex(class_index)
    test_data.setClassIndex(class_index)

    # create the model (cross-validation trains its own copies, so the model
    # is only built here when a separate test set is supplied)
    if test_filename:
        algo.buildClassifier(training_data)

    evaluation = None
    # Created unconditionally so the return value is well defined even when
    # no evaluation is requested.
    predictions = StringBuffer()
    if do_eval or do_predict:
        evaluation = Evaluation(test_data)
        if test_filename:
            attRange = Range()                   # no additional attributes output
            outputDistribution = Boolean(False)  # we don't want distribution
            evaluation.evaluateModel(algo, test_data, [predictions, attRange, outputDistribution])
        else:
            evaluation.crossValidateModel(algo, training_data, 4, Random(1))

    # Previously evaluation.toSummaryString() was called on None when neither
    # do_eval nor do_predict was set.
    summary = ''
    if evaluation is not None:
        summary = str(evaluation.toSummaryString())

    # NOTE(review): `verbose` is not defined in this function; it is expected
    # to be a module-level flag - confirm it exists where this is used.
    if verbose:
        if do_model:
            print('--> Generated model:\n')
            print(algo.toString())
        if do_eval:
            print('--> Evaluation:\n')
            print(summary)
        if do_predict:
            print('--> Predictions:\n')
            print(predictions)

    return {'model': str(algo), 'eval': summary, 'predict': str(predictions)}
def readFeature(num_features,type,numtrees):
    """Train an RF classifier on a train split and score it on the matching test split.

    The data files are
        <resultFile>_<type>_<num_features>_train.arff
        <resultFile>_<type>_<num_features>_test.arff
    where <resultFile> is a module-level prefix. In both files the last
    attribute is used as the class.

    num_features : feature-count part of the file name (string or number)
    type         : data-set type part of the file name
    numtrees     : value passed to RF.setNumTrees()

    Prints the evaluation summary, the per-class details and the confusion
    matrix, then returns for class index 0:
        [precision, recall, fMeasure, matthewsCorrelationCoefficient,
         numTruePositives, numFalsePositives, numTrueNegatives,
         numFalseNegatives, areaUnderROC]
    """
    def _load_arff(path):
        # Instances(reader) reads the data set in the constructor, so the
        # reader can be released as soon as the constructor returns.
        reader = FileReader(path)
        try:
            dataset = Instances(reader)
        finally:
            reader.close()
        # the class attribute is the last column
        dataset.setClassIndex(dataset.numAttributes() - 1)
        return dataset

    # %s formatting also accepts a numeric num_features
    stem = '%s_%s_%s' % (resultFile, type, num_features)

    print("Loading data......")
    train_data = _load_arff(stem + '_train.arff')

    rf = RF()
    rf.setNumTrees(numtrees)
    rf.buildClassifier(train_data)

    test_data = _load_arff(stem + '_test.arff')

    # Prediction printer required by evaluateModel's extra arguments; the
    # collected text is not used by this function.
    predictions = StringBuffer()
    output = PlainText()
    output.setHeader(test_data)
    output.setBuffer(predictions)
    attRange = Range()                  # attributes to output
    outputDistribution = Boolean(True)

    # The evaluation is set up from the training data and then run on the test set.
    evaluator = Evaluation(train_data)
    evaluator.evaluateModel(rf, test_data, [output, attRange, outputDistribution])

    print(evaluator.toSummaryString())
    print(evaluator.toClassDetailsString())
    print(evaluator.toMatrixString())

    # NOTE(review): readCross in this file reports class index 1 - confirm
    # that class index 0 is the intended class here.
    return [evaluator.precision(0),
            evaluator.recall(0),
            evaluator.fMeasure(0),
            evaluator.matthewsCorrelationCoefficient(0),
            evaluator.numTruePositives(0),
            evaluator.numFalsePositives(0),
            evaluator.numTrueNegatives(0),
            evaluator.numFalseNegatives(0),
            evaluator.areaUnderROC(0)]
def readCross(num,type,select_feature,numtrees):
    """Run a 10 fold cross-validation (random seed 1) of an RF classifier on a CSV file.

    The data file is <resultFile>_<type>_<num>_<select_feature>_all.csv, where
    <resultFile> is a module-level prefix; its last attribute is used as the
    class. <numtrees> is passed to RF.setNumTrees().

    Prints the evaluation summary, the per-class details and the confusion
    matrix, then returns for class index 1:
        [precision, recall, fMeasure, matthewsCorrelationCoefficient,
         numTruePositives, numFalsePositives, numTrueNegatives,
         numFalseNegatives, areaUnderROC]
    """
    csv_path = '_'.join((resultFile, type, num, select_feature)) + '_all.csv'

    csv_loader = CSVLoader()
    csv_loader.setSource(File(csv_path))
    dataset = csv_loader.getDataSet()
    # the class attribute is the last column
    dataset.setClassIndex(dataset.numAttributes() - 1)

    forest = RF()
    forest.setNumTrees(numtrees)

    # Prediction printer (with class distributions) handed to the
    # cross-validation as its extra arguments.
    prediction_text = StringBuffer()
    printer = PlainText()
    printer.setHeader(dataset)
    printer.setBuffer(prediction_text)
    printer.setOutputDistribution(True)
    printing_args = [printer, Range(), Boolean(True)]

    cv = Evaluation(dataset)
    cv.crossValidateModel(forest, dataset, 10, Random(1), printing_args)

    for render in (cv.toSummaryString, cv.toClassDetailsString, cv.toMatrixString):
        print(render())

    target_class = 1
    metrics = (
        cv.precision,
        cv.recall,
        cv.fMeasure,
        cv.matthewsCorrelationCoefficient,
        cv.numTruePositives,
        cv.numFalsePositives,
        cv.numTrueNegatives,
        cv.numFalseNegatives,
        cv.areaUnderROC,
    )
    return [metric(target_class) for metric in metrics]