def readFeature(num_features,type,select_feature,numtrees):
    """Train a random forest on a train CSV and evaluate it on the test CSV.

    The files are located by name, using the module-level prefix
    ``resultFile``:

        <resultFile>_<type>_<num_features>_<select_feature>_train.csv
        <resultFile>_<type>_<num_features>_<select_feature>_test.csv

    In both files the last attribute is used as the class attribute.
    The summary, per-class details and confusion matrix are printed.

    Args:
        num_features: file-name component; strings and numbers are both
            accepted because the name is built with ``%s`` formatting.
        type: file-name component (shadows the builtin, but the name is
            part of the public signature and is therefore kept).
        select_feature: file-name component.
        numtrees: value passed to ``setNumTrees`` on the random forest.

    Returns:
        A list of nine values, all computed for class index 1, in this
        order: precision, recall, F-measure, Matthews correlation
        coefficient, true positives, false positives, true negatives,
        false negatives, area under ROC.
    """
    prefix = '%s_%s_%s_%s' % (resultFile, type, num_features, select_feature)
    train_path = prefix + '_train.csv'
    test_path = prefix + '_test.csv'

    # Training data: last column is the class.
    loader = CSVLoader()
    loader.setSource(File(train_path))
    data = loader.getDataSet()
    data.setClassIndex(data.numAttributes() - 1)

    rf = RF()
    rf.setNumTrees(numtrees)
    rf.buildClassifier(data)

    # Test data is read through the same loader, pointed at the second file.
    loader.setSource(File(test_path))
    test_data = Instances(loader.getDataSet())
    test_data.setClassIndex(test_data.numAttributes() - 1)

    # Prediction printer; its buffer is filled during evaluation but is not
    # read again in this function.
    predictions = StringBuffer()
    output = PlainText()
    output.setHeader(test_data)
    output.setBuffer(predictions)
    # Same setting readCross applies to its printer; matches Boolean(True) below.
    output.setOutputDistribution(True)

    # NOTE(review): newer Weka releases appear to use only the output object
    # from this list; Range/Boolean are kept for older signatures - confirm
    # against the deployed Weka version.
    attRange = Range()  # attributes to output
    outputDistribution = Boolean(True)
    evaluator = Evaluation(data)
    evaluator.evaluateModel(rf, test_data, [output, attRange, outputDistribution])

    print(evaluator.toSummaryString())
    print(evaluator.toClassDetailsString())
    print(evaluator.toMatrixString())
    return [
        evaluator.precision(1),
        evaluator.recall(1),
        evaluator.fMeasure(1),
        evaluator.matthewsCorrelationCoefficient(1),
        evaluator.numTruePositives(1),
        evaluator.numFalsePositives(1),
        evaluator.numTrueNegatives(1),
        evaluator.numFalseNegatives(1),
        evaluator.areaUnderROC(1),
    ]
def readCross(num,type,numtrees):
    """Run 10-fold cross-validation of a random forest on one CSV file.

    The file is ``<resultFile>_<type>_<num>_all.csv`` where ``resultFile``
    is a module-level prefix; its last attribute is used as the class.
    The summary, per-class details and confusion matrix are printed.

    Args:
        num: file-name component; strings and numbers are both accepted
            because the name is built with ``%s`` formatting.
        type: file-name component (shadows the builtin, but the name is
            part of the public signature and is therefore kept).
        numtrees: value passed to ``setNumTrees`` on the random forest.

    Returns:
        A list of nine weighted metrics in this order: precision, recall,
        F-measure, Matthews correlation, false negative rate, false
        positive rate, true positive rate, true negative rate, area under
        ROC.
    """
    filename = '%s_%s_%s_all.csv' % (resultFile, type, num)

    loader = CSVLoader()
    loader.setSource(File(filename))
    data = loader.getDataSet()
    data.setClassIndex(data.numAttributes() - 1)

    rf = RF()
    rf.setNumTrees(numtrees)

    # Prediction printer; its buffer is filled during evaluation but is not
    # read again in this function.
    predictions = StringBuffer()
    output = PlainText()
    output.setHeader(data)
    output.setBuffer(predictions)
    output.setOutputDistribution(True)

    attRange = Range()  # attributes to output
    outputDistributions = Boolean(True)
    evaluator = Evaluation(data)

    # Fixed seed (1) so the fold assignment is the same on every run.
    evaluator.crossValidateModel(
        rf, data, 10, Random(1), [output, attRange, outputDistributions])

    print(evaluator.toSummaryString())
    print(evaluator.toClassDetailsString())
    print(evaluator.toMatrixString())
    return [
        evaluator.weightedPrecision(),
        evaluator.weightedRecall(),
        evaluator.weightedFMeasure(),
        evaluator.weightedMatthewsCorrelation(),
        evaluator.weightedFalseNegativeRate(),
        evaluator.weightedFalsePositiveRate(),
        evaluator.weightedTruePositiveRate(),
        evaluator.weightedTrueNegativeRate(),
        evaluator.weightedAreaUnderROC(),
    ]
Esempio n. 3
0
def evaluate_dataset(classifier,data):
    """Evaluate ``classifier`` on ``data`` and return the Evaluation object.

    ``data`` serves both as the header for the Evaluation and as the set
    the classifier is evaluated on. Predictions are printed into an
    in-memory buffer that this function does not return.
    """
    # Plain-text prediction printer backed by a throw-away buffer.
    printer = PlainText()
    printer.setHeader(data)
    printer.setBuffer(StringBuffer())

    result = Evaluation(data)
    result.evaluateModel(classifier, data, [printer])
    return result
# Base names (no directory, no extension) of this script and of the data file
# given as first command-line argument; used to name the log and graph files.
script_base = str(os.path.splitext(os.path.basename(__file__))[0])
data_base = str(os.path.splitext(os.path.basename(sys.argv[1]))[0])

logfile = "logs/" + script_base + "_" + data_base + "_tunable.log"
log=open(logfile, 'w', bufsize) # open general log file

# Sweep the J48 confidence factor over 0.0, 0.05, ..., 0.45 using the full
# dataset for both training and evaluation.
data.setClassIndex(data.numAttributes() - 1)
for num in [step * 0.05 for step in range(0, 10)]:
   log.write("---------------------------------\nCF: " + str(num) + "\n")
   if num <= 0:
      # The pruning confidence has to be strictly positive (J48's option
      # parser rejects values <= 0), so the first sweep value cannot be used
      # now that the factor is really applied to the tree.
      log.write("Skipped: confidence factor must be greater than zero\n")
      continue
   algo = J48()
   # Must be set BEFORE building: the factor steers pruning while the tree is
   # constructed, so setting it afterwards leaves the built model unchanged.
   algo.setConfidenceFactor(num)
   start = time.time()
   algo.buildClassifier(data)
   log.write("Time to build classifier: " + str(time.time() - start) + "\n")
   evaluation = Evaluation(data)
   output = PlainText()  # plain text output for predictions
   output.setHeader(data)
   buffer = StringBuffer() # buffer to use
   output.setBuffer(buffer)
   attRange = Range()                  # no additional attributes output
   outputDistribution = Boolean(False) # we don't want distribution
   start = time.time()
   evaluation.evaluateModel(algo, data, [output, attRange, outputDistribution])
   log.write("Time to evaluate model: " + str(time.time() - start) + "\n")
   log.write(evaluation.toSummaryString())
   # One CSV row per confidence factor: "<CF>,<root mean squared error>".
   file.write(str(num) + "," + str(evaluation.rootMeanSquaredError()) + "\n")
   # create graph
   # NOTE(review): the .dot file is only opened here; nothing visible in this
   # fragment writes to it or closes it.
   graphfilename = "image/" + script_base + "_" + data_base + "_" + str(num) + ".dot"
   graphfile = open(graphfilename, 'wb')
Esempio n. 5
0
# Train a J48 tree on an ARFF file and print the model together with its
# evaluation on the same (training) data.

# check commandline parameters: exactly one argument, the ARFF file
if len(sys.argv) != 2:
    print("Usage: UsingJ48Ext.py <ARFF-file>")
    sys.exit(1)  # non-zero status so callers can detect the usage error

# load data file
print("Loading data...")
reader = FileReader(sys.argv[1])
try:
    data = Instances(reader)
finally:
    # Release the file handle once the instances have been read (or on error).
    reader.close()

# set the class Index - the index of the dependent variable
data.setClassIndex(data.numAttributes() - 1)

# create the model
evaluation = Evaluation(data)
output = PlainText()  # plain text output for predictions
output.setHeader(data)
buffer = StringBuffer()  # buffer to use
output.setBuffer(buffer)
attRange = Range()  # no additional attributes output
outputDistribution = Boolean(False)  # we don't want distribution
j48 = J48()
j48.buildClassifier(data)  # only a trained classifier can be evaluated
evaluation.evaluateModel(j48, data, [output, attRange, outputDistribution])

# print out the built model
print("--> Generated model:\n")
print(j48)

print("--> Evaluation:\n")
print(evaluation.toSummaryString())
Esempio n. 6
0
# Learning-curve experiment: train J48 on growing prefixes of the training
# set and record the percentage of incorrectly classified instances on the
# test set and on the full training set (one CSV row per prefix size).
filelimit=open(datafilelimit, 'w', bufsize)
filelimit.write("instances,pctincorrecttest,pctincorrecttrain\n")
logfile = "logs/" + classifiername + "_" + dataname + crossvalidate + ".log"
log=open(logfile, 'w', bufsize) # open general log file

total_instances = fulltrainset.numInstances()
# At least 1: with fewer instances than requested data points the integer
# division yields 0, which range() rejects as a step.
step = max(1, total_instances // int(p['j48.numdatapoints']))

for num in range(int(p['j48.initial']), total_instances, step):
   filelimit.write(str(num))
   trainset = Instances(fulltrainset,0,num)   # create training set
   trainset.setClassIndex(trainset.numAttributes() - 1)
   log.write("---------------------------------\nTraining Set Size: " + str(trainset.numInstances()) + ", Test Set Size: " + str(testset.numInstances()) + ", Full data set size: " + str(fulltrainset.numInstances()) + "\n")
   # Build once per training-set size; the same model is then evaluated on
   # both data sets. The confidence factor has to be set BEFORE building,
   # otherwise it does not influence the pruning of the built tree.
   algo = J48()
   algo.setConfidenceFactor(float(p['j48.C']))
   algo.buildClassifier(trainset)
   # Order matches the CSV header: test set first, then full training set.
   for dataset in [testset, fulltrainset]:
       evaluation = Evaluation(trainset)
       output = PlainText()  # plain text output for predictions
       output.setHeader(trainset)
       buffer = StringBuffer() # buffer to use
       output.setBuffer(buffer)
       attRange = Range()                  # no additional attributes output
       outputDistribution = Boolean(False) # we don't want distribution
       start = time.time()
       if (int(crossvalidate)):
           evaluation.crossValidateModel(algo, dataset, 10, rand, [output, attRange, outputDistribution])
       else:
           evaluation.evaluateModel(algo, dataset, [output, attRange, outputDistribution])
       log.write("Time to evaluate model: " + str(time.time() - start) + "\n")
       log.write(evaluation.toSummaryString())
       filelimit.write("," + str(evaluation.pctIncorrect()))
   filelimit.write("\n")
filelimit.close()