# Imports assumed by the Jython/Weka snippets in this file (Weka must be on
# the classpath; resultFile, PreprocessData and inputList are defined
# elsewhere in the original scripts):
import sys
import os
from java.io import File, FileReader
from java.lang import StringBuffer, Boolean, String
from java.util import Random, Properties
from weka.core import Instances, Range
from weka.core.converters import CSVLoader, ArffSaver
from weka.classifiers import Evaluation
from weka.classifiers.trees import J48, RandomForest as RF
from weka.classifiers.evaluation.output.prediction import PlainText
from weka.filters import Filter
from weka.filters.unsupervised.attribute import Remove as AttributeRemove

def readFeature(num_features, type, select_feature, numtrees):
    #filename1=resultFileTest
    #filename2=resultFileTest2
    filename1=resultFile+'_'+type+'_'+num_features+'_'+select_feature+'_train.csv'
    filename2=resultFile+'_'+type+'_'+num_features+'_'+select_feature+'_test.csv'
    #print filename1
    loader=CSVLoader()
    loader.setSource(File(filename1))
    data=loader.getDataSet()
    #print data.numAttributes()    
    
    data.setClassIndex(data.numAttributes()-1)

    rf=RF()
    rf.setNumTrees(numtrees)
    
    rf.buildClassifier(data)
   
    #print rf
    loader.setSource(File(filename2))
    

    test_data=Instances(loader.getDataSet())
    
    test_data.setClassIndex(test_data.numAttributes()-1)

    
    ''' Manual per-instance inspection, kept for reference:
    num = test_data.numInstances()
    print num
    for i in xrange(num):
        r1 = rf.distributionForInstance(test_data.instance(i))
        r2 = rf.classifyInstance(test_data.instance(i))
        print r1
        print r2 '''
    buffer = StringBuffer()  # buffer for the predictions
    output=PlainText()
    output.setHeader(test_data)
    output.setBuffer(buffer)
    
    attRange = Range()  # attributes to output
    outputDistribution = Boolean(True)
    evaluator=Evaluation(data)
    evaluator.evaluateModel(rf,test_data,[output,attRange,outputDistribution])
    #print evaluator.evaluateModel(RF(),['-t',filename1,'-T',filename2,'-I',str(numtrees)])
    #evaluator1=Evaluation(test_data)
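    # Note (assumption about the Weka prediction-output API): the PlainText
    # object configured above streams each test instance's prediction into
    # the StringBuffer, so the raw predictions can be dumped after evaluation:
    #
    #   print buffer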
    print evaluator.toSummaryString()
    print evaluator.toClassDetailsString()
    print evaluator.toMatrixString()
    return [evaluator.precision(1), evaluator.recall(1), evaluator.fMeasure(1),
            evaluator.matthewsCorrelationCoefficient(1),
            evaluator.numTruePositives(1), evaluator.numFalsePositives(1),
            evaluator.numTrueNegatives(1), evaluator.numFalseNegatives(1),
            evaluator.areaUnderROC(1)]
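
# A hypothetical call sketch for readFeature (the argument values below are
# made up; resultFile must point at existing train/test CSVs):
#
#   metrics = readFeature('50', 'gene', 'infogain', 100)
#   print metrics  # [precision, recall, F-measure, MCC, TP, FP, TN, FN, AUC]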

def load_arff(arff):
    # read an ARFF file and treat the last attribute as the class
    file = FileReader(arff)

    # alternative readers:
    #fis = FileInputStream(arff)
    #file = InputStreamReader(fis, "UTF-8")
    #fr = FileReader(arff)
    #file = BufferedReader(fr)

    data = Instances(file)
    data.setClassIndex(data.numAttributes() - 1)
    return data
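
# Example use of load_arff (the path is hypothetical):
#
#   data = load_arff('mydata.arff')
#   print data.numInstances()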
(see the weka.classifiers.Evaluation class)

"""

# check commandline parameters
if (not (len(sys.argv) == 2)):
    print "Usage: UsingJ48Ext.py <ARFF-file>"
    sys.exit()

# load data file
print "Loading data..."
file = FileReader(sys.argv[1])
data = Instances(file)

# set the class Index - the index of the dependent variable
data.setClassIndex(data.numAttributes() - 1)

# create the model
evaluation = Evaluation(data)
buffer = StringBuffer()  # buffer for the predictions
attRange = Range()  # no additional attributes output
outputDistribution = Boolean(False)  # we don't want distribution
j48 = J48()
j48.buildClassifier(data)  # only a trained classifier can be evaluated
evaluation.evaluateModel(j48, data, [buffer, attRange, outputDistribution])
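
# The three-element list above is the old-style Weka prediction-printing
# option set: the StringBuffer collects per-instance predictions, the Range
# selects extra attributes to echo, and the Boolean toggles printing of the
# class distribution. The collected predictions could then be shown with:
#
#   print buffer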

# print out the built model
print "--> Generated model:\n"
print j48
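
# If cross-validated estimates are wanted instead of evaluating on the
# training data, Weka's Evaluation class supports it directly; a minimal
# sketch (fold count and seed are arbitrary choices):
#
#   cv = Evaluation(data)
#   cv.crossValidateModel(j48, data, 10, Random(1))
#   print cv.toSummaryString()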

print "--> Evaluation:\n"
    sys.exit()
crossvalidate = sys.argv[2]
rand = Random()              # seed from the system time

# load properties
p = Properties()
p.load(open('./ml.properties'))
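
# ml.properties is expected to supply the learning-curve parameters read in
# the loop below; illustrative contents (the values are made up):
#
#   j48.initial=100
#   j48.numdatapoints=20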

# load data file
print "Loading data..."
trainfile = FileReader(sys.argv[1] + "-train.arff")
print "Loading " + sys.argv[1] + "-train.arff"
testfile = FileReader(sys.argv[1] + "-test.arff")
print "Loading " + sys.argv[1] + "-test.arff"
fulltrainset = Instances(trainfile)
fulltrainset.setClassIndex(fulltrainset.numAttributes() - 1)
testset = Instances(testfile)
testset.setClassIndex(testset.numAttributes() - 1)

# open output files
bufsize=0
classifiername = str(os.path.splitext(os.path.basename(__file__))[0])
dataname = str(os.path.splitext(os.path.basename(sys.argv[1]))[0])
datafilelimit = "data/plot/" + classifiername + "_" + dataname + crossvalidate + "_instances.csv"
filelimit=open(datafilelimit, 'w', bufsize)
filelimit.write("instances,pctincorrecttest,pctincorrecttrain\n")
logfile = "logs/" + classifiername + "_" + dataname + crossvalidate + ".log"
log=open(logfile, 'w', bufsize) # open general log file

for num in range(int(p['j48.initial']), fulltrainset.numInstances(),
                 fulltrainset.numInstances() / int(p['j48.numdatapoints'])):
    filelimit.write(str(num))

    if (numReqOpt < 2):
        usage()
        return 1

    options = {'idFlag': True, 'weightFlag': False, 'rmClassFlag': False, 'rmClass': 0}
    # read the first dataset
    fn = inputList[0]
    fid = FileReader(fn)
    Data = Instances(fid)
    Data, IDs = PreprocessData(Data, options)
    # remove the class label (the last attribute) from the dataset
    attributeremove = AttributeRemove()
    attributeremove.setInvertSelection(Boolean(False))
    attributeremove.setAttributeIndices(String(str(Data.numAttributes())))
    attributeremove.setInputFormat(Data)
    newData = Filter.useFilter(Data, attributeremove)
    # loop over the remaining input arff files
    cnt = Data.numAttributes()
    for fnCnt in range(1, len(inputList)):
        fn = inputList[fnCnt]
        fid = FileReader(fn)
        Data = Instances(fid)
        Data, IDs = PreprocessData(Data, options)
        # keep only the last attribute, which is the class label
        attributeremove = AttributeRemove()
        attributeremove.setInvertSelection(Boolean(True))
        attributeremove.setAttributeIndices(String(str(Data.numAttributes())))
        attributeremove.setInputFormat(Data)
        labels = Filter.useFilter(Data, attributeremove)
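
    # The loop above is truncated here; the usual next step is to join each
    # file's label column onto the accumulated feature set, e.g. with Weka's
    # static helper (a sketch, using the variables from the code above):
    #
    #   newData = Instances.mergeInstances(newData, labels)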
# load data file
print "Loading data..."
datafile = FileReader(sys.argv[1])
data = Instances(datafile)
rand = Random()              # seed from the system time
data.randomize(rand)         # randomize data with number generator

# open output files
bufsize=0
dataname = str(os.path.splitext(os.path.basename(sys.argv[1]))[0])

# loop for different amounts of data with fixed test set
datasize = data.numInstances()
limit = (datasize * 2) / 3   # use the first 2/3 of the data for training
testset = Instances(data, limit, datasize - limit)   # test set: the last 1/3 of the data
testset.setClassIndex(testset.numAttributes() - 1)

saver = ArffSaver()
saver.setInstances(testset)
testsetfile = "./data/split/" + dataname + "-" + "test.arff"
file = File(testsetfile)
saver.setFile(file)
saver.writeBatch()

trainset = Instances(data,0,limit)   # create training set
saver = ArffSaver()
saver.setInstances(trainset)
trainsetfile = "./data/split/" + dataname + "-" + "train.arff"
file = File(trainsetfile)
saver.setFile(file)
saver.writeBatch()
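
# Note: Random() above seeds from the system time, so the shuffle and the
# resulting split differ between runs. For a reproducible split, pass a fixed
# seed (the value 42 is arbitrary):
#
#   rand = Random(42)
#   data.randomize(rand)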

"""
Based on this code example:

    http://www.btbytes.com/2005/11/30/weka-j48-classifier-example-using-jython/

Commandline parameter(s):

    first parameter must be the ARFF file one wants to process with J48

"""

# check commandline parameters
if (not (len(sys.argv) == 2)):
    print "Usage: UsingJ48.py <ARFF-file>"
    sys.exit()

# load data file
print "Loading data..."
file = FileReader(sys.argv[1])
data = Instances(file)

# set the class Index - the index of the dependent variable
data.setClassIndex(data.numAttributes() - 1)

# create the model
print "Training J48..."
j48 = J48()
j48.buildClassifier(data)

# print out the built model
print "Generated model:\n"
print j48
print "Loading data..."
datafile = FileReader(sys.argv[1])
data = Instances(datafile)
rand = Random()  # seed from the system time
data.randomize(rand)  # randomize data with number generator

# open output files
bufsize = 0
dataname = str(os.path.splitext(os.path.basename(sys.argv[1]))[0])

# loop for different amounts of data with fixed test set
datasize = data.numInstances()
limit = (datasize * 2) / 3  # loop until we use 2/3 data as training set
testset = Instances(data, limit, datasize -
                    limit)  # create training set using the last 1/3 of data
testset.setClassIndex(testset.numAttributes() - 1)

saver = ArffSaver()
saver.setInstances(testset)
testsetfile = "./data/split/" + dataname + "-" + "test.arff"
file = File(testsetfile)
saver.setFile(file)
saver.writeBatch()

trainset = Instances(data, 0, limit)  # create training set
saver = ArffSaver()
saver.setInstances(trainset)
trainsetfile = "./data/split/" + dataname + "-" + "train.arff"
file = File(trainsetfile)
saver.setFile(file)
saver.writeBatch()

# check commandline parameters (the usage text here is an assumption; the
# script clearly expects a dataset prefix plus a crossvalidate flag)
if (not (len(sys.argv) == 3)):
    print "Usage: <script>.py <dataset-prefix> <crossvalidate>"
    sys.exit()
crossvalidate = sys.argv[2]
rand = Random()  # seed from the system time

# load properties
p = Properties()
p.load(open('./ml.properties'))

# load data file
print "Loading data..."
trainfile = FileReader(sys.argv[1] + "-train.arff")
print "Loading " + sys.argv[1] + "-train.arff"
testfile = FileReader(sys.argv[1] + "-test.arff")
print "Loading " + sys.argv[1] + "-test.arff"
fulltrainset = Instances(trainfile)
fulltrainset.setClassIndex(fulltrainset.numAttributes() - 1)
testset = Instances(testfile)
testset.setClassIndex(testset.numAttributes() - 1)

# open output files
bufsize = 0
classifiername = str(os.path.splitext(os.path.basename(__file__))[0])
dataname = str(os.path.splitext(os.path.basename(sys.argv[1]))[0])
datafilelimit = "data/plot/" + classifiername + "_" + dataname + crossvalidate + "_instances.csv"
filelimit = open(datafilelimit, 'w', bufsize)
filelimit.write(
    "instances,letest,letrain,lmtest,lmtrain,kdtest,kdtrain,balltest,balltrain,covertest,covertrain\n"
)
logfile = "logs/" + classifiername + "_" + dataname + crossvalidate + ".log"
log = open(logfile, 'w', bufsize)  # open general log file
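
# A learning-curve loop over growing training-set sizes would typically
# follow, mirroring the truncated j48 loop earlier in this file; a sketch
# only (the property keys are assumptions):
#
#   for num in range(int(p['knn.initial']), fulltrainset.numInstances(),
#                    fulltrainset.numInstances() / int(p['knn.numdatapoints'])):
#       trainset = Instances(fulltrainset, 0, num)
#       trainset.setClassIndex(trainset.numAttributes() - 1)
#       filelimit.write(str(num))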