Example #1
    def learn(self, classAttr, unpruned=False, minNumObj=2):
        # Train a J48 decision tree on the given class attribute.
        self.instances = self._getInstances(classAttr)
        j48 = J48()
        j48.setUnpruned(unpruned)
        j48.setMinNumObj(minNumObj)
        #j48.setConfidenceFactor(1.0)
        j48.buildClassifier(self.instances)
        self.classifier = j48
        print j48
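
These snippets appear to be Jython code driving the Weka Java API, which is why Python 2 print statements and Java classes mix freely. A minimal sketch of the imports this first example assumes, with weka.jar on the Jython classpath (standard Weka package locations):

# Hedged sketch: imports assumed by the snippet above (run with
# jython -J-cp weka.jar script.py, or equivalent).
from weka.classifiers.trees import J48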
Example #2
    def learn(self, classAttr, unpruned=False, minNumObj=2):
        self.instances = self._getInstances(classAttr)
        j48 = J48()
        j48.setUnpruned(unpruned)
        j48.setMinNumObj(minNumObj)
        classifier = Bagging()  # could swap in RandomForest()
        classifier.setClassifier(j48)
        classifier.buildClassifier(self.instances)
        self.classifier = classifier

        print classifier
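
Bagging exposes its ensemble size and sampling rate as properties that must be set before buildClassifier is called; a hedged example using the standard Weka Bagging setters:

# Hedged example: tune the ensemble before building it.
from weka.classifiers.meta import Bagging
classifier = Bagging()
classifier.setNumIterations(25)    # number of bagged models (Weka default: 10)
classifier.setBagSizePercent(100)  # bag size as a percent of the training set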
Example #3
    def learn(self, classAttr, unpruned=False, minNumObj=2):
        self.instances = self._getInstances(classAttr)
        tree = J48()  # could also be DecisionStump()

        classifier = AdaBoostM1()
        #classifier.setDebug(True)
        classifier.setClassifier(tree)
        #classifier.setNumIterations(50)

        #tree.setConfidenceFactor(1.0)
        classifier.buildClassifier(self.instances)
        self.classifier = classifier

        print "numIterations", classifier.getNumIterations()
        print classifier
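
The numIterations printed above is Weka's default of 10 unless set explicitly (note the commented-out setNumIterations call). A hedged example of the two main boosting knobs, both standard AdaBoostM1 setters:

# Hedged example: common AdaBoostM1 options.
from weka.classifiers.meta import AdaBoostM1
booster = AdaBoostM1()
booster.setNumIterations(50)     # boosting rounds (Weka default: 10)
booster.setUseResampling(False)  # reweight instances rather than resample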
Example #4
# load data file
print "Loading data..."
file = FileReader(sys.argv[1])
data = Instances(file)

# set the class Index - the index of the dependent variable
data.setClassIndex(data.numAttributes() - 1)

# set up the evaluation and prediction output
evaluation = Evaluation(data)
output = PlainText()  # plain text output for predictions
output.setHeader(data)
buffer = StringBuffer()  # buffer to use
output.setBuffer(buffer)
attRange = Range()  # no additional attributes output
outputDistribution = Boolean(False)  # we don't want distribution
j48 = J48()
j48.buildClassifier(data)  # only a trained classifier can be evaluated
evaluation.evaluateModel(j48, data, [output, attRange, outputDistribution])

# print out the built model
print "--> Generated model:\n"
print j48

print "--> Evaluation:\n"
print evaluation.toSummaryString()

print "--> Predictions:\n"
print buffer
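
Evaluating on the same data the tree was trained on, as above, yields an optimistic estimate. A hedged sketch of the 10-fold cross-validation variant, using Evaluation.crossValidateModel and java.util.Random as in the later examples:

# Hedged sketch: 10-fold cross-validation instead of training-set evaluation.
from java.util import Random
cv = Evaluation(data)
cv.crossValidateModel(J48(), data, 10, Random(1))
print cv.toSummaryString()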
Example #5
# check commandline parameters
if len(sys.argv) != 2:
    print "Usage: supervised.py <ARFF-file>"
    sys.exit()

# load data file
print "Loading data..."
file = FileReader(sys.argv[1])
data = Instances(file)

# set the class Index - the index of the dependent variable
data.setClassIndex(data.numAttributes() - 1)

# define the algorithms to be used.
algo_list = [(NaiveBayes(), 'NaiveBayes'), (BayesNet(), 'BayesNet'),
             (J48(), 'J48'), (JRip(), 'JRip'), (KStar(), 'KStar'),
             (RandomForest(), 'RandomForest'), (AdaBoostM1(), 'AdaBoostM1'),
             (MultilayerPerceptron(), 'MultilayerPerceptron'),
             (LibSVM(), 'LibSVM')]
algo_dict = dict([(x[1], x[0]) for x in algo_list])
algo_keys = [
    'NaiveBayes', 'J48', 'BayesNet', 'JRip', 'RandomForest', 'KStar',
    'AdaBoostM1', 'LibSVM', 'MultilayerPerceptron'
]

# example to set kernel type on LibSVM.  Default is 2
#algo = algo_dict['LibSVM']
#tag = SelectedTag("1",algo.TAGS_KERNELTYPE)  # 0 = linear, 1 = polynomial, 2 = radial basis function, 3 = sigmoid
#algo.setKernelType(tag)

# train classifiers but filter out the name column first
bufsize = 0

datafile = "data/plot/" + str(os.path.splitext(os.path.basename(__file__))[0]) + "_" + \
   str(os.path.splitext(os.path.basename(sys.argv[1]))[0]) + "_rmse.csv"
file = open(datafile, 'w', bufsize)
file.write("cf,rmse\n")

logfile = "logs/" + str(os.path.splitext(os.path.basename(__file__))[0]) + "_" + \
   str(os.path.splitext(os.path.basename(sys.argv[1]))[0]) + "_tunable.log"
log = open(logfile, 'w', bufsize)  # open general log file

# loop for different values of x using full dataset
data.setClassIndex(data.numAttributes() - 1)
for num in [x * 0.05 for x in range(0, 10)]:
    log.write("---------------------------------\nCF: " + str(num) + "\n")
    algo = J48()
    algo.setConfidenceFactor(num)  # must be set before building the tree
    x = time.time()
    algo.buildClassifier(data)
    log.write("Time to build classifier: " + str(time.time() - x) + "\n")
    evaluation = Evaluation(data)
    output = PlainText()  # plain text output for predictions
    output.setHeader(data)
    buffer = StringBuffer()  # buffer to use
    output.setBuffer(buffer)
    attRange = Range()  # no additional attributes output
    outputDistribution = Boolean(False)  # we don't want distribution
    x = time.time()
    evaluation.evaluateModel(algo, data,
                             [output, attRange, outputDistribution])
    #evaluation.crossValidateModel(algo, data, 10, rand, [output, attRange, outputDistribution])
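    # Hedged completion (assumption): the snippet is cut off here; the loop
    # presumably finishes by logging the evaluation time and writing one
    # "cf,rmse" row to match the header above. rootMeanSquaredError() is the
    # standard weka.classifiers.Evaluation accessor.
    log.write("Time to evaluate: " + str(time.time() - x) + "\n")
    file.write(str(num) + "," + str(evaluation.rootMeanSquaredError()) + "\n")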
Example #7
filelimit.write("instances,pctincorrecttest,pctincorrecttrain\n")
logfile = "logs/" + classifiername + "_" + dataname + crossvalidate + ".log"
log = open(logfile, 'w', bufsize)  # open general log file
timefilename = "data/plot/" + classifiername + "_" + dataname + crossvalidate + "_traintime.csv"
timefile = open(timefilename, 'w', bufsize)
timefile.write("instances,timetest,timetrain\n")

step = fulltrainset.numInstances() / int(p['adaboost.numdatapoints'])
for num in range(int(p['adaboost.initial']), fulltrainset.numInstances(), step):
    trainset = Instances(fulltrainset, 0, num)  # create training set
    trainset.setClassIndex(trainset.numAttributes() - 1)
    log.write("---------------------------------\nTraining Set Size: " +
              str(trainset.numInstances()) + ", Test Set Size: " +
              str(testset.numInstances()) + ", Full data set size: " +
              str(fulltrainset.numInstances()) + "\n")
    filelimit.write(str(num))
    timefile.write(str(num))
    for dataset in [testset, fulltrainset]:
        algo = AdaBoostM1()
        weaklearner = J48()
        algo.setClassifier(weaklearner)
        algo.setNumIterations(int(p['adaboost.iterations']))
        x = time.time()
        algo.buildClassifier(trainset)
        evaluation = Evaluation(trainset)
        timefile.write("," + str(time.time() - x))
        output = PlainText()  # plain text output for predictions
        output.setHeader(trainset)
        buffer = StringBuffer()  # buffer to use
        output.setBuffer(buffer)
        attRange = Range()  # no additional attributes output
        outputDistribution = Boolean(False)  # we don't want distribution
        x = time.time()
        if int(crossvalidate):
            evaluation.crossValidateModel(algo, dataset, 10, rand,
                                          [output, attRange, outputDistribution])
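        # Hedged completion (assumption): the snippet is cut off here; the
        # non-cross-validated branch presumably evaluates on the held-out
        # dataset, and each row then records the error. pctIncorrect() is the
        # standard weka.classifiers.Evaluation accessor matching the
        # "pctincorrecttest,pctincorrecttrain" header written above.
        else:
            evaluation.evaluateModel(algo, dataset,
                                     [output, attRange, outputDistribution])
        filelimit.write("," + str(evaluation.pctIncorrect()))
    filelimit.write("\n")
    timefile.write("\n")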
Example #8
# check commandline parameters
if len(sys.argv) != 2:
    print "Usage: supervised.py <ARFF-file>"
    sys.exit()

# load data file
print "Loading data..."
file = FileReader(sys.argv[1])
data = Instances(file)

# set the class Index - the index of the dependent variable
data.setClassIndex(data.numAttributes() - 1)

# define the algorithms to be used.
algo_list = [(NaiveBayes(), 'NaiveBayes'), (BayesNet(), 'BayesNet'),
             (J48(), 'J48'), (JRip(), 'JRip'), (KStar(), 'KStar'),
             (RandomForest(), 'RandomForest'), (AdaBoostM1(), 'AdaBoostM1'),
             (MultilayerPerceptron(), 'MultilayerPerceptron'),
             (LibSVM(), 'LibSVM')]
algo_dict = dict([(x[1], x[0]) for x in algo_list])
algo_keys = [
    'NaiveBayes', 'J48', 'BayesNet', 'JRip', 'RandomForest', 'KStar',
    'AdaBoostM1', 'LibSVM', 'MultilayerPerceptron'
]

# example to set kernel type on LibSVM.  Default is 2
algo = algo_dict['LibSVM']
tag = SelectedTag("1", algo.TAGS_KERNELTYPE)  # 0=linear, 1=polynomial, 2=RBF, 3=sigmoid
algo.setKernelType(tag)

# train classifiers
print "Training classifiers..."
for key in algo_keys:
    algo = algo_dict[key]
    algo.buildClassifier(data)
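    # Hedged addition (assumption): score each classifier after building it;
    # evaluateModel() and pctCorrect() are standard weka.classifiers.Evaluation
    # APIs, mirrored from Example #4's usage (empty list = no prediction output).
    evaluation = Evaluation(data)
    evaluation.evaluateModel(algo, data, [])
    print key, "accuracy:", str(evaluation.pctCorrect()) + "%"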