Example 1
fulltrainset = Instances(trainfile)
fulltrainset.setClassIndex(fulltrainset.numAttributes() - 1)
testset = Instances(testfile)
testset.setClassIndex(testset.numAttributes() - 1)

# open output files
bufsize = 0  # unbuffered, so results are flushed to disk immediately
classifiername = str(os.path.splitext(os.path.basename(__file__))[0])
dataname = str(os.path.splitext(os.path.basename(sys.argv[1]))[0])
datafilelimit = "data/plot/" + classifiername + "_" + dataname + crossvalidate + "_instances.csv"
filelimit=open(datafilelimit, 'w', bufsize)
filelimit.write("instances,pctincorrecttest,pctincorrecttrain\n")
logfile = "logs/" + classifiername + "_" + dataname + crossvalidate + ".log"
log=open(logfile, 'w', bufsize) # open general log file

for num in range(int(p['j48.initial']), fulltrainset.numInstances(),
                 (fulltrainset.numInstances() / int(p['j48.numdatapoints']))):
    filelimit.write(str(num))
    trainset = Instances(fulltrainset, 0, num)  # training set: the first num instances
    trainset.setClassIndex(trainset.numAttributes() - 1)
    log.write("---------------------------------\nTraining Set Size: " + str(trainset.numInstances()) + ", Test Set Size: " + str(testset.numInstances()) + ", Full data set size: " + str(fulltrainset.numInstances()) + "\n")
    for dataset in [testset, fulltrainset]:
        algo = J48()
        algo.setConfidenceFactor(float(p['j48.C']))  # set the pruning confidence before training
        algo.buildClassifier(trainset)
        evaluation = Evaluation(trainset)
        output = PlainText()  # plain text output for predictions
        output.setHeader(trainset)
        buffer = StringBuffer()  # buffer to hold the predictions
        output.setBuffer(buffer)
        attRange = Range()  # no additional attributes output
        outputDistribution = Boolean(False)  # we don't want the distribution
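The snippet breaks off before the evaluation itself. A minimal sketch of how the inner loop typically finishes, assuming Weka's standard Jython evaluation idiom and the two CSV columns declared in the header written above:

        # assumed continuation, not part of the original listing
        evaluation.evaluateModel(algo, dataset, [output, attRange, outputDistribution])
        filelimit.write("," + str(evaluation.pctIncorrect()))  # error rate on this dataset
        log.write(evaluation.toSummaryString())
    filelimit.write("\n")  # close the CSV row for this training-set size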
Example 2
    print "Usage: supervised.py <ARFF-file>"
    sys.exit()

# load data file
print "Loading data..."
datafile = FileReader(sys.argv[1])
data = Instances(datafile)
rand = Random()              # seed from the system time
data.randomize(rand)         # shuffle the data using the seeded random number generator

# open output files
bufsize=0
dataname = str(os.path.splitext(os.path.basename(sys.argv[1]))[0])

# loop for different amounts of data with fixed test set
datasize = data.numInstances()
limit = (datasize*2)/3   # loop until we use 2/3 data as training set
testset = Instances(data,limit,datasize-limit)   # create test set from the last 1/3 of the data
testset.setClassIndex(testset.numAttributes() - 1)

saver = ArffSaver()
saver.setInstances(testset)
testsetfile = "./data/split/" + dataname + "-" + "test.arff"
file = File(testsetfile)
saver.setFile(file)
saver.writeBatch()

trainset = Instances(data,0,limit)   # create training set
saver = ArffSaver()
saver.setInstances(trainset)
trainsetfile = "./data/split/" + dataname + "-" + "train.arff"
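The listing is cut off before the training set is actually written. By symmetry with the test-set block above, it presumably finishes like this:

saver.setFile(File(trainsetfile))  # assumed continuation, mirroring the test-set save
saver.writeBatch()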
Example 3
testset = Instances(testfile)
testset.setClassIndex(testset.numAttributes() - 1)

# open output files
bufsize = 0
classifiername = str(os.path.splitext(os.path.basename(__file__))[0])
dataname = str(os.path.splitext(os.path.basename(sys.argv[1]))[0])
datafilelimit = "data/plot/" + classifiername + "_" + dataname + crossvalidate + "_instances.csv"
filelimit = open(datafilelimit, 'w', bufsize)
filelimit.write(
    "instances,lineartest,lineartrain,polytest,polytrain,radialtest,radialtrain,sigmoidtest,sigmoidtrain\n"
)
logfile = "logs/" + classifiername + "_" + dataname + crossvalidate + ".log"
log = open(logfile, 'w', bufsize)  # open general log file

for num in range(int(p['svm.initial']), fulltrainset.numInstances(),
                 (fulltrainset.numInstances() / int(p['svm.numdatapoints']))):
    trainset = Instances(fulltrainset, 0, num)  # create training set
    trainset.setClassIndex(trainset.numAttributes() - 1)

    filelimit.write(str(num))
    for kerneltype in range(0, 4):
        log.write("---------------------------------\nTraining Set Size: " +
                  str(trainset.numInstances()) + ", Test Set Size: " +
                  str(testset.numInstances()) + ", Full data set size: " +
                  str(fulltrainset.numInstances()) + "\n")
        for dataset in [testset, fulltrainset]:
            algo = LibSVM()
            tag = SelectedTag(
                str(kerneltype), algo.TAGS_KERNELTYPE
            )  # 0 = linear, 1 = polynomial, 2 = radial basis function, 3 = sigmoid
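The snippet stops right after the tag is created. LibSVM's setKernelType() accepts this SelectedTag, so the loop body presumably continues along these lines (a sketch, not the original code):

            algo.setKernelType(tag)  # assumed continuation: apply the selected kernel
            algo.buildClassifier(trainset)
            evaluation = Evaluation(trainset)
            evaluation.evaluateModel(algo, dataset)
            filelimit.write("," + str(evaluation.pctIncorrect()))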
Example 4

testset.setClassIndex(testset.numAttributes() - 1)

# open output files
bufsize = 0
classifiername = str(os.path.splitext(os.path.basename(__file__))[0])
dataname = str(os.path.splitext(os.path.basename(sys.argv[1]))[0])
datafilelimit = "data/plot/" + classifiername + "_" + dataname + crossvalidate + "_instances.csv"
filelimit = open(datafilelimit, 'w', bufsize)
filelimit.write("instances,pctincorrecttest,pctincorrecttrain\n")
logfile = "logs/" + classifiername + "_" + dataname + crossvalidate + ".log"
log = open(logfile, 'w', bufsize)  # open general log file
timefilename = "data/plot/" + classifiername + "_" + dataname + crossvalidate + "_traintime.csv"
timefile = open(timefilename, 'w', bufsize)
timefile.write("instances,timetest,timetrain\n")

for num in range(int(p['mlp.initial']), fulltrainset.numInstances(),
                 (fulltrainset.numInstances() / int(p['mlp.numdatapoints']))):
    trainset = Instances(fulltrainset, 0, num)  # create training set
    trainset.setClassIndex(trainset.numAttributes() - 1)
    log.write("---------------------------------\nTraining Set Size: " +
              str(trainset.numInstances()) + ", Test Set Size: " +
              str(testset.numInstances()) + ", Full data set size: " +
              str(fulltrainset.numInstances()) + "\n")

    filelimit.write(str(trainset.numInstances()))
    timefile.write(str(num))
    for dataset in [testset, fulltrainset]:
        algo = MultilayerPerceptron()
        algo.setTrainingTime(int(p['mlp.N']))
        x = time.time()
        algo.buildClassifier(trainset)
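The listing is truncated just after the timed call to buildClassifier(). Example 5 below shows the same bookkeeping in full, so the continuation is presumably:

        evaluation = Evaluation(trainset)
        timefile.write("," + str(time.time() - x))  # assumed continuation: training time in seconds
        evaluation.evaluateModel(algo, dataset)
        filelimit.write("," + str(evaluation.pctIncorrect()))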
Example 5
testset.setClassIndex(testset.numAttributes() - 1)

# open output files
bufsize=0
classifiername = str(os.path.splitext(os.path.basename(__file__))[0])
dataname = str(os.path.splitext(os.path.basename(sys.argv[1]))[0])
datafilelimit = "data/plot/" + classifiername + "_" + dataname + crossvalidate + "_instances.csv"
filelimit=open(datafilelimit, 'w', bufsize)
filelimit.write("instances,pctincorrecttest,pctincorrecttrain\n")
logfile = "logs/" + classifiername + "_" + dataname + crossvalidate + ".log"
log=open(logfile, 'w', bufsize) # open general log file
timefilename = "data/plot/" + classifiername + "_" + dataname + crossvalidate + "_traintime.csv"
timefile = open(timefilename, 'w', bufsize)
timefile.write("instances,timetest,timetrain\n")

for num in range(int(p['adaboost.initial']), fulltrainset.numInstances(),
                 (fulltrainset.numInstances() / int(p['adaboost.numdatapoints']))):
    trainset = Instances(fulltrainset, 0, num)  # create training set
    trainset.setClassIndex(trainset.numAttributes() - 1)
    log.write("---------------------------------\nTraining Set Size: " + str(trainset.numInstances()) + ", Test Set Size: " + str(testset.numInstances()) + ", Full data set size: " + str(fulltrainset.numInstances()) + "\n")
    filelimit.write(str(num))
    timefile.write(str(num))
    for dataset in [testset, fulltrainset]:
        algo = AdaBoostM1()
        weaklearner = J48()
        algo.setClassifier(weaklearner)  # boost J48 as the weak learner
        algo.setNumIterations(int(p['adaboost.iterations']))
        x = time.time()
        algo.buildClassifier(trainset)
        evaluation = Evaluation(trainset)
        timefile.write("," + str(time.time() - x))  # training time in seconds
        output = PlainText()  # plain text output for predictions
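From here the snippet presumably follows the same prediction-output and evaluation pattern as Example 1: set up the PlainText output with a StringBuffer and Range, call evaluateModel(), and write pctIncorrect() to the CSV.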
Example 6
    print "Usage: supervised.py <ARFF-file>"
    sys.exit()

# load data file
print "Loading data..."
datafile = FileReader(sys.argv[1])
data = Instances(datafile)
rand = Random()  # seed from the system time
data.randomize(rand)  # shuffle the data using the seeded random number generator

# open output files
bufsize = 0
dataname = str(os.path.splitext(os.path.basename(sys.argv[1]))[0])

# loop for different amounts of data with fixed test set
datasize = data.numInstances()
limit = (datasize * 2) / 3  # loop until we use 2/3 data as training set
testset = Instances(data, limit, datasize - limit)  # create test set from the last 1/3 of the data
testset.setClassIndex(testset.numAttributes() - 1)

saver = ArffSaver()
saver.setInstances(testset)
testsetfile = "./data/split/" + dataname + "-" + "test.arff"
file = File(testsetfile)
saver.setFile(file)
saver.writeBatch()

trainset = Instances(data, 0, limit)  # create training set
saver = ArffSaver()
saver.setInstances(trainset)
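The remainder is cut off; as in Example 2, it presumably continues by pointing the saver at "./data/split/" + dataname + "-train.arff" and calling writeBatch().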