def createAttributes(featureVector):
    """ Builds the WEKA attribute list for one feature vector: one numeric attribute
        per feature plus a nominal "class" attribute with values "true"/"false". """
    numFeatures = len(featureVector)
    attributes = [Attribute(str(i) + " numeric") for i in range(numFeatures)]
    attributes.append(Attribute("class", ArrayList(["true", "false"])))
    return ArrayList(attributes)
def _getInstances(self, classAttr):
    # create attributes
    self.classAttr = classAttr
    attName2Obj = {}
    attVector = FastVector()
    for attName in self.numericAttributes:
        attr = Attribute(attName)
        attVector.addElement(attr)
        attName2Obj[attName] = attr
    for (attName, domain) in self.attName2Domain.iteritems():
        vDomain = FastVector(len(domain))
        for v in domain:
            #print v
            vDomain.addElement(String(str(v)))
        attr = Attribute(attName, vDomain)
        attVector.addElement(attr)
        attName2Obj[attName] = attr
    self.attName2Obj = attName2Obj
    # create Instances object
    instances = Instances("instances", attVector, len(self.instances))
    for i in self.instances:
        inst = self._makeInstance(i)
        instances.add(inst)
    instances.setClass(attName2Obj[classAttr])
    return instances
def readFeature(num_features, type, select_feature, numtrees):
    #filename1=resultFileTest
    #filename2=resultFileTest2
    filename1 = resultFile + '_' + type + '_' + num_features + '_' + select_feature + '_train.csv'
    filename2 = resultFile + '_' + type + '_' + num_features + '_' + select_feature + '_test.csv'
    #print filename1
    loader = CSVLoader()
    loader.setSource(File(filename1))
    data = loader.getDataSet()
    #print data.numAttributes()
    data.setClassIndex(data.numAttributes() - 1)
    rf = RF()
    rf.setNumTrees(numtrees)
    rf.buildClassifier(data)
    #print rf
    loader.setSource(File(filename2))
    test_data = Instances(loader.getDataSet())
    test_data.setClassIndex(test_data.numAttributes() - 1)
    '''
    num = test_data.numInstances()
    print num
    for i in xrange(num):
        r1 = rf.distributionForInstance(test_data.instance(i))
        r2 = rf.classifyInstance(test_data.instance(i))
        print r1
        print r2
    '''
    buffer = StringBuffer()  # buffer for the predictions
    output = PlainText()
    output.setHeader(test_data)
    output.setBuffer(buffer)
    attRange = Range()  # attributes to output
    outputDistribution = Boolean(True)
    evaluator = Evaluation(data)
    evaluator.evaluateModel(rf, test_data, [output, attRange, outputDistribution])
    #print evaluator.evaluateModel(RF(), ['-t', filename1, '-T', filename2, '-I', str(numtrees)])
    #evaluator1 = Evaluation(test_data)
    print evaluator.toSummaryString()
    print evaluator.toClassDetailsString()
    print evaluator.toMatrixString()
    return [evaluator.precision(1), evaluator.recall(1), evaluator.fMeasure(1),
            evaluator.matthewsCorrelationCoefficient(1), evaluator.numTruePositives(1),
            evaluator.numFalsePositives(1), evaluator.numTrueNegatives(1),
            evaluator.numFalseNegatives(1), evaluator.areaUnderROC(1)]
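# Usage sketch (assumption): resultFile is a module-level path prefix and the argument
# values are placeholders; the corresponding *_train.csv / *_test.csv files must exist.
resultFile = 'features/result'
metrics = readFeature('100', 'binary', 'chi2', 50)
# returned order: precision, recall, F-measure, MCC, TP, FP, TN, FN, AUC (class index 1)
print metrics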
def readDataFromResultsTable(attributes, rt):
    data = Instances("results", ArrayList(attributes), rt.size())
    nrOfFeatures = len(attributes)
    for i in range(0, rt.size()):
        inst = DenseInstance(nrOfFeatures)
        for j in range(0, nrOfFeatures):
            value = rt.getValue(attributes[j].name(), i)
            inst.setValue(attributes[j], value)
        data.add(inst)
    return data
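# Usage sketch (not part of the original snippet), assuming rt is an ImageJ
# ij.measure.ResultsTable whose column headings match the attribute names;
# "Area" and "Mean" are placeholder column names.
from ij.measure import ResultsTable

rt = ResultsTable.getResultsTable()                   # frontmost Results table
attributes = [Attribute("Area"), Attribute("Mean")]   # must match existing columns
data = readDataFromResultsTable(attributes, rt)
print data.numInstances()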
def cluster(algorithm, filename, options=''):
    reader = BufferedReader(FileReader(filename))
    data = Instances(reader)
    reader.close()
    cl = algorithm()
    cl.setOptions(options.split())
    cl.buildClusterer(data)
    returnData = []
    for instance in data.enumerateInstances():
        returnData.append(cl.clusterInstance(instance))
    print returnData
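# Usage sketch (not part of the original snippet): the clusterer class itself is passed,
# not an instance; SimpleKMeans, the file name and "-N 3" are assumptions for illustration.
import weka.clusterers.SimpleKMeans as SimpleKMeans

cluster(SimpleKMeans, "iris.arff", "-N 3")  # prints one cluster index per instance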
def load_arff(self, arff):
    file = FileReader(arff)
    #fis = FileInputStream(arff)
    #file = InputStreamReader(fis, "UTF-8")
    #fr = FileReader(arff)
    #file = BufferedReader(fr)
    data = Instances(file)
    data.setClassIndex(data.numAttributes() - 1)
    return data
def classify(img, classifier, class_names, ops=None, distribution_class_index=-1):
    """ img: a 2D RandomAccessibleInterval.
        classifier: a WEKA Classifier instance, like SMO or FastRandomForest, etc. Any.
                    If it's a string, interprets it as a file path and attempts to
                    deserialize a previously saved trained classifier.
        class_names: the list of names of each class to learn.
        ops: the filter bank of ImgMath ops for the img.
        distribution_class_index: defaults to -1, meaning return the class index for each
                    pixel. When larger than -1, it's interpreted as a class index, and
                    returns instead the floating-point value of each pixel in the
                    distribution of that particular class index. """
    if type(classifier) == str:
        classifier = SerializationHelper.read(classifier)
    ops = ops if ops else filterBank(img)
    attributes = ArrayList()
    for i in xrange(len(ops)):
        attributes.add(Attribute("attr-%i" % i))
    #for name in classifier.attributeNames()[0][1]:
    #    attributes.add(Attribute(name))
    attributes.add(Attribute("class", class_names))
    info = Instances("structure", attributes, 1)
    info.setClassIndex(len(attributes) - 1)
    opImgs = [compute(op).into(ArrayImgs.floats([img.dimension(0), img.dimension(1)])) for op in ops]
    cs_opImgs = Views.collapse(Views.stack(opImgs))
    result = ArrayImgs.floats([img.dimension(0), img.dimension(1)])
    cr = result.cursor()
    cop = Views.iterable(cs_opImgs).cursor()
    while cr.hasNext():
        tc = cop.next()
        vector = array((tc.get(i).getRealDouble() for i in xrange(len(opImgs))), 'd')
        vector += array([0], 'd')
        di = DenseInstance(1.0, vector)
        di.setDataset(info)  # the list of attributes
        if distribution_class_index > -1:
            cr.next().setReal(classifier.distributionForInstance(di)[distribution_class_index])
        else:
            cr.next().setReal(classifier.classifyInstance(di))
    return result
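# Usage sketch (assumption, not from the original script): img is a 2D image already
# wrapped for ImgLib2, the model path and class names are placeholders, and the result
# is shown via ImgLib2's ImageJ bridge.
from net.imglib2.img.display.imagej import ImageJFunctions as IL

class_names = ["background", "foreground"]
result = classify(img, "/path/to/classifier.model", class_names)
IL.wrap(result, "classified").show()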
def build_instances(state, dataset):
    class_attributes = ["Sunny", "Fog", "Rain", "Snow", "Hail", "Thunder", "Tornado"]
    header = ["state", "lat", "lon", "day", "temp", "dewp", "weather"]
    # build attributes based on the header and types
    attributes = []
    for h in header[:-1]:
        attributes.append(Attribute(h))
    # add the classification attribute
    classification_vector = FastVector(len(class_attributes))
    for c in class_attributes:
        classification_vector.addElement(c)
    attributes.append(Attribute("toClassify", classification_vector))
    fvWekaAttributes = FastVector(len(dataset[0]))
    for a in attributes:
        fvWekaAttributes.addElement(a)
    training_set = Instances("C4.5Set", fvWekaAttributes, len(dataset))
    training_set.setClassIndex(len(header) - 1)
    for d in dataset:
        inst = Instance(len(d))
        for i in range(len(d) - 1):
            try:
                inst.setValue(fvWekaAttributes.elementAt(i), float(d[i]))
            except:
                pass  #print "failed on", i, d[i], d[i].__class__
        inst.setValue(fvWekaAttributes.elementAt(len(d) - 1), d[-1])
        training_set.add(inst)
    j48 = J48()
    j48.buildClassifier(training_set)
    return state, parse_tree(str(j48))
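# Usage sketch (assumption): each dataset row follows the header
# ["state","lat","lon","day","temp","dewp","weather"] with the weather label last;
# parse_tree is assumed to be defined elsewhere in the original module.
rows = [
    ["WA", 47.6, -122.3, 12, 53.0, 44.0, "Rain"],
    ["AZ", 33.4, -112.0, 200, 98.0, 30.0, "Sunny"],
]
state, tree = build_instances("WA", rows)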
def createTrainingInstances(matchingExamples, mismatchingExamples):
    """ Expects the matchingExamples to be a list of feature lists,
        i.e. the feature vector is a list. """
    numFeatures = len(matchingExamples[0])
    attributes = [Attribute(str(i) + " numeric") for i in range(numFeatures)]
    attributes.append(Attribute("class", ArrayList(["true", "false"])))
    trainingData = Instances("matches", ArrayList(attributes),
                             len(matchingExamples) + len(mismatchingExamples))
    trainingData.setClassIndex(len(attributes) - 1)  # the last index
    for f in matchingExamples:
        trainingData.add(DenseInstance(1.0, f + [1]))  # 1 is True
    for f in mismatchingExamples:
        trainingData.add(DenseInstance(1.0, f + [0]))  # 0 is False
    return trainingData
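# Usage sketch (assumption): two small lists of numeric feature vectors; SMO is just
# one possible WEKA classifier to train on the returned Instances.
from weka.classifiers.functions import SMO

matching = [[0.9, 0.1], [0.8, 0.2]]      # hypothetical "match" examples
mismatching = [[0.1, 0.9], [0.2, 0.7]]   # hypothetical "mismatch" examples
trainingData = createTrainingInstances(matching, mismatching)
smo = SMO()
smo.buildClassifier(trainingData)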
def classify(classifier, matches):
    """ Expects one feature vector of numFeatures length per match.
        Returns a list of [predicted class index, distributionForInstance for the match class] pairs. """
    attributes = createAttributes(matches[0])
    instances = Instances("tests", attributes, 1)
    instances.setClassIndex(len(attributes) - 1)
    results = []
    for match in matches:
        instances.add(DenseInstance(1.0, match + [0]))
    for i in range(len(matches)):
        result = classifier.classifyInstance(instances.instance(i))
        dist = classifier.distributionForInstance(instances.instance(i))
        results.append([result, dist[1]])
    return results
def createTrainingData(img, samples, class_names, n_samples=0, ops=None):
    """ img: a 2D RandomAccessibleInterval.
        samples: a sequence of long[] (or int numeric sequence or Localizable)
                 and class_index pairs; can be a generator.
        n_samples: optional, the number of samples (in case samples is e.g. a generator).
        class_names: a list of class names, as many as different class_index.
        ops: optional, the sequence of ImgMath ops to apply to the img,
             defaults to filterBank(img).

        return an instance of WEKA Instances
    """
    ops = ops if ops else filterBank(img)
    if 0 == n_samples:
        n_samples = len(samples)
    # Define a WEKA Attribute for each feature (one per op in the filter bank, plus the class)
    attribute_names = ["attr-%i" % (i + 1) for i in xrange(len(ops))]
    attributes = ArrayList()
    for name in attribute_names:
        attributes.add(Attribute(name))
    # Add an attribute at the end for the classification classes
    attributes.add(Attribute("class", class_names))
    # Create the training data structure
    training_data = Instances("training", attributes, n_samples)
    training_data.setClassIndex(len(attributes) - 1)
    opImgs = [compute(op).into(ArrayImgs.floats([img.dimension(0), img.dimension(1)])) for op in ops]
    ra = Views.collapse(Views.stack(opImgs)).randomAccess()
    for position, class_index in samples:
        ra.setPosition(position)
        tc = ra.get()
        vector = array((tc.get(i).getRealDouble() for i in xrange(len(opImgs))), 'd')
        vector += array([class_index], 'd')
        training_data.add(DenseInstance(1.0, vector))
    return training_data
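# Usage sketch (assumption): a handful of hand-picked (pixel position, class index)
# samples and a RandomForest; the trained classifier can then be saved and reused
# with the classify() function above.
from weka.classifiers.trees import RandomForest
import weka.core.SerializationHelper as SerializationHelper

samples = [([10, 20], 0), ([200, 150], 1)]   # hypothetical (position, class_index) pairs
training_data = createTrainingData(img, samples, ["background", "foreground"])
rf = RandomForest()
rf.buildClassifier(training_data)
SerializationHelper.write("/path/to/classifier.model", rf)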
def runClassifierAlgo(algo, training_filename, test_filename, do_model, do_eval, do_predict):
    """ Run classifier algorithm <algo> on training data in <training_filename> to build a model,
        then run it on data in <test_filename> (equivalent of WEKA "Supplied test set") """
    training_file = FileReader(training_filename)
    training_data = Instances(training_file)
    test_file = FileReader(test_filename)
    test_data = Instances(test_file)

    # set the class Index - the index of the dependent variable
    training_data.setClassIndex(class_index)
    test_data.setClassIndex(class_index)

    # create the model
    algo.buildClassifier(training_data)

    evaluation = None
    # only a trained classifier can be evaluated
    if do_eval or do_predict:
        evaluation = Evaluation(test_data)
        buffer = StringBuffer()              # buffer for the predictions
        attRange = Range()                   # no additional attributes output
        outputDistribution = Boolean(False)  # we don't want distribution
        evaluation.evaluateModel(algo, test_data, [buffer, attRange, outputDistribution])

    if verbose:
        if do_model:
            print "--> Generated model:\n"
            print algo.toString()
        if do_eval:
            print "--> Evaluation:\n"
            print evaluation.toSummaryString()
        if do_predict:
            print "--> Predictions:\n"
            print buffer

    return {"model": str(algo),
            "eval": str(evaluation.toSummaryString()),
            "predict": str(buffer)}
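# Usage sketch (assumption): class_index and verbose are module-level globals in this
# snippet, so they are set explicitly here; J48 and the ARFF file names are placeholders.
import weka.classifiers.trees.J48 as J48

class_index = 0   # column of the dependent variable
verbose = True
out = runClassifierAlgo(J48(), "train.arff", "test.arff", True, True, False)
print out["eval"]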
""" Commandline parameter(s): first parameter must be the ARFF file """ # check commandline parameters if (not (len(sys.argv) == 2)): print "Usage: supervised.py <ARFF-file>" sys.exit() # load data file print "Loading data..." datafile = FileReader(sys.argv[1] + ".arff") data = Instances(datafile) rand = Random() # seed from the system time data.randomize(rand) # randomize data with number generator # open output files bufsize=0 datafile = "data/plot/" + str(os.path.splitext(os.path.basename(__file__))[0]) + "_" + \ str(os.path.splitext(os.path.basename(sys.argv[1]))[0]) + "_rmse.csv" file=open(datafile, 'w', bufsize) file.write("cf,rmse\n") logfile = "logs/" + str(os.path.splitext(os.path.basename(__file__))[0]) + "_" + \ str(os.path.splitext(os.path.basename(sys.argv[1]))[0]) + "_tunable.log" log=open(logfile, 'w', bufsize) # open general log file
def buildClassifier(self, instances):
    """ builds the ZeroR classifier with the given data

    Parameter(s):
        'instances' -- the data to build the classifier from
    """
    self.getCapabilities().testWithFail(instances)

    # remove instances with missing class
    instances = Instances(instances)
    instances.deleteWithMissingClass()

    sumOfWeights = 0
    self.__Class = instances.classAttribute()
    self.__ClassValue = 0
    self.__Counts = None

    if (instances.classAttribute().isNumeric()):
        self.__Counts = None
    elif (instances.classAttribute().isNominal()):
        self.__Counts = jarray.zeros(instances.numClasses(), 'd')
        for i in range(len(self.__Counts)):
            self.__Counts[i] = 1
        sumOfWeights = instances.numClasses()

    enu = instances.enumerateInstances()
    while (enu.hasMoreElements()):
        instance = enu.nextElement()
        if (not instance.classIsMissing()):
            if (instances.classAttribute().isNominal()):
                self.__Counts[int(instance.classValue())] += instance.weight()
            else:
                self.__ClassValue += instance.weight() * instance.classValue()
            sumOfWeights += instance.weight()

    if (instances.classAttribute().isNumeric()):
        if (Utils.gr(sumOfWeights, 0)):
            self.__ClassValue /= sumOfWeights
    else:
        self.__ClassValue = Utils.maxIndex(self.__Counts)
        Utils.normalize(self.__Counts, sumOfWeights)
    return
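# A minimal sketch (assumption, mirroring weka.classifiers.rules.ZeroR) of the companion
# prediction methods such a Jython ZeroR would need; intended to live in the same class
# as buildClassifier above, with jarray already imported.
def classifyInstance(self, instance):
    """ returns the mean class value (numeric class) or the majority class index (nominal) """
    return self.__ClassValue

def distributionForInstance(self, instance):
    """ returns the normalized class counts, or the single class value for a numeric class """
    if self.__Counts is None:
        result = jarray.zeros(1, 'd')
        result[0] = self.__ClassValue
        return result
    result = jarray.zeros(len(self.__Counts), 'd')
    for i in range(len(self.__Counts)):
        result[i] = self.__Counts[i]
    return result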
print "Usage: supervised.py <ARFF-file> <crossvalidate>" sys.exit() crossvalidate = sys.argv[2] rand = Random() # seed from the system time # load properties p = Properties() p.load(open('./ml.properties')) # load data file print "Loading data..." trainfile = FileReader(sys.argv[1] + "-train.arff") print "Loading " + sys.argv[1] + "-train.arff" testfile = FileReader(sys.argv[1] + "-test.arff") print "Loading " + sys.argv[1] + "-test.arff" fulltrainset = Instances(trainfile) fulltrainset.setClassIndex(fulltrainset.numAttributes() - 1) testset = Instances(testfile) testset.setClassIndex(testset.numAttributes() - 1) # open output files bufsize=0 classifiername = str(os.path.splitext(os.path.basename(__file__))[0]) dataname = str(os.path.splitext(os.path.basename(sys.argv[1]))[0]) datafilelimit = "data/plot/" + classifiername + "_" + dataname + crossvalidate + "_instances.csv" filelimit=open(datafilelimit, 'w', bufsize) filelimit.write("instances,pctincorrecttest,pctincorrecttrain\n") logfile = "logs/" + classifiername + "_" + dataname + crossvalidate + ".log" log=open(logfile, 'w', bufsize) # open general log file timefilename = "data/plot/" + classifiername + "_" + dataname + crossvalidate + "_traintime.csv" timefile = open(timefilename, 'w', bufsize)
    first parameter must be the ARFF file one wants to process with J48
    Note: needs Weka 3.7.x to run (due to changes in the weka.classifiers.Evaluation class)
"""

# check commandline parameters
if (not (len(sys.argv) == 2)):
    print "Usage: UsingJ48Ext.py <ARFF-file>"
    sys.exit()

# load data file
print "Loading data..."
file = FileReader(sys.argv[1])
data = Instances(file)
# set the class Index - the index of the dependent variable
data.setClassIndex(data.numAttributes() - 1)

# create the model
evaluation = Evaluation(data)
output = PlainText()  # plain text output for predictions
output.setHeader(data)
buffer = StringBuffer()  # buffer to use
output.setBuffer(buffer)
attRange = Range()                   # no additional attributes output
outputDistribution = Boolean(False)  # we don't want distribution
j48 = J48()
j48.buildClassifier(data)

# only a trained classifier can be evaluated
evaluation.evaluateModel(j48, data, [output, attRange, outputDistribution])
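# A possible continuation (assumption, not from the original file): print the model,
# the evaluation summary and the predictions collected by the PlainText writer.
print "--> Generated model:\n"
print j48
print "--> Evaluation:\n"
print evaluation.toSummaryString()
print "--> Predictions:\n"
print buffer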
print "Usage: supervised.py <ARFF-file> <Validation>" sys.exit() crossvalidate = sys.argv[2] rand = Random() # seed from the system time # load properties p = Properties() p.load(open('./ml.properties')) # load data file print "Loading data..." trainfile = FileReader(sys.argv[1] + "-train.arff") print "Loading " + sys.argv[1] + "-train.arff" testfile = FileReader(sys.argv[1] + "-test.arff") print "Loading " + sys.argv[1] + "-test.arff" fulltrainset = Instances(trainfile) fulltrainset.setClassIndex(fulltrainset.numAttributes() - 1) testset = Instances(testfile) testset.setClassIndex(testset.numAttributes() - 1) # open output files bufsize=0 classifiername = str(os.path.splitext(os.path.basename(__file__))[0]) dataname = str(os.path.splitext(os.path.basename(sys.argv[1]))[0]) datafilelimit = "data/plot/" + classifiername + "_" + dataname + crossvalidate + "_instances.csv" filelimit=open(datafilelimit, 'w', bufsize) filelimit.write("instances,pctincorrecttest,pctincorrecttrain\n") logfile = "logs/" + classifiername + "_" + dataname + crossvalidate + ".log" log=open(logfile, 'w', bufsize) # open general log file for num in range(int(p['j48.initial']),fulltrainset.numInstances(),(fulltrainset.numInstances() / int(p['j48.numdatapoints']))):
def runClassifierAlgo(algo, class_index, training_filename, test_filename, do_model, do_eval, do_predict):
    """ If <test_filename>:
            run classifier algorithm <algo> on training data in <training_filename> to build a model,
            then test on data in <test_filename> (equivalent of Weka "Supplied test set")
        else:
            do cross-validation of classifier algorithm <algo> on data in <training_filename>
            (4 folds, see below)
        <class_index> is the column containing the dependent variable

        http://weka.wikispaces.com/Generating+classifier+evaluation+output+manually
        http://weka.sourceforge.net/doc.dev/weka/classifiers/Evaluation.html
    """
    print ' runClassifierAlgo: training_filename =', training_filename, ', test_filename =', test_filename
    misc.checkExists(training_filename)

    training_file = FileReader(training_filename)
    training_data = Instances(training_file)
    if test_filename:
        test_file = FileReader(test_filename)
        test_data = Instances(test_file)
    else:
        test_data = training_data

    # set the class Index - the index of the dependent variable
    training_data.setClassIndex(class_index)
    test_data.setClassIndex(class_index)

    # create the model
    if test_filename:
        algo.buildClassifier(training_data)

    evaluation = None
    # only a trained classifier can be evaluated
    if do_eval or do_predict:
        evaluation = Evaluation(test_data)
        buffer = StringBuffer()              # buffer for the predictions
        attRange = Range()                   # no additional attributes output
        outputDistribution = Boolean(False)  # we don't want distribution
        if test_filename:
            evaluation.evaluateModel(algo, test_data, [buffer, attRange, outputDistribution])
        else:
            # evaluation.evaluateModel(algo, [String('-t ' + training_filename), String('-c 1')])
            # print evaluation.toSummaryString()
            rand = Random(1)
            evaluation.crossValidateModel(algo, training_data, 4, rand)  # 4-fold cross-validation
            if False:
                print 'percentage correct =', evaluation.pctCorrect()
                print 'area under ROC =', evaluation.areaUnderROC(class_index)
                confusion_matrix = evaluation.confusionMatrix()
                for l in confusion_matrix:
                    print '** ', ','.join('%2d' % int(x) for x in l)

    if verbose:
        if do_model:
            print '--> Generated model:\n'
            print algo.toString()
        if do_eval:
            print '--> Evaluation:\n'
            print evaluation.toSummaryString()
        if do_predict:
            print '--> Predictions:\n'
            print buffer

    return {'model': str(algo),
            'eval': str(evaluation.toSummaryString()),
            'predict': str(buffer)}
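# Usage sketch (assumption): passing test_filename=None takes the cross-validation branch
# above; NaiveBayes and the file name are placeholders, and verbose is a module-level global.
import weka.classifiers.bayes.NaiveBayes as NaiveBayes

verbose = False
out = runClassifierAlgo(NaiveBayes(), 0, "train.arff", None, False, True, False)
print out["eval"]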
    first parameter must be the ARFF file one wants to process with J48
    Note: needs Weka 3.6.x to run (due to changes in the weka.classifiers.Evaluation class)
"""

# check commandline parameters
if (not (len(sys.argv) == 2)):
    print "Usage: UsingJ48Ext.py <ARFF-file>"
    sys.exit()

# load data file
print "Loading data..."
file = FileReader(sys.argv[1])
data = Instances(file)
# set the class Index - the index of the dependent variable
data.setClassIndex(data.numAttributes() - 1)

# create the model
evaluation = Evaluation(data)
buffer = StringBuffer()  # buffer for the predictions
attRange = Range()                   # no additional attributes output
outputDistribution = Boolean(False)  # we don't want distribution
j48 = J48()
j48.buildClassifier(data)

# only a trained classifier can be evaluated
evaluation.evaluateModel(j48, data, [buffer, attRange, outputDistribution])

# print out the built model
print "--> Generated model:\n"
import weka.core.Instances as Instances
import weka.classifiers.trees.J48 as J48
import weka.classifiers.Evaluation as Evaluation
import weka.core.Range as Range
import weka.classifiers.functions.MultilayerPerceptron as MultilayerPerceptron
import weka.core.SerializationHelper as SerializationHelper

# check commandline parameters
if (not (len(sys.argv) == 3)):
    print "Usage: weka.py <train-ARFF-file> <test-ARFF-file>"
    sys.exit()

file = FileReader(sys.argv[1])
file2 = FileReader(sys.argv[2])
data = Instances(file)
test = Instances(file2)
data.setClassIndex(data.numAttributes() - 1)
test.setClassIndex(test.numAttributes() - 1)

evaluation = Evaluation(data)
buffer = StringBuffer()
attRange = Range()                   # no additional attributes output
outputDistribution = Boolean(False)  # we don't want distribution
nn = MultilayerPerceptron()
nn.buildClassifier(data)  # only a trained classifier can be evaluated
#print evaluation.evaluateModel(nn, ['-t', sys.argv[1], '-T', sys.argv[2]])#;, [buffer, attRange, outputDistribution])
res = evaluation.evaluateModel(nn, test, [buffer, attRange, outputDistribution])

f = open('predictions/' + data.relationName(), 'w')
for d in res:
""" Commandline parameter(s): first parameter must be the ARFF file """ # check commandline parameters if (not (len(sys.argv) == 2)): print "Usage: supervised.py <ARFF-file>" sys.exit() # load data file print "Loading data..." datafile = FileReader(sys.argv[1]) data = Instances(datafile) rand = Random() # seed from the system time data.randomize(rand) # randomize data with number generator # open output files bufsize=0 dataname = str(os.path.splitext(os.path.basename(sys.argv[1]))[0]) # loop for different amounts of data with fixed test set datasize = data.numInstances() limit = (datasize*2)/3 # loop until we use 2/3 data as training set testset = Instances(data,limit,datasize-limit) # create training set using the last 1/3 of data testset.setClassIndex(testset.numAttributes() - 1) saver = ArffSaver() saver.setInstances(testset)
""" Commandline parameter(s): first parameter must be the ARFF file """ # check commandline parameters if (not (len(sys.argv) == 2)): print "Usage: supervised.py <ARFF-file>" sys.exit() # load data file print "Loading data..." datafile = FileReader(sys.argv[1] + ".arff") data = Instances(datafile) rand = Random() # seed from the system time data.randomize(rand) # randomize data with number generator # open output files bufsize = 0 datafile = "data/plot/" + str(os.path.splitext(os.path.basename(__file__))[0]) + "_" + \ str(os.path.splitext(os.path.basename(sys.argv[1]))[0]) + "_rmse.csv" file = open(datafile, 'w', bufsize) # open a file for rmse data file.write("epochs,le,lm,kd,ball,cover\n") logfile = "logs/" + str(os.path.splitext(os.path.basename(__file__))[0]) + "_" + \ str(os.path.splitext(os.path.basename(sys.argv[1]))[0]) + "_tunable.log" log = open(logfile, 'w', bufsize) # open general log file
percentiles = []
for i in range(0, 100, step_size):
    index = int(n * i / 100)
    percentiles.append(thread_array[index])  # 0-99 percentile
percentiles.append(thread_array[n - 1])      # 100th percentile

f = open(''.join([directory, 'percentiles.txt']), 'w')
f.write(str(percentiles))
f.close()
#print str(percentiles)

## Data Distribution testing/Training
data = FileReader(data_file)
data = Instances(data)
data = Instances(data, 0, n - (n % folds))
n = n - (n % folds)
print data.numInstances()
len_fold = int(math.floor(n / folds))
folds_test = []
folds_train = []
for i in range(0, n + 1, len_fold)[:-1]:
    folds_test.append(Instances(data, i, len_fold))
    f = open(''.join([directory, ''.join(['fold_test_', str(i / len_fold), '.arff'])]), "w")
    f.write(str(folds_test[-1]))
    f.close()
print "Usage: supervised.py <ARFF-file>" sys.exit() crossvalidate = sys.argv[2] rand = Random() # seed from the system time # load properties p = Properties() p.load(open('./ml.properties')) # load data file print "Loading data..." trainfile = FileReader(sys.argv[1] + "-train.arff") print "Loading " + sys.argv[1] + "-train.arff" testfile = FileReader(sys.argv[1] + "-test.arff") print "Loading " + sys.argv[1] + "-test.arff" fulltrainset = Instances(trainfile) fulltrainset.setClassIndex(fulltrainset.numAttributes() - 1) testset = Instances(testfile) testset.setClassIndex(testset.numAttributes() - 1) # open output files bufsize = 0 classifiername = str(os.path.splitext(os.path.basename(__file__))[0]) dataname = str(os.path.splitext(os.path.basename(sys.argv[1]))[0]) datafilelimit = "data/plot/" + classifiername + "_" + dataname + crossvalidate + "_instances.csv" filelimit = open(datafilelimit, 'w', bufsize) filelimit.write( "instances,lineartest,lineartrain,polytest,polytrain,radialtest,radialtrain,sigmoidtest,sigmoidtrain\n" ) logfile = "logs/" + classifiername + "_" + dataname + crossvalidate + ".log" log = open(logfile, 'w', bufsize) # open general log file
import weka.core.Instances as Instances
import weka.classifiers.trees.J48 as J48
import weka.classifiers.Evaluation as Evaluation
import weka.core.Range as Range
import weka.classifiers.functions.MultilayerPerceptron as MultilayerPerceptron
import weka.core.SerializationHelper as SerializationHelper

# check commandline parameters
if (not (len(sys.argv) == 3)):
    print "Usage: weka.py <train-ARFF-file> <test-ARFF-file>"
    sys.exit()

file = FileReader(sys.argv[1])
file2 = FileReader(sys.argv[2])
data = Instances(file)
test = Instances(file2)
data.setClassIndex(data.numAttributes() - 1)
test.setClassIndex(test.numAttributes() - 1)

evaluation = Evaluation(data)
buffer = StringBuffer()
attRange = Range()                   # no additional attributes output
outputDistribution = Boolean(False)  # we don't want distribution
nn = MultilayerPerceptron()
nn.buildClassifier(data)  # only a trained classifier can be evaluated
#print evaluation.evaluateModel(nn, ['-t', sys.argv[1], '-T', sys.argv[2]])#;, [buffer, attRange, outputDistribution])
res = evaluation.evaluateModel(nn, test, [buffer, attRange, outputDistribution])

f = open('predictions/' + data.relationName(), 'w')
for d in res:
    f.write(str(d) + '\n')
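# A possible continuation (assumption, not from the original script): close the predictions
# file and print the evaluation summary.
f.close()
print evaluation.toSummaryString()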
# Define a WEKA Attribute for each feature (each op, 14 total plus the class)
attribute_names = ["attr-%i" % (i + 1) for i in xrange(14)]
attributes = ArrayList()
for name in attribute_names:
    attributes.add(Attribute(name))
# Add an attribute at the end for the classification classes
attributes.add(Attribute("class", ["membrane", "mit-boundary", "mit-inside", "cytoplasm"]))

# Create the training data structure
# which consists of 16 samples for each membrane training image rotation
# and 4 samples for each mitochondrial boundary image rotation
# and times 2 to then add examples of the other, non-membrane class
training_data = Instances("training", attributes,
                          (len(synth_imgs_membrane) * 16 + len(synth_imgs_mit_boundary) * 4) * 2)
training_data.setClassIndex(len(attributes) - 1)

def populateInstances(instances, synth_imgs, class_index, mins, maxs):
    # Populate the training data: create the filter bank for each feature image
    # by reading values from the interval defined by mins and maxs
    target = ArrayImgs.floats([width, height])
    interval = FinalInterval(mins, maxs)
    n_samples = Intervals.numElements(interval)
    for img in synth_imgs:
        vectors = [zeros(len(attributes), 'd') for _ in xrange(n_samples)]
        for k, op in enumerate(filterBank(img, sumType=DoubleType())):
            imgOp = compute(op).into(target)
            for i, v in enumerate(Views.interval(imgOp, interval)):
elif o in ("-o","--outputArff"): outputFn = a numReqOpt = numReqOpt + 1 else: assert False, "unhandled option" if (numReqOpt < 2): usage() return 1 options = {'idFlag':True, 'weightFlag': False, 'rmClassFlag': False, 'rmClass': 0} # read the first dataset fn = inputList[0] fid = FileReader(fn) Data = Instances(fid) Data, IDs = PreprocessData(Data,options) # remove class label attributeremove = AttributeRemove() attributeremove.setInvertSelection(Boolean(False)) # remove class labels from dataset attributeremove.setAttributeIndices(String(str(Data.numAttributes()))) attributeremove.setInputFormat(Data) newData = Filter.useFilter(Data, attributeremove) # loop over input arff file cnt = Data.numAttributes() for fnCnt in range(1,len(inputList)): fn = inputList[fnCnt] fid = FileReader(fn) Data = Instances(fid) Data, IDs = PreprocessData(Data,options) # remove class label
""" Commandline parameter(s): first parameter must be the ARFF file """ # check commandline parameters if (not (len(sys.argv) == 2)): print "Usage: supervised.py <ARFF-file>" sys.exit() # load data file print "Loading data..." datafile = FileReader(sys.argv[1]) data = Instances(datafile) rand = Random() # seed from the system time data.randomize(rand) # randomize data with number generator # open output files bufsize = 0 dataname = str(os.path.splitext(os.path.basename(sys.argv[1]))[0]) # loop for different amounts of data with fixed test set datasize = data.numInstances() limit = (datasize * 2) / 3 # loop until we use 2/3 data as training set testset = Instances(data, limit, datasize - limit) # create training set using the last 1/3 of data testset.setClassIndex(testset.numAttributes() - 1) saver = ArffSaver()
## Data Distribution testing/Training
data = FileReader(data_file)
data = Instances(data)
data = Instances(data, 0, n - (n % folds))
n = n - (n % folds)
print data.numInstances()
len_fold = int(math.floor(n / folds))
folds_test = []
folds_train = []
for i in range(0, n + 1, len_fold)[:-1]:
    folds_test.append(Instances(data, i, len_fold))
    f = open(''.join([directory, ''.join(['fold_test_', str(i / len_fold), '.arff'])]), "w")
    f.write(str(folds_test[-1]))
    f.close()
    temp = Instances(data, 0, n)
    for j in range(i, i + len_fold, 1):
        # deleting index i repeatedly removes the whole fold: the remaining instances shift left
        temp.delete(i)
    folds_train.append(temp)
    f = open(''.join([directory, ''.join(['fold_train_', str(i / len_fold), '.arff'])]), "w")
    f.write(str(folds_train[-1]))
    f.close()

## Prediction
buffers = []  ## List of per fold predictions
weights = []  ## List of per fold weights per attribute
for fld in range(0, folds):
    train = folds_train[fld]