Example #1
def createAttributes(featureVector):
  """ Build the WEKA attribute list for a numeric feature vector plus a nominal class. """
  numFeatures = len(featureVector)
  attributes = [Attribute(str(i) + " numeric") for i in range(numFeatures)]
  attributes.append(Attribute("class", ArrayList(["true", "false"])))
  return ArrayList(attributes)
Example #2
	def _getInstances(self, classAttr):
		# create attributes
		self.classAttr = classAttr
		attName2Obj = {}
		attVector = FastVector()
		for attName in self.numericAttributes:
			attr = Attribute(attName)
			attVector.addElement(attr)
			attName2Obj[attName] = attr
		for (attName, domain) in self.attName2Domain.iteritems():
			vDomain = FastVector(len(domain))
			for v in domain:
				#print v
				vDomain.addElement(String(str(v)))
			attr = Attribute(attName, vDomain)
			attVector.addElement(attr)
			attName2Obj[attName] = attr
		self.attName2Obj = attName2Obj
		
		# create Instances object
		instances = Instances("instances", attVector, len(self.instances))
		for i in self.instances:
			inst = self._makeInstance(i)
			instances.add(inst)
			
		instances.setClass(attName2Obj[classAttr])
		return instances
Example #3
def readFeature(num_features, type, select_feature, numtrees):
    #filename1=resultFileTest
    #filename2=resultFileTest2
    filename1=resultFile+'_'+type+'_'+num_features+'_'+select_feature+'_train.csv'
    filename2=resultFile+'_'+type+'_'+num_features+'_'+select_feature+'_test.csv'
    #print filename1
    loader=CSVLoader()
    loader.setSource(File(filename1))
    data=loader.getDataSet()
    #print data.numAttributes()    
    
    data.setClassIndex(data.numAttributes()-1)

    rf=RF()
    rf.setNumTrees(numtrees)
    
    rf.buildClassifier(data)
   
    #print rf
    loader.setSource(File(filename2))
    

    test_data=Instances(loader.getDataSet())
    
    test_data.setClassIndex(test_data.numAttributes()-1)

    
    # num = test_data.numInstances()
    # print num
    # for i in xrange(num):
    #     r1 = rf.distributionForInstance(test_data.instance(i))
    #     r2 = rf.classifyInstance(test_data.instance(i))
    #     print r1
    #     print r2
    buffer = StringBuffer()  # buffer for the predictions
    output=PlainText()
    output.setHeader(test_data)
    output.setBuffer(buffer)
    
    attRange = Range()  # attributes to output
    outputDistribution = Boolean(True)
    evaluator=Evaluation(data)
    evaluator.evaluateModel(rf,test_data,[output,attRange,outputDistribution])
    #print evaluator.evaluateModel(RF(),['-t',filename1,'-T',filename2,'-I',str(numtrees)])
    #evaluator1=Evaluation(test_data)
    print evaluator.toSummaryString()
    print evaluator.toClassDetailsString()
    print evaluator.toMatrixString()
    return [evaluator.precision(1), evaluator.recall(1), evaluator.fMeasure(1),
            evaluator.matthewsCorrelationCoefficient(1),
            evaluator.numTruePositives(1), evaluator.numFalsePositives(1),
            evaluator.numTrueNegatives(1), evaluator.numFalseNegatives(1),
            evaluator.areaUnderROC(1)]
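A hedged usage sketch for readFeature above; it assumes the module-level resultFile prefix points at files such as <resultFile>_rf_100_chi2_train.csv and the matching _test.csv:

metrics = readFeature('100', 'rf', 'chi2', 50)  # num_features, type, select_feature, numtrees
precision, recall, fmeasure = metrics[0], metrics[1], metrics[2]
print precision, recall, fmeasure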
Example #4
def readDataFromResultsTable(attributes, rt):
    data = Instances("results", ArrayList(attributes), rt.size())
    nrOfFeatures = len(attributes)
    for i in range(0, rt.size()):
        inst = DenseInstance(nrOfFeatures)
        for j in range(0, nrOfFeatures):
            value = rt.getValue(attributes[j].name(), i)
            inst.setValue(attributes[j], value)
        data.add(inst)  # add once per row, after all attribute values are set
    return data
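A minimal usage sketch for readDataFromResultsTable, assuming ImageJ's ResultsTable and the WEKA classes are on the classpath and every column of the open Results table is numeric:

from ij.measure import ResultsTable
from weka.core import Attribute

rt = ResultsTable.getResultsTable()                    # the currently open Results table
attributes = [Attribute(h) for h in rt.getHeadings()]  # one numeric attribute per column
data = readDataFromResultsTable(attributes, rt)
print data.numInstances()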
Example #5
def cluster(algorithm, filename, options = ''):
    reader = BufferedReader(FileReader(filename))
    data = Instances(reader)
    reader.close()
    cl = algorithm()
    cl.setOptions(options.split())
    cl.buildClusterer(data)
    returnData = []
    for instance in data.enumerateInstances():
        returnData.append(cl.clusterInstance(instance))
    print returnData
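A hedged call of the cluster helper above; SimpleKMeans is one possible WEKA clusterer and "iris.arff" is a placeholder file name:

from weka.clusterers import SimpleKMeans

cluster(SimpleKMeans, "iris.arff", options="-N 3")  # pass the class, not an instance; -N sets the number of clusters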
Example #6
def cluster(algorithm, filename, options=''):
    reader = BufferedReader(FileReader(filename))
    data = Instances(reader)
    reader.close()
    cl = algorithm()
    cl.setOptions(options.split())
    cl.buildClusterer(data)
    returnData = []
    for instance in data.enumerateInstances():
        returnData.append(cl.clusterInstance(instance))
    print returnData
Example #7
    def load_arff(self, arff):
        file = FileReader(arff)

        #fis = FileInputStream(arff)
        #file = InputStreamReader(fis, "UTF-8");

        #fr = FileReader(arff)
        #file = BufferedReader(fr)
        
        data = Instances(file)
        data.setClassIndex(data.numAttributes() - 1)
        return data
Example #8
def classify(img, classifier, class_names, ops=None, distribution_class_index=-1):
  """ img: a 2D RandomAccessibleInterval.
      classifier: a WEKA Classifier instance, like SMO or FastRandomForest, etc. Any.
                  If it's a string, interprets it as a file path and attempts to deserialize
                  a previously saved trained classifier.
      class_names: the list of names of each class to learn.
      ops: the filter bank of ImgMath ops for the img.
      distribution_class_index: defaults to -1, meaning return the class index for each pixel.
                                When larger than -1, it's interpreted as a class index, and
                                returns instead the floating-point value of each pixel in
                                the distribution of that particular class index. """
  if type(classifier) == str:
    classifier = SerializationHelper.read(classifier)

  ops = ops if ops else filterBank(img)
  
  attributes = ArrayList()
  for i in xrange(len(ops)):
    attributes.add(Attribute("attr-%i" % i))
  #for name in classifier.attributeNames()[0][1]:
  #  attributes.add(Attribute(name))
  attributes.add(Attribute("class", class_names))
  
  info = Instances("structure", attributes, 1)
  info.setClassIndex(len(attributes) -1)

  opImgs = [compute(op).into(ArrayImgs.floats([img.dimension(0), img.dimension(1)])) for op in ops]
  cs_opImgs = Views.collapse(Views.stack(opImgs))

  result = ArrayImgs.floats([img.dimension(0), img.dimension(1)])
  cr = result.cursor()
  cop = Views.iterable(cs_opImgs).cursor()

  while cr.hasNext():
    tc = cop.next()
    vector = array((tc.get(i).getRealDouble() for i in xrange(len(opImgs))), 'd')
    vector += array([0], 'd')
    di = DenseInstance(1.0, vector)
    di.setDataset(info) # the list of attributes
    if distribution_class_index > -1:
      cr.next().setReal(classifier.distributionForInstance(di)[distribution_class_index])
    else:
      cr.next().setReal(classifier.classifyInstance(di))

  return result
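A hedged usage sketch for the pixel-wise classify function above, run from Fiji's Jython interpreter; the model path and class names are placeholders, and the serialized classifier is assumed to have been trained on the same filter bank:

from ij import IJ
from net.imglib2.img.display.imagej import ImageJFunctions as IL

img = IL.wrap(IJ.getImage())  # the current 2D image as an ImgLib2 image
result = classify(img, "/path/to/classifier.model", ["class-0", "class-1"])
IL.wrap(result, "classified").show()  # view the per-pixel class indices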
Example #9
    def _getInstances(self, classAttr):
        # create attributes
        self.classAttr = classAttr
        attName2Obj = {}
        attVector = FastVector()
        for attName in self.numericAttributes:
            attr = Attribute(attName)
            attVector.addElement(attr)
            attName2Obj[attName] = attr
        for (attName, domain) in self.attName2Domain.iteritems():
            vDomain = FastVector(len(domain))
            for v in domain:
                #print v
                vDomain.addElement(String(str(v)))
            attr = Attribute(attName, vDomain)
            attVector.addElement(attr)
            attName2Obj[attName] = attr
        self.attName2Obj = attName2Obj

        # create Instances object
        instances = Instances("instances", attVector, len(self.instances))
        for i in self.instances:
            inst = self._makeInstance(i)
            instances.add(inst)

        instances.setClass(attName2Obj[classAttr])
        return instances
Example #10
def build_instances(state, dataset):
    class_attributes = ["Sunny", "Fog", "Rain", "Snow", "Hail", "Thunder", "Tornado"]
    header = ["state","lat", "lon", "day","temp","dewp","weather"]

    #build attributes based on the header and types
    attributes = []
    for h in header[:-1]:
        attributes.append(Attribute(h))

    #add the classification attribute
    classification_vector = FastVector(len(class_attributes))
    for c in class_attributes:
        classification_vector.addElement(c)
    attributes.append(Attribute("toClassify", classification_vector))

    fvWekaAttributes = FastVector(len(dataset[0]))

    for a in attributes:
        fvWekaAttributes.addElement(a)
    
    training_set = Instances("C4.5Set", fvWekaAttributes, len(dataset))
    training_set.setClassIndex(len(header)-1)

    for d in dataset:
        inst = Instance(len(d))
        for i in range(len(d)-1):
            try:
                inst.setValue(fvWekaAttributes.elementAt(i), float(d[i]))
            except:
                pass
                #print "failed on", i, d[i], d[i].__class__
        inst.setValue(fvWekaAttributes.elementAt(len(d)-1), d[-1])
        
        training_set.add(inst)


    j48 = J48()
    j48.buildClassifier(training_set)
    return state,parse_tree(str(j48))
Example #11
def createTrainingInstances(matchingExamples, mismatchingExamples):
  """ Expects the matchingExamples to be a list of feature lists,
      i.e. the feature vector is a list. """
  numFeatures = len(matchingExamples[0])
  attributes = [Attribute(str(i) + " numeric") for i in range(numFeatures)]
  attributes.append(Attribute("class", ArrayList(["true", "false"])))
  trainingData = Instances("matches", ArrayList(attributes), len(matchingExamples) + len(mismatchingExamples))
  trainingData.setClassIndex(len(attributes) -1) # the last index
  for f in matchingExamples:
    trainingData.add(DenseInstance(1.0, f + [1])) # 1 is True
  for f in mismatchingExamples:
    trainingData.add(DenseInstance(1.0, f + [0])) # 0 is False
  return trainingData
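A short, hedged training sketch using the Instances returned above; SMO is just one possible WEKA classifier, and matching_vectors / mismatching_vectors stand for two lists of equal-length numeric feature lists:

from weka.classifiers.functions import SMO

training = createTrainingInstances(matching_vectors, mismatching_vectors)
classifier = SMO()
classifier.buildClassifier(training)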
Example #12
def classify(classifier, matches):
  """ Expects a list of feature vectors, each of numFeatures length.
      Returns one [classifyInstance result, distributionForInstance[1]] pair per match. """
  attributes = createAttributes(matches[0])
  instances = Instances("tests", attributes, len(matches))
  instances.setClassIndex(len(attributes) - 1)
  for match in matches:
    instances.add(DenseInstance(1.0, match + [0]))  # the class value is a placeholder
  results = []
  for i in range(len(matches)):
    result = classifier.classifyInstance(instances.instance(i))
    dist = classifier.distributionForInstance(instances.instance(i))
    results.append([result, dist[1]])
  return results
Example #13
def createTrainingData(img, samples, class_names, n_samples=0, ops=None):
  """ img: a 2D RandomAccessibleInterval.
      samples: a sequence of long[] (or int numeric sequence or Localizable) and class_index pairs; can be a generator.
      n_samples: optional, the number of samples (in case samples is e.g. a generator).
      class_names: a list of class names, as many as different class_index.
      ops: optional, the sequence of ImgMath ops to apply to the img, defaults to filterBank(img)

      return an instance of WEKA Instances
  """
  ops = ops if ops else filterBank(img)

  if 0 == n_samples:
    n_samples = len(samples)
  
  # Define a WEKA Attribute for each feature (one for op in the filter bank, plus the class)
  attribute_names = ["attr-%i" % (i+1) for i in xrange(len(ops))]
  attributes = ArrayList()
  for name in attribute_names:
    attributes.add(Attribute(name))
  # Add an attribute at the end for the classification classes
  attributes.add(Attribute("class", class_names))

  # Create the training data structure
  training_data = Instances("training", attributes, n_samples)
  training_data.setClassIndex(len(attributes) -1)

  opImgs = [compute(op).into(ArrayImgs.floats([img.dimension(0), img.dimension(1)])) for op in ops]
  ra = Views.collapse(Views.stack(opImgs)).randomAccess()

  for position, class_index in samples:
    ra.setPosition(position)
    tc = ra.get()
    vector = array((tc.get(i).getRealDouble() for i in xrange(len(opImgs))), 'd')
    vector += array([class_index], 'd')
    training_data.add(DenseInstance(1.0, vector))

  return training_data
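A hedged follow-up to createTrainingData: train a classifier on the returned Instances and serialize it for later reuse; img, samples and the output path are placeholders:

from weka.classifiers.functions import SMO
from weka.core import SerializationHelper

training = createTrainingData(img, samples, ["background", "foreground"])
classifier = SMO()
classifier.buildClassifier(training)
SerializationHelper.write("/tmp/pixel-classifier.model", classifier)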
Example #14
def runClassifierAlgo(algo, training_filename, test_filename, do_model, do_eval, do_predict):
    """ Run classifier algorithm <algo> on training data in <training_filename> to build a model
        then run in on data in <test_filename> (equivalent of WEKA "Supplied test set") """
    training_file = FileReader(training_filename)
    training_data = Instances(training_file)
    test_file = FileReader(test_filename)
    test_data = Instances(test_file)

    # set the class Index - the index of the dependent variable
    training_data.setClassIndex(class_index)
    test_data.setClassIndex(class_index)

    # create the model
    algo.buildClassifier(training_data)

    evaluation = None
    # only a trained classifier can be evaluated
    if do_eval or do_predict:
        evaluation = Evaluation(test_data)
        buffer = StringBuffer()  # buffer for the predictions
        attRange = Range()  # no additional attributes output
        outputDistribution = Boolean(False)  # we don't want distribution
        evaluation.evaluateModel(algo, test_data, [buffer, attRange, outputDistribution])

    if verbose:
        if do_model:
            print "--> Generated model:\n"
            print algo.toString()
        if do_eval:
            print "--> Evaluation:\n"
            print evaluation.toSummaryString()
        if do_predict:
            print "--> Predictions:\n"
            print buffer

    return {"model": str(algo), "eval": str(evaluation.toSummaryString()), "predict": str(buffer)}
"""
Commandline parameter(s):

    first parameter must be the ARFF file

"""

# check commandline parameters
if (not (len(sys.argv) == 2)):
    print "Usage: supervised.py <ARFF-file>"
    sys.exit()

# load data file
print "Loading data..."
datafile = FileReader(sys.argv[1] + ".arff")
data = Instances(datafile)
rand = Random()              # seed from the system time
data.randomize(rand)         # randomize data with number generator

# open output files
bufsize=0

datafile = "data/plot/" + str(os.path.splitext(os.path.basename(__file__))[0]) + "_" + \
   str(os.path.splitext(os.path.basename(sys.argv[1]))[0]) + "_rmse.csv"
file=open(datafile, 'w', bufsize)
file.write("cf,rmse\n")

logfile = "logs/" + str(os.path.splitext(os.path.basename(__file__))[0]) + "_" + \
   str(os.path.splitext(os.path.basename(sys.argv[1]))[0]) + "_tunable.log"
log=open(logfile, 'w', bufsize) # open general log file
Example #16
    def buildClassifier(self, instances):
        """
        builds the ZeroR classifier with the given data
        
        Parameter(s):
        
            'instances' -- the data to build the classifier from
        """

        self.getCapabilities().testWithFail(instances)

        # remove instances with missing class
        instances = Instances(instances)
        instances.deleteWithMissingClass()

        sumOfWeights = 0
        self.__Class = instances.classAttribute()
        self.__ClassValue = 0
        self.__Counts = None

        if (instances.classAttribute().isNumeric()):
            self.__Counts = None
        elif (instances.classAttribute().isNominal()):
            self.__Counts = jarray.zeros(instances.numClasses(), 'd')
            for i in range(len(self.__Counts)):
                self.__Counts[i] = 1
            sumOfWeights = instances.numClasses()

        enu = instances.enumerateInstances()
        while (enu.hasMoreElements()):
            instance = enu.nextElement()
            if (not instance.classIsMissing()):
                if (instances.classAttribute().isNominal()):
                    self.__Counts[int(
                        instance.classValue())] += instance.weight()
                else:
                    self.__ClassValue += instance.weight(
                    ) * instance.classValue()
                sumOfWeights += instance.weight()

        if (instances.classAttribute().isNumeric()):
            if (Utils.gr(sumOfWeights, 0)):
                self.__ClassValue /= sumOfWeights
        else:
            self.__ClassValue = Utils.maxIndex(self.__Counts)
            Utils.normalize(self.__Counts, sumOfWeights)

        return
Example #17
    print "Usage: supervised.py <ARFF-file> <crossvalidate>"
    sys.exit()
crossvalidate = sys.argv[2]
rand = Random()              # seed from the system time

# load properties
p = Properties()
p.load(open('./ml.properties'))

# load data file
print "Loading data..."
trainfile = FileReader(sys.argv[1] + "-train.arff")
print "Loading " + sys.argv[1] + "-train.arff"
testfile = FileReader(sys.argv[1] + "-test.arff")
print "Loading " + sys.argv[1] + "-test.arff"
fulltrainset = Instances(trainfile)
fulltrainset.setClassIndex(fulltrainset.numAttributes() - 1)
testset = Instances(testfile)
testset.setClassIndex(testset.numAttributes() - 1)

# open output files
bufsize=0
classifiername = str(os.path.splitext(os.path.basename(__file__))[0])
dataname = str(os.path.splitext(os.path.basename(sys.argv[1]))[0])
datafilelimit = "data/plot/" + classifiername + "_" + dataname + crossvalidate + "_instances.csv"
filelimit=open(datafilelimit, 'w', bufsize)
filelimit.write("instances,pctincorrecttest,pctincorrecttrain\n")
logfile = "logs/" + classifiername + "_" + dataname + crossvalidate + ".log"
log=open(logfile, 'w', bufsize) # open general log file
timefilename = "data/plot/" + classifiername + "_" + dataname + crossvalidate + "_traintime.csv"
timefile = open(timefilename, 'w', bufsize)
Example #18
    first parameter must be the ARFF file one wants to process with J48

Note: needs Weka 3.7.x to run (due to changes in the
      weka.classifiers.Evaluation class)

"""

# check commandline parameters
if (not (len(sys.argv) == 2)):
    print "Usage: UsingJ48Ext.py <ARFF-file>"
    sys.exit()

# load data file
print "Loading data..."
file = FileReader(sys.argv[1])
data = Instances(file)

# set the class Index - the index of the dependent variable
data.setClassIndex(data.numAttributes() - 1)

# create the model
evaluation = Evaluation(data)
output = PlainText()  # plain text output for predictions
output.setHeader(data)
buffer = StringBuffer()  # buffer to use
output.setBuffer(buffer)
attRange = Range()  # no additional attributes output
outputDistribution = Boolean(False)  # we don't want distribution
j48 = J48()
j48.buildClassifier(data)  # only a trained classifier can be evaluated
evaluation.evaluateModel(j48, data, [output, attRange, outputDistribution])
Example #19
    print "Usage: supervised.py <ARFF-file> <Validation>"
    sys.exit()
crossvalidate = sys.argv[2]
rand = Random()              # seed from the system time

# load properties
p = Properties()
p.load(open('./ml.properties'))

# load data file
print "Loading data..."
trainfile = FileReader(sys.argv[1] + "-train.arff")
print "Loading " + sys.argv[1] + "-train.arff"
testfile = FileReader(sys.argv[1] + "-test.arff")
print "Loading " + sys.argv[1] + "-test.arff"
fulltrainset = Instances(trainfile)
fulltrainset.setClassIndex(fulltrainset.numAttributes() - 1)
testset = Instances(testfile)
testset.setClassIndex(testset.numAttributes() - 1)

# open output files
bufsize=0
classifiername = str(os.path.splitext(os.path.basename(__file__))[0])
dataname = str(os.path.splitext(os.path.basename(sys.argv[1]))[0])
datafilelimit = "data/plot/" + classifiername + "_" + dataname + crossvalidate + "_instances.csv"
filelimit=open(datafilelimit, 'w', bufsize)
filelimit.write("instances,pctincorrecttest,pctincorrecttrain\n")
logfile = "logs/" + classifiername + "_" + dataname + crossvalidate + ".log"
log=open(logfile, 'w', bufsize) # open general log file

for num in range(int(p['j48.initial']),fulltrainset.numInstances(),(fulltrainset.numInstances() / int(p['j48.numdatapoints']))):
Example #20
def runClassifierAlgo(algo, class_index, training_filename, test_filename, do_model, do_eval, do_predict):
    """ If <test_filename>
            Run classifier algorithm <algo> on training data in <training_filename> to build a model
            then test on data in <test_filename> (equivalent of Weka "Supplied test set") 
        else
            do 4-fold cross-validation of classifier algorithm <algo> on data in <training_filename>
        
        <class_index> is the column containing the dependent variable 
        
        http://weka.wikispaces.com/Generating+classifier+evaluation+output+manually
        http://weka.sourceforge.net/doc.dev/weka/classifiers/Evaluation.html
    """
    print ' runClassifierAlgo: training_filename= ', training_filename, ', test_filename=', test_filename
    misc.checkExists(training_filename)

    training_file = FileReader(training_filename)
    training_data = Instances(training_file)
    if test_filename:
        test_file = FileReader(test_filename)
        test_data = Instances(test_file)
    else:
        test_data = training_data

    # set the class Index - the index of the dependent variable
    training_data.setClassIndex(class_index)
    test_data.setClassIndex(class_index)

    # create the model
    if test_filename:
        algo.buildClassifier(training_data)

    evaluation = None
    # only a trained classifier can be evaluated
    if do_eval or do_predict:
        evaluation = Evaluation(test_data)
        buffer = StringBuffer()             # buffer for the predictions
        attRange = Range()                  # no additional attributes output
        outputDistribution = Boolean(False) # we don't want distribution
        if test_filename:
            evaluation.evaluateModel(algo, test_data, [buffer, attRange, outputDistribution])
        else:
           # evaluation.evaluateModel(algo, [String('-t ' + training_filename), String('-c 1')])
           # print evaluation.toSummaryString()
            rand = Random(1)
            evaluation.crossValidateModel(algo, training_data, 4, rand)
            if False:
                print 'percentage correct =', evaluation.pctCorrect()
                print 'area under ROC =', evaluation.areaUnderROC(class_index)
                confusion_matrix = evaluation.confusionMatrix()
                for l in confusion_matrix:
                    print '** ', ','.join('%2d'%int(x) for x in l)

    if verbose:
        if do_model:
            print '--> Generated model:\n'
            print algo.toString()
        if do_eval:
            print '--> Evaluation:\n'
            print evaluation.toSummaryString()
        if do_predict:
            print '--> Predictions:\n'
            print buffer

    return {'model':str(algo), 'eval':str(evaluation.toSummaryString()), 'predict':str(buffer) }
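A hedged example of the cross-validation branch of this variant: passing None as test_filename runs the 4-fold cross-validation path; NaiveBayes is just one choice of classifier, the file name is a placeholder, and the module's misc helper and verbose flag are assumed to be available:

from weka.classifiers.bayes import NaiveBayes

out = runClassifierAlgo(NaiveBayes(), 0, "train.arff", None, False, True, False)
print out['eval']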
Example #21
    first parameter must be the ARFF file one wants to process with J48

Note: needs Weka 3.6.x to run (due to changes in the 
      weka.classifiers.Evaluation class)

"""

# check commandline parameters
if (not (len(sys.argv) == 2)):
    print "Usage: UsingJ48Ext.py <ARFF-file>"
    sys.exit()

# load data file
print "Loading data..."
file = FileReader(sys.argv[1])
data = Instances(file)

# set the class Index - the index of the dependent variable
data.setClassIndex(data.numAttributes() - 1)

# create the model
evaluation = Evaluation(data)
buffer = StringBuffer()  # buffer for the predictions
attRange = Range()  # no additional attributes output
outputDistribution = Boolean(False)  # we don't want distribution
j48 = J48()
j48.buildClassifier(data)  # only a trained classifier can be evaluated
evaluation.evaluateModel(j48, data, [buffer, attRange, outputDistribution])

# print out the built model
print "--> Generated model:\n"
Example #22
import weka.core.Instances as Instances
import weka.classifiers.trees.J48 as J48
import weka.classifiers.Evaluation as Evaluation
import weka.core.Range as Range
import weka.classifiers.functions.MultilayerPerceptron as MultilayerPerceptron
import weka.core.SerializationHelper as SerializationHelper

# check commandline parameters
if (not (len(sys.argv) == 3)):
    print "Usage: weka.py <ARFF-file>"
    sys.exit()

file = FileReader(sys.argv[1])
file2 = FileReader(sys.argv[2])
data = Instances(file)
test = Instances(file2)
data.setClassIndex(data.numAttributes() - 1)
test.setClassIndex(test.numAttributes() - 1)
evaluation = Evaluation(data)
buffer = StringBuffer()
attRange = Range()  # no additional attributes output
outputDistribution = Boolean(False)  # we don't want distribution
nn = MultilayerPerceptron()
nn.buildClassifier(data)  # only a trained classifier can be evaluated

#print evaluation.evaluateModel(nn, ['-t', sys.argv[1], '-T', sys.argv[2]])#;, [buffer, attRange, outputDistribution])
res = evaluation.evaluateModel(nn, test,
                               [buffer, attRange, outputDistribution])
f = open('predictions/' + data.relationName(), 'w')
for d in res:
Example #23
"""
Commandline parameter(s):

    first parameter must be the ARFF file

"""

# check commandline parameters
if (not (len(sys.argv) == 2)):
    print "Usage: supervised.py <ARFF-file>"
    sys.exit()

# load data file
print "Loading data..."
datafile = FileReader(sys.argv[1])
data = Instances(datafile)
rand = Random()              # seed from the system time
data.randomize(rand)         # randomize data with number generator

# open output files
bufsize=0
dataname = str(os.path.splitext(os.path.basename(sys.argv[1]))[0])

# loop for different amounts of data with fixed test set
datasize = data.numInstances()
limit = (datasize*2)/3   # loop until we use 2/3 data as training set
testset = Instances(data,limit,datasize-limit)   # create the test set from the last 1/3 of the data
testset.setClassIndex(testset.numAttributes() - 1)

saver = ArffSaver()
saver.setInstances(testset)
"""
Commandline parameter(s):

    first parameter must be the ARFF file

"""

# check commandline parameters
if (not (len(sys.argv) == 2)):
    print "Usage: supervised.py <ARFF-file>"
    sys.exit()

# load data file
print "Loading data..."
datafile = FileReader(sys.argv[1] + ".arff")
data = Instances(datafile)
rand = Random()  # seed from the system time
data.randomize(rand)  # randomize data with number generator

# open output files
bufsize = 0

datafile = "data/plot/" + str(os.path.splitext(os.path.basename(__file__))[0]) + "_" + \
   str(os.path.splitext(os.path.basename(sys.argv[1]))[0]) + "_rmse.csv"
file = open(datafile, 'w', bufsize)  # open a file for rmse data
file.write("epochs,le,lm,kd,ball,cover\n")

logfile = "logs/" + str(os.path.splitext(os.path.basename(__file__))[0]) + "_" + \
   str(os.path.splitext(os.path.basename(sys.argv[1]))[0]) + "_tunable.log"
log = open(logfile, 'w', bufsize)  # open general log file
Example #25
    percentiles = []

    for i in range(0, 100, step_size):
        index = int(n * i / 100)
        percentiles.append(thread_array[index])  # 0-99 percentile
    percentiles.append(thread_array[n - 1])  # 100th percentile

    f = open(''.join([directory, 'percentiles.txt']), 'w')
    f.write(str(percentiles))
    f.close()

    #print str(percentiles)

    ## Data Distribution testing/Training
    data = FileReader(data_file)
    data = Instances(data)
    data = Instances(data, 0, n - (n % folds))
    n = n - (n % folds)
    print data.numInstances()
    len_fold = int(math.floor(n / folds))
    folds_test = []
    folds_train = []
    for i in range(0, n + 1, len_fold)[:-1]:
        folds_test.append(Instances(data, i, len_fold))
        f = open(
            ''.join([
                directory, ''.join(['fold_test_',
                                    str(i / len_fold), '.arff'])
            ]), "w")
        f.write(str(folds_test[-1]))
        f.close()
Example #26
    print "Usage: supervised.py <ARFF-file>"
    sys.exit()
crossvalidate = sys.argv[2]
rand = Random()  # seed from the system time

# load properties
p = Properties()
p.load(open('./ml.properties'))

# load data file
print "Loading data..."
trainfile = FileReader(sys.argv[1] + "-train.arff")
print "Loading " + sys.argv[1] + "-train.arff"
testfile = FileReader(sys.argv[1] + "-test.arff")
print "Loading " + sys.argv[1] + "-test.arff"
fulltrainset = Instances(trainfile)
fulltrainset.setClassIndex(fulltrainset.numAttributes() - 1)
testset = Instances(testfile)
testset.setClassIndex(testset.numAttributes() - 1)

# open output files
bufsize = 0
classifiername = str(os.path.splitext(os.path.basename(__file__))[0])
dataname = str(os.path.splitext(os.path.basename(sys.argv[1]))[0])
datafilelimit = "data/plot/" + classifiername + "_" + dataname + crossvalidate + "_instances.csv"
filelimit = open(datafilelimit, 'w', bufsize)
filelimit.write(
    "instances,lineartest,lineartrain,polytest,polytrain,radialtest,radialtrain,sigmoidtest,sigmoidtrain\n"
)
logfile = "logs/" + classifiername + "_" + dataname + crossvalidate + ".log"
log = open(logfile, 'w', bufsize)  # open general log file
Example #27
import weka.core.Instances as Instances
import weka.classifiers.trees.J48 as J48
import weka.classifiers.Evaluation as Evaluation
import weka.core.Range as Range
import weka.classifiers.functions.MultilayerPerceptron as MultilayerPerceptron
import weka.core.SerializationHelper as SerializationHelper

# check commandline parameters
if (not (len(sys.argv) == 3)):
    print "Usage: weka.py <ARFF-file>"
    sys.exit()

file = FileReader(sys.argv[1])
file2 = FileReader(sys.argv[2])
data = Instances(file)
test = Instances(file2)
data.setClassIndex(data.numAttributes() - 1)
test.setClassIndex(test.numAttributes() - 1)
evaluation = Evaluation(data)
buffer = StringBuffer()
attRange = Range()  # no additional attributes output
outputDistribution = Boolean(False)  # we don't want distribution
nn = MultilayerPerceptron()
nn.buildClassifier(data)  # only a trained classifier can be evaluated

#print evaluation.evaluateModel(nn, ['-t', sys.argv[1], '-T', sys.argv[2]])#;, [buffer, attRange, outputDistribution])
res = evaluation.evaluateModel(nn, test, [buffer, attRange, outputDistribution])
f = open('predictions/' + data.relationName(), 'w')
for d in res:
	f.write(str(d) + '\n')
Example #28
# Define a WEKA Attribute for each feature (each op, 14 total plus the class)
attribute_names = ["attr-%i" % (i + 1) for i in xrange(14)]
attributes = ArrayList()
for name in attribute_names:
    attributes.add(Attribute(name))
# Add an attribute at the end for the classification classes
attributes.add(
    Attribute("class",
              ["membrane", "mit-boundary", "mit-inside", "cytoplasm"]))

# Create the training data structure
# which consists of 16 samples for each membrane training image rotation
# and 4 samples for each mitochondrial boundary image rotation
# and times 2 to then add examples of the other, non-membrane class
training_data = Instances(
    "training", attributes,
    (len(synth_imgs_membrane) * 16 + len(synth_imgs_mit_boundary) * 4) * 2)
training_data.setClassIndex(len(attributes) - 1)


def populateInstances(instances, synth_imgs, class_index, mins, maxs):
    # Populate the training data: create the filter bank for each feature image
    # by reading values from the interval defined by mins and maxs
    target = ArrayImgs.floats([width, height])
    interval = FinalInterval(mins, maxs)
    n_samples = Intervals.numElements(interval)
    for img in synth_imgs:
        vectors = [zeros(len(attributes), 'd') for _ in xrange(n_samples)]
        for k, op in enumerate(filterBank(img, sumType=DoubleType())):
            imgOp = compute(op).into(target)
            for i, v in enumerate(Views.interval(imgOp, interval)):
Example #29
    def buildClassifier(self, instances):
        """
        builds the ZeroR classifier with the given data
        
        Parameter(s):
        
            'instances' -- the data to build the classifier from
        """
        
        self.getCapabilities().testWithFail(instances)
    
        # remove instances with missing class
        instances = Instances(instances)
        instances.deleteWithMissingClass()
        
        sumOfWeights      = 0
        self.__Class      = instances.classAttribute()
        self.__ClassValue = 0
        self.__Counts     = None
        
        if (instances.classAttribute().isNumeric()):
            self.__Counts = None
        elif (instances.classAttribute().isNominal()):
            self.__Counts = jarray.zeros(instances.numClasses(), 'd')
            for i in range(len(self.__Counts)):
                self.__Counts[i] = 1
            sumOfWeights = instances.numClasses()

        enu = instances.enumerateInstances()
        while (enu.hasMoreElements()):
            instance = enu.nextElement()
            if (not instance.classIsMissing()):
                if (instances.classAttribute().isNominal()):
                    self.__Counts[int(instance.classValue())] += instance.weight()
                else:
                    self.__ClassValue += instance.weight() * instance.classValue()
                sumOfWeights += instance.weight()
            
        if (instances.classAttribute().isNumeric()):
            if (Utils.gr(sumOfWeights, 0)):
                self.__ClassValue /= sumOfWeights
        else:
            self.__ClassValue = Utils.maxIndex(self.__Counts)
            Utils.normalize(self.__Counts, sumOfWeights)
                    
        return
		elif o in ("-o","--outputArff"):
			outputFn = a
			numReqOpt = numReqOpt + 1
    		else:
      			assert False, "unhandled option"

  	if (numReqOpt < 2):
    		usage()
    		return 1


        options = {'idFlag':True, 'weightFlag': False, 'rmClassFlag': False, 'rmClass': 0}
        # read the first dataset
        fn = inputList[0]
        fid = FileReader(fn)
	Data = Instances(fid)
        Data, IDs = PreprocessData(Data,options)
        # remove class label
        attributeremove = AttributeRemove()
        attributeremove.setInvertSelection(Boolean(False))  # remove class labels from dataset
        attributeremove.setAttributeIndices(String(str(Data.numAttributes())))
        attributeremove.setInputFormat(Data)
        newData = Filter.useFilter(Data, attributeremove)
        # loop over input arff file
        cnt = Data.numAttributes() 
        for fnCnt in range(1,len(inputList)):
             fn = inputList[fnCnt]
             fid = FileReader(fn)
	     Data = Instances(fid)
             Data, IDs = PreprocessData(Data,options)
             # remove class label
Example #31
"""
Commandline parameter(s):

    first parameter must be the ARFF file

"""

# check commandline parameters
if (not (len(sys.argv) == 2)):
    print "Usage: supervised.py <ARFF-file>"
    sys.exit()

# load data file
print "Loading data..."
datafile = FileReader(sys.argv[1])
data = Instances(datafile)
rand = Random()  # seed from the system time
data.randomize(rand)  # randomize data with number generator

# open output files
bufsize = 0
dataname = str(os.path.splitext(os.path.basename(sys.argv[1]))[0])

# loop for different amounts of data with fixed test set
datasize = data.numInstances()
limit = (datasize * 2) / 3  # loop until we use 2/3 data as training set
testset = Instances(data, limit, datasize - limit)  # create the test set from the last 1/3 of the data
testset.setClassIndex(testset.numAttributes() - 1)

saver = ArffSaver()
Example #32
	## Data Distribution testing/Training
	data = FileReader(data_file)
	data = Instances(data)
	data = Instances(data, 0, n - (n % folds))
	n = n - (n % folds)
	print data.numInstances()
	len_fold = int(math.floor(n / folds))
	folds_test = []
	folds_train = []
	for i in range(0,n+1,len_fold)[:-1]:
		folds_test.append(Instances(data,i,len_fold))
		f = open(''.join([directory, 'fold_test_', str(i / len_fold), '.arff']), "w")
		f.write(str(folds_test[-1]))
		f.close()
		temp = Instances(data, 0, n)
		# deleting index i len_fold times removes instances i .. i+len_fold-1,
		# because each delete shifts the remaining instances down by one
		for j in range(i, i + len_fold, 1):
			temp.delete(i)
		folds_train.append(temp)
		f = open(''.join([directory, 'fold_train_', str(i / len_fold), '.arff']), "w")
		f.write(str(folds_train[-1]))
		f.close()


	## Prediction
	buffers = [] ## List of per fold predictions
	weights = [] ## List of per fold weights per attribute


	for fld in range(0,folds):
		train =  folds_train[fld]