def printAsOriginal(self, out, int):
    """Print the selected headers, the ``@data`` tag and every instance.

    Args:
        out: Writer forwarded to each instance's ``printAsOriginal``. In
            static mode the headers are also written through ``out.println``.
        int: Header selector: 1 prints the inputs header, 2 the outputs
            header, 3 both. The name shadows the builtin but is kept so
            existing positional/keyword callers keep working.
    """
    # The selector used to be read only from ``self.printInOut``, which left
    # the argument unused. Keep honouring the attribute when it exists and
    # fall back to the argument otherwise.
    selector = getattr(self, "printInOut", int)
    show_inputs = selector in (1, 3)
    show_outputs = selector in (2, 3)
    use_own = self.storeAttributesAsNonStatic and self.attributes is not None

    if use_own:
        if show_inputs:
            print(self.attributes.getInputHeader())
        if show_outputs:
            print(self.attributes.getOutputHeader())
    else:
        # NOTE(review): this branch writes through ``out.println`` while the
        # branch above uses print(); confirm which sink is intended.
        # Attributes methods are invoked with the class as receiver, as the
        # other static calls in this file do.
        if show_inputs:
            out.println(Attributes.getInputHeader(Attributes))
        if show_outputs:
            out.println(Attributes.getOutputHeader(Attributes))

    print("@data")
    for instance in self.instanceSet:
        print()
        if use_own:
            instance.printAsOriginal(self.attributes, out)
        else:
            instance.printAsOriginal(out)
def getNewHeader(self):
    """Build a header string from the current attribute definitions.

    The result contains, one per line: the ``@relation`` line, every input
    attribute, the first output attribute, the inputs header and the
    outputs header.

    Returns:
        str: The assembled header, newline-terminated.
    """
    use_own = self.storeAttributesAsNonStatic and self.attributes is not None

    # Relation name and input attributes.
    if use_own:
        header = "@relation " + self.attributes.getRelationName() + "\n"
        # Called without an extra argument, like getOutputAttributes() below.
        input_attrs = self.attributes.getInputAttributes()
    else:
        header = "@relation " + Attributes.getRelationName(Attributes) + "\n"
        input_attrs = Attributes.getInputAttributes(Attributes)

    # Iterate directly: Python sequences have no ``.length`` attribute.
    for attr in input_attrs:
        header += attr.toString() + "\n"

    # First output attribute, then the @inputs / @outputs lines.
    if use_own:
        output_attrs = self.attributes.getOutputAttributes()
        header += output_attrs[0].toString() + "\n"
        header += self.attributes.getInputHeader() + "\n"
        header += self.attributes.getOutputHeader() + "\n"
    else:
        output_attrs = Attributes.getOutputAttributes(Attributes)
        # NOTE(review): this branch uses str() where the others use
        # toString(); kept as-is, confirm both render the same text.
        header += str(output_attrs[0]) + "\n"
        header += Attributes.getInputHeader(Attributes) + "\n"
        header += Attributes.getOutputHeader(Attributes) + "\n"
    return header
def get_ranges(self):
    """Return a ``[min, max]`` pair for every variable.

    Rows ``0 .. get_ninputs() - 1`` describe the input attributes; the last
    row (index ``get_nvars() - 1``) describes output attribute 0. An input
    with nominal values gets ``[0.0, number_of_values - 1]``; any other
    input gets its stored minimum and maximum.
    """
    var_count = self.get_nvars()
    bounds = [[0.0, 0.0] for _ in range(var_count)]

    for idx in range(self.get_ninputs()):
        attribute = Attributes.getInputAttribute(Attributes, idx)
        nominal_count = attribute.getNumNominalValues()
        row = bounds[idx]
        if nominal_count > 0:
            row[0] = 0.0
            row[1] = nominal_count - 1
        else:
            row[0] = attribute.getMinAttribute()
            row[1] = attribute.getMaxAttribute()

    output_attribute = Attributes.getOutputAttribute(Attributes, 0)
    last_row = bounds[var_count - 1]
    last_row[0] = output_attribute.getMinAttribute()
    last_row[1] = output_attribute.getMaxAttribute()
    return bounds
def returnRanks(self):
    """Return a ``[min, max]`` pair per variable, tracing every step to stdout.

    Inputs occupy the first ``getnInputs()`` rows; the last row
    (``getnVars() - 1``) holds the bounds of output attribute 0. Nominal
    inputs are ranged ``[0.0, number_of_values - 1]``.
    """
    print(f"self.getnVars(){self.getnVars()!s}")
    bounds = [[0.0, 0.0] for _ in range(self.getnVars())]
    print(
        "rangos has two dimensions, first is self.getnVars()=="
        f"{self.getnVars()!s},second is 2"
    )

    for idx in range(self.getnInputs()):
        print(f"self.getnInputs(){self.getnInputs()!s} i = {idx!s}")
        attribute = Attributes.getInputAttribute(Attributes, idx)
        print(
            "attHere.getNumNominalValues()== "
            f"{attribute.getNumNominalValues()!s}"
        )
        row = bounds[idx]
        if attribute.getNumNominalValues() > 0:
            row[0] = 0.0
            row[1] = attribute.getNumNominalValues() - 1
            print(
                f" attHere.getNumNominalValues() > 0,rangos[{idx!s}][0]=="
                f"{row[0]!s},rangos[i][1]== {row[1]!s}"
            )
        else:
            row[0] = attribute.getMinAttribute()
            row[1] = attribute.getMaxAttribute()
            print(
                f" attHere.getNumNominalValues() <= 0, rangos[{idx!s}][0]=="
                f"{row[0]!s},rangos[i][1]== {row[1]!s}"
            )

    output_attribute = Attributes.getOutputAttribute(Attributes, 0)
    print(f"self.getnVars() -1{self.getnVars() - 1!s}")
    bounds[self.getnVars() - 1][0] = output_attribute.getMinAttribute()
    print(
        " rangos[self.getnVars() -1][0] "
        f"{bounds[self.getnVars() - 1][0]!s}"
    )
    bounds[self.getnVars() - 1][1] = output_attribute.getMaxAttribute()
    print(
        " rangos[self.getnVars() -1][1] "
        f"{bounds[self.getnVars() - 1][1]!s}"
    )
    return bounds
def get_granularity_zone_ranges(self, data_set_x_array):
    """Return a ``[min, max]`` pair per variable for a granularity zone.

    Nominal inputs are ranged ``[0.0, number_of_values - 1]``. Every other
    input asks its attribute for the granularity minimum/maximum, passing
    ``data_set_x_array`` and the input index. The last row
    (``get_nvars() - 1``) holds the stored bounds of output attribute 0.

    Args:
        data_set_x_array: Data forwarded unchanged to
            ``get_min_granularity_attribute`` / ``get_max_granularity_attribute``.
    """
    var_count = self.get_nvars()
    bounds = [[0.0, 0.0] for _ in range(var_count)]

    for idx in range(self.getn_inputs()):
        attribute = Attributes.getInputAttribute(Attributes, idx)
        nominal_count = attribute.getNumNominalValues()
        row = bounds[idx]
        if nominal_count > 0:
            row[0] = 0.0
            row[1] = nominal_count - 1
            continue
        row[0] = attribute.get_min_granularity_attribute(data_set_x_array, idx)
        row[1] = attribute.get_max_granularity_attribute(data_set_x_array, idx)

    output_attribute = Attributes.getOutputAttribute(Attributes, 0)
    output_min = output_attribute.getMinAttribute()
    output_max = output_attribute.getMaxAttribute()
    last_row = bounds[var_count - 1]
    last_row[0] = output_min
    last_row[1] = output_max
    return bounds
def insertInputOutput(self, line, lineCount, collection, type, isTrain):
    """Add the attribute names listed in ``line`` to ``collection``.

    ``line`` is split on commas. A name known to ``Attributes`` is appended
    to ``collection`` unless already present; an unknown name is reported
    to ``InstanceSet.errorLogger`` and skipped.

    Args:
        line: Comma-separated attribute names.
        lineCount: Line number recorded in the error entry.
        collection: List of names, extended in place.
        type: Text inserted after ``@`` in the error message.
        isTrain: Forwarded to ``ErrorInfo``.
    """
    for raw_name in line.split(","):
        name = str(raw_name.strip())
        known = Attributes.getAttributeByName(Attributes, name)
        # Lookup retained from the original code; its result is not used.
        Attributes.getAttributes(Attributes)

        if known is not None:
            if name not in collection:
                collection.append(name)
            continue

        # The attribute was never declared: log it and move on.
        message = ("The attribute " + name + " defined in @" + type +
                   " in test, it has not been defined in @inputs in its train dataset. It will be ignored")
        problem = ErrorInfo(ErrorInfo.InputTestAttributeNotDefined, 0,
                            lineCount, 0, 0, isTrain, message)
        InstanceSet.errorLogger.setError(problem)
def get_type(self, variable):
    """Map the type of the attribute at ``variable`` to this object's constants.

    Args:
        variable: Position passed to ``self.attributes.getAttributeByPos``.

    Returns:
        ``self.INTEGER``, ``self.REAL`` or ``self.NOMINAL`` when the
        attribute type matches the corresponding constant, otherwise 0.
    """
    kind = self.attributes.getAttributeByPos(variable).getType()
    # The type constants are read from attribute 0. The class is passed as
    # receiver, matching the other Attributes calls in this file; the
    # previous call omitted it.
    reference = Attributes.getAttributeByPos(Attributes, 0)
    if kind == reference.INTEGER:
        return self.INTEGER
    if kind == reference.REAL:
        return self.REAL
    if kind == reference.NOMINAL:
        return self.NOMINAL
    return 0
def getClasses(self):
    """Return the nominal value of output attribute 0 for each class index.

    The list has ``self.__nClasses`` entries; the count is also printed.
    """
    total = self.__nClasses
    print(" getClasses,self.__nClasses: " + str(total))
    return [
        Attributes.getOutputAttribute(Attributes, 0).getNominalValue(index)
        for index in range(total)
    ]
def printInsSet(self):
    """Print the attribute definitions followed by every stored instance.

    When the attributes are stored on this object they are used for both
    parts; otherwise the shared ``Attributes`` definitions are printed and
    each instance prints itself without an attributes argument.
    """
    use_own = self.storeAttributesAsNonStatic and self.attributes is not None

    print("------------- ATTRIBUTES --------------")
    if use_own:
        self.attributes.printAttributes()
    else:
        # Class passed as receiver, as the other Attributes calls here do.
        Attributes.printAttributes(Attributes)

    print("-------------- INSTANCES --------------")
    # enumerate() replaces the ``.length`` lookup, which Python lists lack.
    for position, instance in enumerate(self.instanceSet):
        print("\n> Instance " + str(position) + ":")
        if use_own:
            instance.printInsSet(self.attributes)
        else:
            instance.printInsSet()
def get_classes(self):
    """Return the nominal value of output attribute 0 for each class index.

    The list has ``self.nclasses`` entries, in index order.
    """
    return [
        Attributes.getOutputAttribute(Attributes, 0).getNominalValue(index)
        for index in range(self.nclasses)
    ]
def getOriginalHeaderWithoutInOut(self):
    """Return the ``@relation`` line plus one line per attribute.

    No ``@inputs`` / ``@outputs`` lines are included. Attributes come from
    this object when they are stored on it, otherwise from ``Attributes``.

    Returns:
        str: The header text, newline-terminated.
    """
    if self.storeAttributesAsNonStatic and self.attributes is not None:
        relation = self.attributes.getRelationName()
        attrs = self.attributes.getAttributes()
    else:
        # Class passed as receiver, matching the other Attributes calls in
        # this file; the previous calls omitted it.
        relation = Attributes.getRelationName(Attributes)
        attrs = Attributes.getAttributes(Attributes)

    lines = ["@relation " + relation]
    lines.extend(str(attr) for attr in attrs)
    return "\n".join(lines) + "\n"
def processInputsAndOutputs(self, isTrain, inputsDef, outputsDef,
                            outputAttrNames, inputAttrNames):
    """Derive the missing input/output name lists and register them.

    Only acts on train sets. Whichever of ``@inputs`` / ``@outputs`` was not
    declared is inferred from the other; when neither was declared the last
    attribute becomes the output. ``self.outputInfered`` is set to True
    whenever the outputs had to be inferred.

    Args:
        isTrain: Nothing beyond resetting ``outputInfered`` happens when falsy.
        inputsDef: Whether an ``@inputs`` line was declared.
        outputsDef: Whether an ``@outputs`` line was declared.
        outputAttrNames: Declared output names; appended to in place when
            the output is inferred as the last attribute.
        inputAttrNames: Declared input names.
    """
    print("Processing inputs and outputs")
    self.outputInfered = False  # default value
    if not isTrain:
        return

    if not inputsDef and not outputsDef:
        # Nothing declared: the last attribute is taken as the output.
        # The Attributes class is the receiver here, like every other call
        # in this function (``self`` used to be passed by mistake).
        last_pos = Attributes.getNumAttributes(Attributes) - 1
        outputAttrNames.append(
            Attributes.getAttributeByPos(Attributes, last_pos).getName())
        inputAttrNames = Attributes.getAttributesExcept(
            Attributes, outputAttrNames)
        self.outputInfered = True
    elif not inputsDef:
        # Only outputs declared: every other attribute is an input.
        inputAttrNames = Attributes.getAttributesExcept(
            Attributes, outputAttrNames)
    elif not outputsDef:
        # Only inputs declared: every other attribute is an output.
        outputAttrNames = Attributes.getAttributesExcept(
            Attributes, inputAttrNames)
        self.outputInfered = True

    Attributes.setOutputInputAttributes(Attributes, inputAttrNames,
                                        outputAttrNames)
def copy_header(self):
    """Return the dataset header rebuilt from ``Attributes``.

    The pieces are concatenated in this order: ``@relation`` line, input
    attributes header, output attributes header, inputs header, outputs
    header and the ``@data`` line.
    """
    pieces = [
        "@relation " + Attributes.getRelationName(Attributes) + "\n",
        Attributes.getInputAttributesHeader(Attributes),
        Attributes.getOutputAttributesHeader(Attributes),
        Attributes.getInputHeader(Attributes) + "\n",
        Attributes.getOutputHeader(Attributes) + "\n",
        "@data\n",
    ]
    return "".join(pieces)
def removeAttribute(self, tSet, inputAtt, whichAtt):
    """Remove one attribute from the definitions and from every instance.

    Args:
        tSet: Optional second set (or None) updated alongside this one.
        inputAtt: Truthy to remove an input attribute, falsy for an output.
        whichAtt: Position of the attribute among the inputs/outputs.

    Returns:
        bool: False when any attribute-definition removal reports failure
        (the instances are then left untouched), True otherwise.
    """
    use_own = self.storeAttributesAsNonStatic and self.attributes is not None

    # Grab the attribute object before it disappears from the definitions.
    if use_own:
        if inputAtt:
            attToDel = self.attributes.getInputAttribute(whichAtt)
        else:
            attToDel = self.attributes.getOutputAttribute(whichAtt)
    elif inputAtt:
        attToDel = Attributes.getInputAttribute(Attributes, whichAtt)
    else:
        attToDel = Attributes.getOutputAttribute(Attributes, whichAtt)

    if use_own:
        print("Removing the attribute")
        if not self.attributes.removeAttribute(inputAtt, whichAtt):
            return False
        # Previously written as ``(tSet != None and ...) == False``, which
        # evaluated to True -- and returned False -- whenever tSet was None.
        if tSet is not None and not tSet.attributes.removeAttribute(
                inputAtt, whichAtt):
            return False
    elif not Attributes.removeAttribute(Attributes, inputAtt, whichAtt):
        return False

    for instance in self.instanceSet:
        if use_own:
            instance.removeAttribute(self.attributes, attToDel, inputAtt,
                                     whichAtt)
        else:
            instance.removeAttribute(attToDel, inputAtt, whichAtt)

    if tSet is not None:
        # Iterate directly: Python lists have no ``.length`` attribute.
        for instance in tSet.instanceSet:
            if use_own:
                instance.removeAttribute(self.attributes, attToDel, inputAtt,
                                         whichAtt)
            else:
                instance.removeAttribute(attToDel, inputAtt, whichAtt)
    return True
def removeAttribute(self, tSet, inputAtt, whichAtt):
    """Remove one attribute from the definitions and from every instance.

    Args:
        tSet: Optional second set (or None) updated alongside this one.
        inputAtt: Truthy to remove an input attribute, falsy for an output.
        whichAtt: Position of the attribute among the inputs/outputs.

    Returns:
        bool: False when any attribute-definition removal reports failure
        (the instances are then left untouched), True otherwise.
    """
    use_own = self.storeAttributesAsNonStatic and self.attributes is not None

    # Keep a reference to the attribute before removing its definition.
    if use_own:
        getter = (self.attributes.getInputAttribute if inputAtt
                  else self.attributes.getOutputAttribute)
        attToDel = getter(whichAtt)
    else:
        # Class passed as receiver, like the other Attributes calls in this
        # file; the previous calls omitted it.
        getter = (Attributes.getInputAttribute if inputAtt
                  else Attributes.getOutputAttribute)
        attToDel = getter(Attributes, whichAtt)

    if use_own:
        print("Removing the attribute")
        if not self.attributes.removeAttribute(inputAtt, whichAtt):
            return False
        if tSet is not None and not tSet.attributes.removeAttribute(
                inputAtt, whichAtt):
            return False
    elif not Attributes.removeAttribute(Attributes, inputAtt, whichAtt):
        return False

    def strip_from(instances):
        # Drop the attribute from each instance in the given sequence.
        for instance in instances:
            if use_own:
                instance.removeAttribute(self.attributes, attToDel, inputAtt,
                                         whichAtt)
            else:
                instance.removeAttribute(attToDel, inputAtt, whichAtt)

    strip_from(self.instanceSet)
    if tSet is not None:
        # Plain iteration replaces ``tSet.instanceSet.length``, which raised
        # AttributeError on a Python list.
        strip_from(tSet.instanceSet)
    return True
def getType(self, variable):
    """Map the type of attribute ``variable`` to this object's constants.

    Args:
        variable: Identifier passed to ``Attributes.getAttribute``.

    Returns:
        ``self.INTEGER``, ``self.REAL`` or ``self.NOMINAL`` when the
        attribute type matches the corresponding constant, otherwise 0.
    """
    # The class is passed as receiver to both lookups; getAttribute used to
    # be called without it, unlike getAttributeByPos in the same expression.
    kind = Attributes.getAttribute(Attributes, variable).getType()
    reference = Attributes.getAttributeByPos(Attributes, 0)
    if kind == reference.INTEGER:
        return self.INTEGER
    if kind == reference.REAL:
        return self.REAL
    if kind == reference.NOMINAL:
        return self.NOMINAL
    return 0
def read_classification_set(self, dataset_file, train, file_path):
    """Load a classification dataset and fill this object's data tables.

    Reads the file through ``self.instance_set.read_set`` and then populates
    ``ndata``, ``ninputs``, ``nvars``, ``x_array``, ``missing_array``,
    ``nominal_array``, ``integer_array``, ``output_integer_array``,
    ``output_real_array``, ``output_array``, ``emax``, ``emin`` and
    ``nclasses``. ``compute_instances_per_class()`` is always called last.

    Args:
        dataset_file: Name forwarded to ``read_set``.
        train: Train/test flag forwarded to ``read_set``.
        file_path: Path forwarded to ``read_set``.

    Raises:
        SystemExit: With code 1 when the dataset declares more than one
            output attribute or none at all. Any other exception is caught
            and printed.
    """
    try:
        print("Inside read_classification_set, datasetFile :" +
              str(dataset_file))
        if self.instance_set is None:
            print("self.instance_set is Null")
        else:
            print("self.instance_set is not None, train = " + str(train))
            self.instance_set.read_set(dataset_file, train, file_path)
            print("begin getNumInstances ...... in read_classification_set ")
            self.ndata = self.instance_set.getNumInstances()
            print(
                "In readCread_classification_setlassificationSet , self.ndata is : "
                + str(self.ndata))
            self.ninputs = Attributes.getInputNumAttributes(Attributes)
            print("In read_classification_set , self.ninputs is : " +
                  str(self.ninputs))
            output_count = Attributes.getOutputNumAttributes(Attributes)
            self.nvars = self.ninputs + output_count
            print("In read_classification_set , self.nvars is : " +
                  str(self.nvars))

            # Exactly one output variable is supported. SystemExit is raised
            # directly instead of calling the interactive-only ``exit``.
            if output_count > 1:
                print(
                    "This algorithm can not process MIMO datasets !!! exit 1")
                raise SystemExit(1)
            if output_count < 1:
                print(
                    "This algorithm can not process datasets without outputs !!!!!!"
                )
                raise SystemExit(1)

            # Initialise our own tables.
            ndata_length = self.ndata
            ninput_length = self.ninputs
            print("nDataLength = " + str(ndata_length))
            self.x_array = [[0.0] * ninput_length
                            for _ in range(ndata_length)]
            self.missing_array = [[True] * ninput_length
                                  for _ in range(ndata_length)]
            self.nominal_array = [True] * ninput_length
            self.integer_array = [True] * ninput_length
            self.output_integer_array = [0] * ndata_length
            self.output_real_array = [0.0] * ndata_length
            self.output_array = [""] * ndata_length

            # Maximum and minimum of every input.
            self.emax = [0.0] * ninput_length
            self.emin = [0.0] * ninput_length
            for i in range(ninput_length):
                attribute = Attributes.getInputAttribute(Attributes, i)
                nominal_count = attribute.getNumNominalValues()
                if nominal_count > 0:
                    self.emin[i] = 0
                    # Reuse the attribute fetched above; the old code looked
                    # it up again without passing the class as receiver.
                    self.emax[i] = nominal_count - 1
                else:
                    # NOTE(review): bounds are read by overall position, not
                    # by input position -- confirm inputs always come first.
                    by_position = Attributes.getAttributeByPos(Attributes, i)
                    self.emax[i] = by_position.getMaxAttribute()
                    self.emin[i] = by_position.getMinAttribute()

                kind = attribute.getType()
                if kind == Attribute.NOMINAL:
                    self.nominal_array[i] = True
                    self.integer_array[i] = False
                elif kind == Attribute.INTEGER:
                    self.nominal_array[i] = False
                    self.integer_array[i] = True
                else:
                    self.nominal_array[i] = False
                    self.integer_array[i] = False

            # Copy every instance into the tables.
            self.nclasses = 0
            for i in range(ndata_length):
                inst = self.instance_set.getInstance(i)
                for j in range(ninput_length):
                    self.x_array[i][j] = \
                        self.instance_set.getInputNumericValue(i, j)
                    self.missing_array[i][j] = \
                        inst.getInputMissingValuesWithPos(j)
                    if self.missing_array[i][j]:
                        # Missing values are stored just below the minimum.
                        self.x_array[i][j] = self.emin[j] - 1

                # Datasets without outputs abort above, so an output value
                # is always read here.
                self.output_integer_array[i] = \
                    self.instance_set.getOutputNumericValue(i, 0)
                self.output_array[i] = \
                    self.instance_set.getOutputNominalValue(i, 0)
                if self.output_integer_array[i] > self.nclasses:
                    self.nclasses = self.output_integer_array[i]

            # The class count is the largest output value seen plus one.
            self.nclasses = self.nclasses + 1
            print('Number of classes=' + str(self.nclasses))
    except Exception as error:
        print(
            "read_classification_set: Exception in readSet, in read_classification_set:"
            + str(error))
    self.compute_instances_per_class()
def get_output_value(self, int_value):
    """Return the nominal value of output attribute 0 at index ``int_value``."""
    return Attributes.getOutputAttribute(Attributes, 0).getNominalValue(
        int_value)
class InstanceSet: # ///////////////////////////////////////////////////////////////////////////// # //////////////// ATTRIBUTES OF THE INSTANCESET CLASS //////////////////////// # ///////////////////////////////////////////////////////////////////////////// # Attribute where all the instances of the DB are stored. instanceSet = [] # String where the header of the file is stored. header = "" # String where only the attributes definition header is stored attHeader = "" # ''' # * Object that collects all the errors happened while reading the test and # * train datasets. # ''' errorLogger = FormatErrorKeeper() # This object contains the attributes definitions attributes = InstanceAttributes() # ''' # * It indicates if the attributes has not be stored as non-static, permiting # * the load of different datasets # ''' storeAttributesAsNonStatic = None # It indicates that the output attribute has been infered as the last one outputInfered = None # ///////////////////////////////////////////////////////////////////////////// # ///////////////// METHODS OF THE INSTANCESET CLASS ////////////////////////// # ///////////////////////////////////////////////////////////////////////////// # It instances a new instance of InstanceSet # data_folder = PureWindowsPath('C:/phd_experiments/threeAlgorithmsComparizasion/threeAlgorithmsComparizasion/ecoli') data_folder = None file_to_open = None data_lines = None # added by rui data_rows = None attributes_insance = None def __init__(self): # print("In __init__ method in InstanceSet.") self.storeAttributesAsNonStatic = False self.attributes = None self.attributes_insance = Attributes() def InstanceSetWithNonSAtrr(self, nonStaticAttributes): self.storeAttributesAsNonStatic = nonStaticAttributes # if ( storeAttributesAsNonStatic ) Attributes.clearAll(); self.attributes = None def InstanceSetWithIns(self, ins): self.instanceSet = ins.instanSet.copy() self.header = str(ins.header) self.attHeader = str(ins.attHeader) self.attributes = 
str(ins.attributes) self.storeAttributesAsNonStatic = ins.storeAttributesAsNonStatic # end InstanceSet # * InstanceSet # * # * This constructor permit define if the attribute's definition need to be # * stored as non-static (nonStaticAttributes = true). Otherwise, if # * nonStaticAttributes = false, using this constructor is equivalent to use # * the constructor by default. # * Creates a new InstanceSet with the header and Instances from the passed object # * It performs a deep (new allocated) copy. # * @param is Original InstanceSe # * setAttributesAsNonStatic # * # * It stores the static-defined attributes in the class Attributes as # * non static in the object attributes. After this it does not remove the # * static-definition of the Attributes; this is in that way to permit to # * call this functions for differents datasets from the same problem, such # * as, a train dataset and the correspondent test dataset. # */ def setAttributesAsNonStatic(self): self.attributes = InstanceAttributes() self.attributes.copyStaticAttributes() self.storeAttributesAsNonStatic = True # end setAttributesAsNonStatic # /** # * getAttributeDefinitions # * # * It does return the definition of the attibutes contained in the dataset. # * # * @return InstanceAttributes contains the attribute's definitions. def getAttributeDefinitions(self): return self.attributes # end InstanceAttributes # * This method reads all the information in a DB and load it to memory. # * @param fileName is the database file name. # * @param isTrain is a flag that indicate if the database is for a train or for a test. # * @throws DatasetException if there is any semantical error in the input file. 
# * @throws HeaderFormatException if there is any lexical or sintactical error in the # * header of the input file def read_set(self, fileName, isTrain, file_path): print("Before try in readSet of InstanceSet, file_path is :" + str(file_path) + ".") print("Opening the file in readSet of InstanceSet: " + str(fileName) + ".") try: # Parsing the header of the DB. errorLogger = FormatErrorKeeper() self.file_to_open = Path.cwd() / file_path / fileName # Declaring an instance parser print("In readSet,file_to_open is:" + str(self.file_to_open)) # to do The exception in init InstanceParserof InstanceParse is: can only concatenate str (not "WindowsPath") to str instance_parser = InstanceParser(self.file_to_open, isTrain) # Reading information in the header, i.e., @relation, @attribute, @inputs and @outputs # print("In readSet finished read file " + str(self.file_to_open)) self.parseHeader(instance_parser, isTrain) # print(" The number of output attributes is: " + str(Attributes.getOutputNumAttributes(Attributes))) # The attributes statistics are init if we are in train mode. print("In readSet, isTrain is " + str(isTrain)) if isTrain and self.attributes_insance.getOutputNumAttributes( ) == 1: print("Begin Attributes.initStatistics......") self.attributes_insance.initStatistics() # A temporal vector is used to store the instances read. print("Reading the data") tempSet = [] print("begin instance_parser.getLines()...... 
") new_data_lines = [] print("********* There are : " + str(len(self.data_lines)) + "In original Data lines ********* ") for line in self.data_lines: print(" The line is :" + line) if ("@relation" not in line) and ( "@attribute" not in line) and ("@inputs" not in line) and ( "@outputs" not in line) and ("@data" not in line): new_data_lines.append(line) print("********* There are : " + str(len(new_data_lines)) + " In new Data lines ********* ") for line in new_data_lines: if new_data_lines is not None: #print("Data line: " + str(line)) newInstance = Instance() #print("how many data already in the instanceSet: " + str(len(tempSet))) newInstance.setThreeParameters(line, isTrain, len(tempSet)) tempSet.append(newInstance) # The vector of instances is converted to an array of instances. sizeInstance = len(tempSet) print(" Number of instances read: " + str(sizeInstance)) self.instanceSet = [] for i in range(0, sizeInstance): self.instanceSet.append(tempSet[i]) print("After converting all instances") if self.errorLogger.getNumErrors() > 0: errorNumber = len(errorLogger.getAllErrors()) # print("There has been " + str(errorNumber) + "errors in the Dataset format.") for k in range(0, errorLogger.getNumErrors()): errorLogger.getError(k).printErrorInfo() # print("There has been " + errorLogger.getAllErrors().size() + " errors in the Dataset format", # errorLogger.getAllErrors()); # print("Finishing the statistics: (isTrain)" + str(isTrain) + ", (# out attributes)" + str(Attributes.getOutputNumAttributes(Attributes))) # # If being on a train dataset, the statistics are finished if isTrain and Attributes.getOutputNumAttributes(Attributes) == 1: Attributes.finishStatistics(Attributes) # # close the stream instance_parser.close() # print("File LOADED CORRECTLY!!") except Exception as e: print("Unexpected error in readSet of InstanceSet class :" + str(e)) # end of InstanceSet constructor. # * It reads the information in the header of the file. 
# * It reads relation's name, attributes' names, and inputs and outputs. # * # * @param parser is the parser of the data set # * @param isTrain is a boolean indicating if this is a train set (and so # * parameters information must be read) or a test set (parameters information # * has not to be read). # read set from data row array for granularity def read_set_from_data_row_array(self, data_raw_array, isTrain): # print("Before try in read_set_from_data_row_array of InstanceSet") try: # Parsing the header of the DB. errorLogger = FormatErrorKeeper() # Declaring an instance parser # to do The exception in init InstanceParserof InstanceParse is: can only concatenate str (not "WindowsPath") to str instance_parser = InstanceParser.init_for_granularity_parser( data_raw_array, isTrain) # Reading information in the header, i.e., @relation, @attribute, @inputs and @outputs # print("data_raw_array size" + str(len(data_raw_array))) self.parse_header_from_data_row_array(instance_parser, isTrain) # print(" The number of output attributes is: " + str(Attributes.getOutputNumAttributes(Attributes))) # The attributes statistics are init if we are in train mode. # print("In readSet, isTrain is " + str(isTrain)) if isTrain and Attributes.getOutputNumAttributes(Attributes) == 1: # print("Begin Attributes.initStatistics......") Attributes.initStatistics(Attributes) # A temporal vector is used to store the instances read. # print("Reading the data in read_set_from_data_row_array") tempSet = [] # print("begin instance_parser.getLines()...... 
") data_raw_array = self.data_rows new_data_rows = [] number_of_rows = len(data_raw_array) # print("********* There are : " + str(number_of_rows) + "In original Data rows ********* ") # print("********* There are : " + str(len(new_data_rows)) + " In new Data rows ********* ") for i in range(0, number_of_rows): if len(new_data_rows) != 0: # print("Data row: " + str(data_raw_array[i])) newInstance = Instance() # print("how many data already in the instanceSet: " + str(len(tempSet))) newInstance.set_three_parameters_for_granularity_rules( data_raw_array[i], isTrain, len(tempSet)) tempSet.append(newInstance) # The vector of instances is converted to an array of instances. sizeInstance = len(tempSet) # print(" Number of instances read: " + str(sizeInstance)) self.instanceSet = [] for i in range(0, sizeInstance): self.instanceSet.append(tempSet[i]) # print("After converting all instances") # System.out.println("The error logger has any error: "+errorLogger.getNumErrors()); if self.errorLogger.getNumErrors() > 0: errorNumber = len(errorLogger.getAllErrors()) # print("There has been " + str(errorNumber) + "errors in the Dataset format.") for k in range(0, errorLogger.getNumErrors()): errorLogger.getError(k).printErrorInfo() # print("There has been " + errorLogger.getAllErrors().size() + " errors in the Dataset format", # errorLogger.getAllErrors()); # print("Finishing the statistics: (isTrain)" + str(isTrain) + ", (# out attributes)" + str(Attributes.getOutputNumAttributes(Attributes))) # # If being on a train dataset, the statistics are finished if isTrain and Attributes.getOutputNumAttributes(Attributes) == 1: Attributes.finishStatistics(Attributes) # # close the stream instance_parser.close() # print("File LOADED CORRECTLY!!") except Exception as e: print("Unexpected error in readSet of InstanceSet class :" + str(e)) # end of InstanceSet constructor. def parseHeader(self, parser, isTrain): # 1. 
Declaration of variables inputAttrNames = [] outputAttrNames = [] inputsDef = False outputsDef = False self.header = "" attCount = 0 lineCount = 0 self.attHeader = None print( "Begin to call the InstanceParser.getLines(),parser.getLines(), in InstanceSet." ) lines = parser.getLines() self.data_lines = lines print( "************************Before for line in lines *************************" ) for line in lines: line = str(line).strip() print("In parseHeader method of InstanceSet, the line is:" + line) if line == "@data".lower(): break else: # print(" Line read: " + line + ".") lineCount = lineCount + 1 if "@relation" in line: if isTrain: relationName = str(line.replace("@relation", "")).strip() # print("set Relation name :" + str(relationName)) self.attributes_insance.setRelationName(relationName) elif "@attribute" in line: if isTrain: # print("Begin insertAttribute ......") self.insertAttribute(line) attCount = attCount + 1 elif "@inputs" in line: # print("@inputs in " + str(line)) self.attHeader = self.header inputsDef = True aux = line[8:] if isTrain: # print("Has @inputs, aux is :" + aux) self.insertInputOutput(aux, lineCount, inputAttrNames, "inputs", isTrain) elif "@outputs" in line: if self.attHeader is None: self.attHeader = self.header outputsDef = True # print("Defining the output in line :" + line) sub_line = line.split() # To get the output attribute name aux = sub_line[1] if isTrain: # print("Has @outputs, aux is :" + aux) self.insertInputOutput(aux, lineCount, outputAttrNames, "outputs", isTrain) # print("Size of the output is: " + str(len(outputAttrNames))) self.header += line + "\n" print( "************************After for line in lines.************************" ) if self.attHeader is None: self.attHeader = self.header self.processInputsAndOutputs(isTrain, inputsDef, outputsDef, outputAttrNames, inputAttrNames) # end headerParse # added by rui for granularity rules def parse_header_from_data_row_array(self, parser, isTrain): # 1. 
Declaration of variables inputAttrNames = [] outputAttrNames = [] inputsDef = False outputsDef = False self.header = "" attCount = 0 lineCount = 0 self.attHeader = None # print("Begin to call the InstanceParser.getLines(),parser.getLines(), in InstanceSet.") self.data_rows = parser.get_rows() # end parse_header_from_data_row_array def insertAttribute(self, line): # print("Insert attribute begin :") indexL = 0 indexR = 0 type = "" # Treating string and declaring a string tokenizer if "{" in line: token_str = "{" elif "[" in line: token_str = "[" token_withT = "\t" + token_str line = line.replace(token_str, token_withT) # print("token_double is:" + token_withT + ", line is :" + line) # System.out.println (" > Processing line: "+ line ); # st = line.split(" [{\t"); st = line.split( "\t" ) # first we need to split the attribute line into two part , attribute name and attribute values # Disregarding the first token. It is @attribute st[0] = st[0].replace("@attribute", "").strip() # delete @attribute # print("st[0] is:" + st[0]) first_part = st[0].split() at = Attribute() # print("Get type once get instance object, at.getType() = " + str(type_string)) at.setName(first_part[0]) print("att set name as first_part[0] is:" + first_part[0]) # # print( "Attribute name: "+ at.getName() ) # to get the class name values we need to split the second part of the attribute line, to get values of attribute # Next action depends on the type of attribute: continuous or nominal if len( st ) == 1: # Parsing a nominal attribute with no definition of values # print("Parsing nominal attribute without values: setType=0") # print("Get type =" + at.getType()) at.setType(Attribute.NOMINAL) elif "{" in line: # this because it is the class values line # print("Parsing nominal attribute with values: " + line) # print("Get type =" + at.getType()) # print("Before setType = 0") at.setType(Attribute.NOMINAL) # print("after setType= 0") at.setFixedBounds(True) indexL = line.index("{") + 1 # 
print("indexL: " + indexL ) indexR = line.index("}") # print("indexR: " + str(indexR)) print("indexL : " + str(indexL) + "indexR : " + str(indexR)) # print( "The Nominal values are: " + line[indexL: indexR]); lineSub = line[indexL:indexR] # print("The lineSub : " + lineSub) st2 = lineSub.split(",") for nominalStr in st2: at.addNominalValue(nominalStr.strip()) else: # Parsing an integer or real attType = first_part[1].lower() # print("attribute Name : " + str(first_part[0]) + ", attribute type = " + str(attType)) # System.out.println (" > Parsing "+ type + " attributes"); if attType == "integer": at.setType(Attribute.INTEGER) # print("set integer type") if attType == "real": at.setType(Attribute.REAL) # print("set real type") indexL = line.index("[") indexR = line.index("]") # print("indexL is: " + str(indexL) + " indexR: " + str(indexR)) if indexL != -1 and indexR != -1: # System.out.println ( " > The real values are: " + line.substring( indexL+1, indexR) ); lineSub = line[indexL + 1:indexR] # print("lineSub: " + lineSub) st2 = lineSub.split(",") # print("st2[0].strip() :" + st2[0]) # print("st2[1].strip() :" + st2[1]) minBound = float(st2[0].strip()) maxBound = float(st2[1].strip()) # print("Before at.setBounds(minBound, maxBound): ( " + str(minBound) + " , " + str(maxBound) + " )") at.setBounds(minBound, maxBound) # print("Before add attribute :::: ") self.attributes_insance.addAttribute(at) # print("insertAttribute is finished :::: ") # end insertAttribute def insertInputOutput(self, line, lineCount, collection, type, isTrain): # print(" processing insertInputOutput: " + line) # Declaring StringTokenizer st = line.split(",") for attName in st: attName = str(attName.strip()) # print("attrName: " + attName) attrItem = self.attributes_insance.getAttributeByName(attName) attributes = self.attributes_insance.getAttributes() # for att in attributes: # print("att name is :" + str(att.getName())) # print("numbers of items that attributes:"+str(len(attributes))) if 
attrItem is None: # print("Attributes.getAttribute == None") # If this attribute has not been declared, generate error er = ErrorInfo( ErrorInfo.InputTestAttributeNotDefined, 0, lineCount, 0, 0, isTrain, ("The attribute " + attName + " defined in @" + type + " in test, it has not been defined in @inputs in its train dataset. It will be ignored" )) InstanceSet.errorLogger.setError(er) else: # for itemCollection in collection: # print("Item in collection is " + itemCollection) # print("Attributes.getAttribute != None") # print(" > " + str(type) + " attribute considered: " + attName) if attName not in collection: # print("attName:" + attName + " is not in collection") collection.append(attName) # end insertInputOutput def processInputsAndOutputs(self, isTrain, inputsDef, outputsDef, outputAttrNames, inputAttrNames): # After parsing the header, the inputs and the outputs are prepared. print("Processing inputs and outputs") self.outputInfered = False # set default value if isTrain: print("isTrain == True") if not inputsDef and not outputsDef: # print("is neither inputAtt no outputAtt") posHere = self.attributes_insance.getNumAttributes() - 1 outputAttrNames.append( self.attributes_insance.getAttributeByPos( posHere).getName()) inputAttrNames = self.attributes_insance.getAttributesExcept( outputAttrNames) self.outputInfered = True elif not inputsDef and outputsDef: # print("inputsDef == False and outputsDef == True") inputAttrNames = self.attributes_insance.getAttributesExcept( outputAttrNames) elif inputsDef and not outputsDef: # print("inputsDef == True and outputsDef == False") outputAttrNames = self.attributes_insance.getAttributesExcept( inputAttrNames) self.outputInfered = True print("setOutputInputAttributes begin: ") self.attributes_insance.setOutputInputAttributes( inputAttrNames, outputAttrNames) # end of processInputsAndOutputs # ''' # * Test if the output attribute has been infered. # * @return True if the output attribute has been infered. False if not. 
# '''
def isOutputInfered(self):
    """Report whether the output attribute was inferred instead of declared.

    Returns:
        The current value of ``self.outputInfered``.
    """
    return self.outputInfered


def getNumInstances(self):
    """Return the number of instances stored in ``self.instanceSet``.

    Returns:
        ``len(self.instanceSet)``, or 0 when ``self.instanceSet`` is None.
    """
    if self.instanceSet is None:
        print("instanceSet is None !!!")
        return 0
    instanceNumber = len(self.instanceSet)
    print("instanceSet is not None, instanceNumber = " + str(instanceNumber))
    return instanceNumber


def getInstance(self, whichInstance):
    """Return the instance at position ``whichInstance``.

    Returns:
        The stored instance, or None when the position is out of range.
    """
    if whichInstance < 0 or whichInstance >= len(self.instanceSet):
        return None
    return self.instanceSet[whichInstance]


def getInstances(self):
    """Return the underlying instance list itself (not a copy)."""
    return self.instanceSet


def getInputNumericValue(self, whichInst, whichAttr):
    """Return the numeric value of an input attribute of one instance.

    Args:
        whichInst: position of the instance in ``self.instanceSet``.
        whichAttr: position of the input attribute.

    Returns:
        The value reported by the instance's ``getInputRealValues``; 0.0 if
        that call raises (the error is printed, not propagated).

    Raises:
        IndexError: if ``whichInst`` is outside the instance set.
    """
    instance_number = len(self.instanceSet)
    if whichInst < 0 or whichInst >= instance_number:
        # str() is required: concatenating the int directly raised TypeError
        # before the IndexError could even be built.
        raise IndexError("You are trying to access to " + str(whichInst) +
                         " instance and there are only " +
                         str(instance_number) + ".")
    instanceHere = self.instanceSet[whichInst]
    numericValue = 0.0
    try:
        numericValue = instanceHere.getInputRealValues(whichAttr)
    except Exception as error:
        # Deliberate best effort: keep the 0.0 default and report the problem.
        print("getInputRealValues has exception!! : " + str(error))
    return numericValue


def getOutputNumericValue(self, whichInst, whichAttr):
    """Return the numeric value of an output attribute of one instance.

    Args:
        whichInst: position of the instance in ``self.instanceSet``.
        whichAttr: position of the output attribute.

    Returns:
        The value reported by the instance's ``getOutputRealValues``.

    Raises:
        IndexError: if ``whichInst`` is outside the instance set.
    """
    instance_number = len(self.instanceSet)
    if whichInst < 0 or whichInst >= instance_number:
        raise IndexError("You are trying to access to " + str(whichInst) +
                         " instance and there are only " +
                         str(instance_number) + ".")
    return self.instanceSet[whichInst].getOutputRealValues(whichAttr)


def getInputNominalValue(self, whichInst, whichAttr):
    """Return the nominal value of an attribute of one instance.

    Args:
        whichInst: position of the instance in ``self.instanceSet``.
        whichAttr: position of the attribute.

    Returns:
        The value reported by the instance's ``getOutputNominalValues``.

    Raises:
        IndexError: if ``whichInst`` is outside the instance set.
    """
    instance_number = len(self.instanceSet)
    if whichInst < 0 or whichInst >= instance_number:
        raise IndexError("You are trying to access to " + str(whichInst) +
                         " instance and there are only " +
                         str(instance_number) + ".")
    # NOTE(review): this reads the *output* nominal values although the
    # method is named for inputs; it looks like a copy-paste slip, but the
    # input accessor's exact name is not visible here - confirm and switch.
    return self.instanceSet[whichInst].getOutputNominalValues(whichAttr)


# '''
# * Returns the value of a nominal output attribute of an instance in the
# * instanceSet.
# * @param whichInst is the position of the instance.
# * @param whichAttr is the position of the output attribute.
# * @return a String with the nominal value.
# * @throws ArrayIndexOutOfBoundsException If the index is out of the instance
# * set size.
# '''
def getOutputNominalValue(self, whichInst, whichAttr):
    """Return the nominal value of an output attribute of one instance.

    Args:
        whichInst: position of the instance in ``self.instanceSet``.
        whichAttr: position of the output attribute.

    Returns:
        The value reported by the instance's ``getOutputNominalValues``.

    Raises:
        IndexError: if ``whichInst`` is outside the instance set.
    """
    instance_number = len(self.instanceSet)
    if whichInst < 0 or whichInst >= instance_number:
        # The old message concatenated an int to a str, which raised
        # TypeError; raise the documented index error instead.
        raise IndexError("You are trying to access to " + str(whichInst) +
                         " instance and there are only " +
                         str(instance_number) + ".")
    return self.instanceSet[whichInst].getOutputNominalValues(whichAttr)


def removeInstance(self, instNum):
    """Remove the instance at position ``instNum`` from ``self.instanceSet``.

    Out-of-range positions (including negative ones) are ignored. The
    instance list is replaced by a new list; the old one is not mutated.

    Args:
        instNum: position of the instance to drop.
    """
    if instNum < 0 or instNum >= len(self.instanceSet):
        return
    # Build the reduced list directly; no placeholder objects are needed.
    self.instanceSet = [
        kept for position, kept in enumerate(self.instanceSet)
        if position != instNum
    ]


# '''
# * It does remove an attribute. To remove an attribute, the train and the
# * test sets have to be passed to maintain the coherence of the system.
# * Otherwise, only the attribute of the train set would be removed, leaving
# * inconsistent the instances of the test set, because of having one extra
# * attribute inexistent anymore.
# *
# * @param tSet is the test set.
# * @param inputAtt is a boolean that is true when the attribute that is
# * wanted to be removed is an input attribute.
# * @param whichAtt is a integer that indicate the position of the attribute
# * to be deleted.
# * @return a boolean indicating if the attribute has been deleted
# '''
def removeAttribute(self, tSet, inputAtt, whichAtt):
    """Remove one attribute from the definitions and from every instance.

    Args:
        tSet: companion (test) instance set to update as well, or None.
        inputAtt: True to remove an input attribute, False for an output one.
        whichAtt: position of the attribute within its input/output list.

    Returns:
        False if an attribute container refuses the removal, True otherwise.
    """
    use_own_attributes = (self.storeAttributesAsNonStatic
                          and self.attributes is not None)

    # Keep a reference to the attribute before its definition is removed;
    # the instances need it to drop their own values.
    if use_own_attributes:
        if inputAtt:
            attToDel = self.attributes.getInputAttribute(whichAtt)
        else:
            attToDel = self.attributes.getOutputAttribute(whichAtt)
        print("Removing the attribute")
        if not self.attributes.removeAttribute(inputAtt, whichAtt):
            return False
        if tSet is not None and not tSet.attributes.removeAttribute(
                inputAtt, whichAtt):
            return False
    else:
        # Class-level calls pass the class explicitly as receiver, as the
        # other Attributes calls in this file do.
        if inputAtt:
            attToDel = Attributes.getInputAttribute(Attributes, whichAtt)
        else:
            attToDel = Attributes.getOutputAttribute(Attributes, whichAtt)
        if not Attributes.removeAttribute(Attributes, inputAtt, whichAtt):
            return False

    def _strip(instances):
        # Remove the attribute's values from each instance of one set.
        for instance in instances:
            if use_own_attributes:
                instance.removeAttribute(self.attributes, attToDel, inputAtt,
                                         whichAtt)
            else:
                instance.removeAttribute(attToDel, inputAtt, whichAtt)

    _strip(self.instanceSet)
    if tSet is not None:
        # instanceSet is a Python list: iterate it instead of using .length
        _strip(tSet.instanceSet)
    return True


def getHeader(self):
    """Return the header text stored in ``self.header``."""
    return self.header


def setHeader(self, copia):
    """Store ``str(copia)`` as the header text."""
    self.header = str(copia)


def getAttHeader(self):
    """Return the attribute header stored in ``self.attHeader``."""
    return self.attHeader


def setAttHeader(self, copia):
    """Store ``str(copia)`` as the attribute header."""
    self.attHeader = str(copia)


# '''
# * It does return a new header (not necessary the same header as the
# * input file one).
# It only includes the valid attributes, those ones
# * defined in @inputs and @outputs (or taken as that role following the
# * keel format specification).
# * @return a String with the new header
# '''
def getNewHeader(self):
    """Build a header from the relation name, inputs and first output.

    The result is ``@relation <name>``, one line per input attribute, one
    line for the first output attribute, then the input and output header
    lines, each terminated by a newline.

    Returns:
        The assembled header string.
    """
    if self.storeAttributesAsNonStatic and self.attributes is not None:
        own = self.attributes
        relation = own.getRelationName()
        input_attrs = own.getInputAttributes()
        output_attrs = own.getOutputAttributes()
        first_output = output_attrs[0]
        input_header = own.getInputHeader()
        output_header = own.getOutputHeader()
    else:
        # Class-level calls pass the class explicitly as receiver, as the
        # other Attributes calls in this file do.
        relation = Attributes.getRelationName(Attributes)
        input_attrs = Attributes.getInputAttributes(Attributes)
        output_attrs = Attributes.getOutputAttributes(Attributes)
        first_output = output_attrs[0]
        input_header = Attributes.getInputHeader(Attributes)
        output_header = Attributes.getOutputHeader(Attributes)

    pieces = ["@relation " + relation]
    pieces.extend(str(attribute) for attribute in input_attrs)
    pieces.append(str(first_output))
    pieces.append(input_header)
    pieces.append(output_header)
    return "\n".join(pieces) + "\n"


def getOriginalHeaderWithoutInOut(self):
    """Build a header with the relation name and every attribute.

    No @inputs/@outputs lines are included.

    Returns:
        ``@relation <name>`` followed by one line per attribute.
    """
    if self.storeAttributesAsNonStatic and self.attributes is not None:
        relation = self.attributes.getRelationName()
        all_attrs = self.attributes.getAttributes()
    else:
        relation = Attributes.getRelationName(Attributes)
        all_attrs = Attributes.getAttributes(Attributes)
    return "".join(
        ["@relation " + relation + "\n"]
        + [str(attribute) + "\n" for attribute in all_attrs])


def printOut(self, out):
    """Print every instance, preceded by a ``> Instance <i>:`` line.

    Args:
        out: handed unchanged to each instance's ``printOut``.
    """
    use_own_attributes = (self.storeAttributesAsNonStatic
                          and self.attributes is not None)
    for position, instance in enumerate(self.instanceSet):
        # str() is required: the index is an int.
        print("> Instance " + str(position) + ":")
        if use_own_attributes:
            instance.printOut(self.attributes, out)
        else:
            instance.printOut(out)


def printAsOriginal(self, out, int):
    """Print the @inputs/@outputs lines, ``@data`` and every instance.

    Args:
        out: handed to each instance's ``printAsOriginal``; in the
            class-level branch it is also the target of ``println``.
        int: selector for the header lines: 1 prints @inputs, 2 prints
            @outputs, 3 prints both, anything else prints neither. (The
            name shadows the builtin but is kept for existing callers.)
    """
    # The original body read ``self.printInOut`` and ignored the argument.
    # Honour that attribute when it exists, otherwise use the argument.
    printInOut = getattr(self, "printInOut", int)
    show_inputs = printInOut in (1, 3)
    show_outputs = printInOut in (2, 3)

    if self.storeAttributesAsNonStatic and self.attributes is not None:
        if show_inputs:
            print(self.attributes.getInputHeader())
        if show_outputs:
            print(self.attributes.getOutputHeader())
    else:
        # NOTE(review): this branch needs ``out`` to expose a Java-style
        # println(); a plain Python file object does not - confirm callers.
        if show_inputs:
            out.println(Attributes.getInputHeader(Attributes))
        if show_outputs:
            out.println(Attributes.getOutputHeader(Attributes))

    print("@data")
    for instance in self.instanceSet:
        print()
        # Both original branches made this same call.
        instance.printAsOriginal(self.attributes, out)


def printInsSet(self):
    """Print the attribute definitions followed by every instance."""
    use_own_attributes = (self.storeAttributesAsNonStatic
                          and self.attributes is not None)
    print("------------- ATTRIBUTES --------------")
    if use_own_attributes:
        self.attributes.printAttributes()
    else:
        Attributes.printAttributes(Attributes)
    print("-------------- INSTANCES --------------")
    for position, instance in enumerate(self.instanceSet):
        print("\n> Instance " + str(position) + ":")
        if use_own_attributes:
            instance.printInsSet(self.attributes)
        else:
            instance.printInsSet()


def clearInstances(self):
    """Remove all instances by resetting ``self.instanceSet`` to None."""
    self.instanceSet = None


# '''
# * It adds the passed instance at the end of the present InstanceSet
# * @param inst
# the instance to be added
# '''
def addInstance(self, inst):
    """Append ``inst`` at the end of the instance set.

    ``self.instanceSet`` is replaced by a new list; when it was None the new
    list holds only ``inst``.

    Args:
        inst: the instance to add.
    """
    # The previous version reused the copy loop's index, so the new instance
    # overwrote the last existing one instead of occupying the extra slot.
    existing = [] if self.instanceSet is None else list(self.instanceSet)
    existing.append(inst)
    self.instanceSet = existing


def clearNonStaticAttributes(self):
    """Drop this set's own attribute container (``self.attributes``).

    Only the instance-level reference is reset; nothing else is touched.
    """
    self.attributes = None


def addAttribute(self, att):
    """Append ``att`` to this set's own attribute container.

    An ``InstanceAttributes`` container is created first when
    ``self.attributes`` is None.

    Args:
        att: the attribute to append.
    """
    if self.attributes is None:
        self.attributes = InstanceAttributes()
    self.attributes.addAttribute(att)
def has_numerical_attributes(self):
    """Tell whether integer or real attributes are reported.

    Returns:
        The result of ``Attributes.hasIntegerAttributes(self)`` when it is
        truthy, otherwise the result of ``Attributes.hasRealAttributes(self)``.
    """
    integer_answer = Attributes.hasIntegerAttributes(self)
    if integer_answer:
        return integer_answer
    return Attributes.hasRealAttributes(self)
def number_values(self, attribute):
    """Return the number of nominal values of an input attribute.

    Args:
        attribute: position of the input attribute.

    Returns:
        The value of ``getNumNominalValues()`` for that input attribute.
    """
    # The class is the receiver of getInputAttribute (as in the other
    # class-level Attributes calls in this file); getNumNominalValues takes
    # no argument. The original passed ``Attributes`` to the wrong call.
    input_attribute = Attributes.getInputAttribute(Attributes, attribute)
    return input_attribute.getNumNominalValues()
def read_set_from_data_row_array(self, data_raw_array, isTrain):
    """Load the instance set from an in-memory array of data rows.

    The header is parsed through ``parse_header_from_data_row_array`` (which
    stores the rows in ``self.data_rows``), one ``Instance`` is built per
    row, and the result is stored in ``self.instanceSet``. Any exception is
    caught and printed; nothing is raised to the caller.

    Args:
        data_raw_array: the rows handed to the granularity parser.
        isTrain: True when reading the training set; attribute statistics
            are started/finished only in that case and only when exactly
            one output attribute is defined.
    """
    try:
        instance_parser = InstanceParser.init_for_granularity_parser(
            data_raw_array, isTrain)
        try:
            self.parse_header_from_data_row_array(instance_parser, isTrain)

            if isTrain and Attributes.getOutputNumAttributes(Attributes) == 1:
                Attributes.initStatistics(Attributes)

            # The old loop was guarded by a list that was never filled, so
            # no instance was ever created; build one per stored row.
            data_rows = self.data_rows
            tempSet = []
            for row_index in range(len(data_rows)):
                newInstance = Instance()
                newInstance.set_three_parameters_for_granularity_rules(
                    data_rows[row_index], isTrain, len(tempSet))
                tempSet.append(newInstance)
            self.instanceSet = list(tempSet)

            # Report from the logger that is actually checked; the old code
            # looped over a freshly created, always-empty logger.
            error_total = self.errorLogger.getNumErrors()
            if error_total > 0:
                for k in range(error_total):
                    self.errorLogger.getError(k).printErrorInfo()

            if isTrain and Attributes.getOutputNumAttributes(Attributes) == 1:
                Attributes.finishStatistics(Attributes)
        finally:
            # Release the parser even when parsing fails part-way.
            instance_parser.close()
    except Exception as e:
        print("Unexpected error in readSet of InstanceSet class :" + str(e))
def has_real_attributes(self):
    """Return what ``Attributes.hasRealAttributes(self)`` reports."""
    real_answer = Attributes.hasRealAttributes(self)
    return real_answer
def parseHeader(self, parser, isTrain):
    """Parse the header lines supplied by ``parser``.

    Reads lines until the ``@data`` marker, accumulating them in
    ``self.header``. ``self.attHeader`` receives the header text seen before
    the first ``@inputs``/``@outputs`` line (or the whole header when there
    is none). In train mode the relation name, attributes, inputs and
    outputs are registered as they are met. All lines returned by the
    parser are kept in ``self.data_lines``.

    Args:
        parser: object whose ``getLines()`` returns every line of the file.
        isTrain: True when reading the training set.
    """
    inputAttrNames = []
    outputAttrNames = []
    inputsDef = False
    outputsDef = False
    self.header = ""
    lineCount = 0
    self.attHeader = None

    lines = parser.getLines()
    self.data_lines = lines

    for line in lines:
        line = str(line).strip()
        # Lower-case the line, not the literal, so "@DATA" also ends the
        # header.
        if line.lower() == "@data":
            break

        lineCount += 1
        if "@relation" in line:
            if isTrain:
                relationName = str(line.replace("@relation", "")).strip()
                Attributes.setRelationName(self, relationName)
        elif "@attribute" in line:
            if isTrain:
                self.insertAttribute(line)
        elif "@inputs" in line:
            self.attHeader = self.header
            inputsDef = True
            aux = line[8:]  # text after the "@inputs " prefix
            if isTrain:
                self.insertInputOutput(aux, lineCount, inputAttrNames,
                                       "inputs", isTrain)
        elif "@outputs" in line:
            if self.attHeader is None:
                self.attHeader = self.header
            outputsDef = True
            aux = line.split()[1]  # second whitespace-separated token
            if isTrain:
                self.insertInputOutput(aux, lineCount, outputAttrNames,
                                       "outputs", isTrain)
        self.header += line + "\n"

    if self.attHeader is None:
        self.attHeader = self.header
    self.processInputsAndOutputs(isTrain, inputsDef, outputsDef,
                                 outputAttrNames, inputAttrNames)
def get_names(self):
    """Return the names of the first ``self.ninputs`` input attributes.

    Returns:
        A list with ``getName()`` of each input attribute, in position order.
    """
    return [
        Attributes.getInputAttribute(Attributes, position).getName()
        for position in range(self.ninputs)
    ]
def readSet(self, fileName, isTrain, file_path):
    """Read a dataset file and load its instances into ``self.instanceSet``.

    The file is looked up as ``<file_path>/dataset/<fileName>``. The header
    is parsed with ``parseHeader`` and one ``Instance`` is created per
    remaining line. Any exception is caught and printed; nothing is raised
    to the caller.

    Args:
        fileName: name of the dataset file inside the ``dataset`` folder.
        isTrain: True when reading the training set; attribute statistics
            are started/finished only in that case and only when exactly
            one output attribute is defined.
        file_path: folder containing the ``dataset`` directory (a ``str``
            or any path-like object).
    """
    import os  # function-scope import; only this function builds paths

    print("Before try in readSet of InstanceSet, fileName is :" + str(fileName) + ".")
    print("Opening the file in readSet of InstanceSet: " + str(fileName) + ".")
    try:
        self.data_folder = file_path
        # os.path.join accepts Path objects via fspath and uses the
        # platform separator instead of a hard-coded backslash.
        self.file_to_open = os.path.join(os.fspath(file_path), "dataset",
                                         str(fileName))
        print("In readSet,file_to_open is:" + str(self.file_to_open))
        instance_parser = InstanceParser(self.file_to_open, isTrain)
        try:
            print("In readSet finished read file " + str(self.file_to_open))
            self.parseHeader(instance_parser, isTrain)
            print(" The number of output attributes is: " +
                  str(Attributes.getOutputNumAttributes(Attributes)))
            print("In readSet, isTrain is " + str(isTrain))
            if isTrain and Attributes.getOutputNumAttributes(Attributes) == 1:
                print("Begin Attributes.initStatistics......")
                Attributes.initStatistics(Attributes)

            print("Reading the data")
            print("begin instance_parser.getLines()...... ")
            lines = self.data_lines
            print("********* There are : " + str(len(lines)) +
                  "In original Data lines ********* ")
            # Every line that carries no header tag is treated as data.
            header_tags = ("@relation", "@attribute", "@inputs", "@outputs",
                           "@data")
            new_data_lines = [
                line for line in lines
                if not any(tag in line for tag in header_tags)
            ]

            tempSet = []
            for line in new_data_lines:
                print("Data line: " + str(line))
                newInstance = Instance()
                newInstance.setThreeParameters(line, isTrain, len(tempSet))
                tempSet.append(newInstance)
            self.instanceSet = list(tempSet)

            # Report from the logger that is actually checked; the old code
            # looped over a freshly created, always-empty logger.
            error_total = self.errorLogger.getNumErrors()
            if error_total > 0:
                for k in range(error_total):
                    self.errorLogger.getError(k).printErrorInfo()

            if isTrain and Attributes.getOutputNumAttributes(Attributes) == 1:
                Attributes.finishStatistics(Attributes)
        finally:
            # Release the parser even when parsing fails part-way.
            instance_parser.close()
    except Exception as e:
        print("Unexpected error in readSet of InstanceSet class :" + str(e))
def __init__(self):
    """Create the object with a fresh instance set and attribute holder."""
    fresh_instances = InstanceSet()
    self.instance_set = fresh_instances
    fresh_attributes = Attributes()
    self.attributes = fresh_attributes
class MyDataSet: # Number to represent type of variable real or double. REAL = 0 # *Number to represent type of variable integer.* INTEGER = 1 # *Number to represent type of variable nominal.* NOMINAL = 2 x_array = [] # examples array missing_array = [] # possible missing values output_integer_array = [ ] # output of the data - set as integer values private output_real_array = [] # output of the data - set as double values output_array = [] # output of the data - set as string values emax_array = [] # max value of an attribute private emin_array = [] # min value of an attribute ndata = None # Number of examples nvars = None # Numer of variables ninputs = None # Number of inputs nclasses = None # Number of outputs instance_set = None # The whole instance set stdev_array = [] average_array = [] # standard deviation and average of each attribute instances_cl = [] # nominal attributes bool array nominal_array = [] # integer attributes int array integer_array = [] frequent_class_array = [] attributes = None # *Init a new set of instances def __init__(self): self.instance_set = InstanceSet() self.attributes = Attributes() # ''' # * Outputs an array of examples with their corresponding attribute values. 
# * @return double[][] an array of examples with their corresponding attribute values # ''' def get_x(self): return self.x_array def set_x(self, x_parameter): self.x_array = x_parameter # ''' # * Output a specific example # * @param pos int position (id) of the example in the data-set # * @return double[] the attributes of the given example # ''' def get_example(self, pos): # # print(" In getExample, len(self.x_array) = " + str(len(self.x_array)) + ", pos = " + str( # pos) + " ," + "self.x_array[pos] ==" + str(self.x_array[pos])) return self.x_array[pos] # * Returns the output of the data-set as integer values # * @return int[] an array of integer values corresponding to the output values of the dataset def get_output_as_integer(self): size = len(self.output_integer_array) output = [0 for x in range(size)] for i in range(0, size): output[i] = self.output_integer_array[i] return output # * Returns the output of the data-set as real values # * @return double[] an array of real values corresponding to the output values of the dataset def get_output_as_real(self): output_length = len(self.output_real_array) output = [0.0 for x in range(output_length)] for i in range(0, len(self.output_real_array)): output[i] = self.output_integer_array[i] return output # * Returns the output of the data-set as nominal values # * @return String[] an array of nomianl values corresponding to the output values of the dataset # def get_output_as_string(self): output_length = len(self.output_array) output = ["" for x in range(output_length)] for i in range(0, output_length): output[i] = self.output_array[i] return output # * It returns the output value of the example "pos" # * @param pos int the position (id) of the example # * @return String a string containing the output value def get_output_as_string_with_pos(self, pos): # # print("pos is in getOutputAsStringWithPos "+str(pos)) # maybe the exception is here. 
return self.output_array[pos] # * It returns the output value of the example "pos" # * @param pos int the position (id) of the example # * @return int an integer containing the output value def get_output_as_integer_with_pos(self, pos): return self.output_integer_array[pos] def set_output_integer_array(self, integer_array): self.output_integer_array = integer_array def set_output_array(self, output_array): self.output_array = output_array # * It returns the output value of the example "pos" # * @param pos int the position (id) of the example # * @return double a real containing the output value def get_output_as_real_with_pos(self, pos): return self.output_real_array[pos] # *It returns an array with the maximum values of the attributes # * @ return double[] an array with the maximum values of the attributes # def get_emax(self): return self.emax_array # *It returns an array with the minimum values of the attributes # * @ return double[] an array with the minimum values of the attributes def get_emin(self): return self.emin_array # *It returns the maximum value of the given attribute # * # * @ param variable the index of the attribute # * @ return the maximum value of the given attribute def get_max(self, variable): return self.emax_array[variable] # *It returns the minimum value of the given attribute # # * @ param variable the index of the attribute # * @ return the minimum value of the given attribute def get_min(self, variable): return self.emin_array[variable] # *It gets the size of the data - set # * @ return int the number of examples in the data - set def get_ndata(self): return self.ndata def set_ndata(self, ndata): self.ndata = ndata # *It gets the number of variables of the data - set(including the output) # * @ return int the number of variables of the data - set(including the output) # modified at 2020-08-14 def get_nvars(self): return self.nvars # * It gets the number of input attributes of the data-set # * @return int the number of input attributes of 
the data-set def get_ninputs(self): return self.ninputs def set_ninputs(self, ninputs_value): self.ninputs = ninputs_value # * It gets the number of output attributes of the data-set (for example number of classes in classification) # * @return int the number of different output values of the data-set def get_nclasses(self): return self.nclasses def set_nclasses(self, nclasses_value): self.nclasses = nclasses_value # added by rui for granularity rule generation def calculate_nclasses_for_small_granularity_zone(self, output_integer_array): class_number = 0 class_array = [] has_class = False for i in range(0, len(output_integer_array)): # # print(" output_integer_array[i] " + str(output_integer_array[i])) if len(class_array) == 0: class_array.append(output_integer_array[i]) else: has_class = False for j in range(0, len(class_array)): if class_array[j] == output_integer_array[i]: # # print(" class_array[j] " + str(class_array[j])) has_class = True if not has_class: class_array.append(output_integer_array[i]) class_number = len(class_array) return class_number # * This function checks if the attribute value is missing # * @param i int Example id # * @param j int Variable id # * @return boolean True is the value is missing, else it returns false def is_missing(self, i, j): return self.missing_array[i][j] # * It reads the whole input data-set and it stores each example and its associated output value in # * local arrays to ease their use. 
# * @param datasetFile String name of the file containing the dataset
# * @param train boolean It must have the value "true" if we are reading the training data-set
# * @throws IOException If there occurs any problem with the reading of the data-set
def read_classification_set(self, dataset_file, train, file_path):
    """Load a classification data-set into the local arrays.

    When ``self.instance_set`` is set, the file is read through
    ``self.instance_set.read_set`` and the following attributes are filled:
    ``ndata``, ``ninputs``, ``nvars``, ``x_array``, ``missing_array``,
    ``nominal_array``, ``integer_array``, ``output_integer_array``,
    ``output_real_array`` (zero-filled only), ``output_array``, ``emax``,
    ``emin`` and ``nclasses``.  ``compute_instances_per_class`` is always
    called at the end, even when loading failed.

    The process exits with status 1 when the data-set does not declare
    exactly one output attribute.  Any other exception raised while loading
    is printed and swallowed.

    Args:
        dataset_file: forwarded to ``self.instance_set.read_set``.
        train: forwarded to ``self.instance_set.read_set``.
        file_path: forwarded to ``self.instance_set.read_set``.
    """
    # Function-scope import: the ``exit`` builtin is injected by the ``site``
    # module and is not guaranteed to exist; ``sys.exit`` always is.
    import sys

    try:
        # Load in memory a dataset that contains a classification problem
        print("Inside read_classification_set, datasetFile :" + str(dataset_file))
        if self.instance_set is None:
            print("self.instance_set is Null")
        else:
            print("self.instance_set is not None, train = " + str(train))
            self.instance_set.read_set(dataset_file, train, file_path)
            print("begin getNumInstances ...... in read_classification_set ")
            self.ndata = self.instance_set.getNumInstances()
            print("In read_classification_set , self.ndata is : " + str(self.ndata))
            self.ninputs = self.attributes.getInputNumAttributes()
            print("In read_classification_set , self.ninputs is : " + str(self.ninputs))
            n_outputs = self.attributes.getOutputNumAttributes()
            self.nvars = self.ninputs + n_outputs
            print("In read_classification_set , self.nvars is : " + str(self.nvars))

            # Exactly one output variable is supported.  SystemExit is not
            # an ``Exception`` subclass, so it is not caught below.
            if n_outputs > 1:
                print("This algorithm can not process MIMO datasets !!! exit 1")
                sys.exit(1)
            if n_outputs < 1:
                print("This algorithm can not process datasets without outputs !!!!!!")
                sys.exit(1)

            # Initialise our own tables
            ndata_length = self.ndata
            ninput_length = self.ninputs
            print("nDataLength = " + str(ndata_length))
            self.x_array = [[0.0 for _ in range(ninput_length)]
                            for _ in range(ndata_length)]
            self.missing_array = [[True for _ in range(ninput_length)]
                                  for _ in range(ndata_length)]
            self.nominal_array = [True for _ in range(ninput_length)]
            self.integer_array = [True for _ in range(ninput_length)]
            self.output_integer_array = [0 for _ in range(ndata_length)]
            self.output_real_array = [0.0 for _ in range(ndata_length)]
            self.output_array = ["" for _ in range(ndata_length)]

            # Maximum and minimum of inputs
            self.emax = [0.0 for _ in range(ninput_length)]
            self.emin = [0.0 for _ in range(ninput_length)]
            for i in range(ninput_length):
                attribute_instance = self.attributes.getInputAttribute(i)
                nominal_count = attribute_instance.getNumNominalValues()
                if nominal_count > 0:
                    # Nominal values are encoded as 0 .. count - 1.
                    self.emin[i] = 0
                    self.emax[i] = nominal_count - 1
                else:
                    # NOTE(review): bounds come from getAttributeByPos(i)
                    # while the type comes from getInputAttribute(i); these
                    # presumably agree only when inputs are declared first
                    # -- confirm.
                    positional_attribute = self.attributes.getAttributeByPos(i)
                    self.emax[i] = positional_attribute.getMaxAttribute()
                    self.emin[i] = positional_attribute.getMinAttribute()

                attribute_type = attribute_instance.getType()
                if attribute_type == Attribute.NOMINAL:
                    self.nominal_array[i] = True
                    self.integer_array[i] = False
                elif attribute_type == Attribute.INTEGER:
                    self.nominal_array[i] = False
                    self.integer_array[i] = True
                else:
                    self.nominal_array[i] = False
                    self.integer_array[i] = False

            # Copy every example; nclasses tracks the largest output id seen.
            self.nclasses = 0
            for i in range(ndata_length):
                inst = self.instance_set.getInstance(i)
                for j in range(ninput_length):
                    self.x_array[i][j] = self.instance_set.getInputNumericValue(i, j)
                    self.missing_array[i][j] = inst.getInputMissingValuesWithPos(j)
                    if self.missing_array[i][j]:
                        # Missing inputs are stored just below the minimum.
                        self.x_array[i][j] = self.emin[j] - 1
                self.output_integer_array[i] = self.instance_set.getOutputNumericValue(i, 0)
                self.output_array[i] = self.instance_set.getOutputNominalValue(i, 0)
                if self.output_integer_array[i] > self.nclasses:
                    self.nclasses = self.output_integer_array[i]
            # Class ids start at 0, so the count is the largest id plus one.
            self.nclasses = self.nclasses + 1
            print('Number of classes=' + str(self.nclasses))
    except Exception as error:
        print("read_classification_set: Exception in readSet, in read_classification_set:"
              + str(error))
    self.compute_instances_per_class()


# * It reads the whole input data-set and it stores each example and its associated output value in
# * local arrays to ease their use.
# * @param datasetFile String name of the file containing the dataset
# * @param train boolean It must have the value "true" if we are reading the training data-set
# * @throws IOException If there occurs any problem with the reading of the data-set
# added by rui for granularity rule generation
def read_classification_set_from_data_row_array(self, data_row_array):
    """Compute statistics and per-class counts from ``data_row_array``.

    Calls ``compute_statistics_data_row_array`` and then
    ``compute_instances_perclass_data_row_array`` with the same argument;
    both are defined outside this section.
    """
    self.compute_statistics_data_row_array(data_row_array)
    self.compute_instances_perclass_data_row_array(data_row_array)


def readRegressionSet(self, datasetFile, train, file_path):
    """Load a regression data-set into the local arrays.

    Fills ``ndata``, ``ninputs``, ``nvars``, ``x_array``, ``missing_array``,
    ``output_real_array``, ``output_integer_array``, ``emax_array``,
    ``emin_array`` and ``nclasses`` (set to 0), then calls
    ``compute_statistics``.  The process exits with status 1 when the
    data-set does not declare exactly one output attribute.  Other
    exceptions raised while loading are printed and swallowed.

    Args:
        datasetFile: forwarded to ``self.instance_set.readSet``.
        train: forwarded to ``self.instance_set.readSet``.
        file_path: forwarded to ``self.instance_set.readSet``.
    """
    # Function-scope import: ``sys.exit`` is always available, unlike the
    # ``exit`` helper injected by the ``site`` module.
    import sys

    try:
        # Load in memory a dataset that contains a regression problem
        # NOTE(review): read_classification_set calls ``read_set`` on the same
        # object; confirm which spelling the instance-set class provides.
        self.instance_set.readSet(datasetFile, train, file_path)
        self.ndata = self.instance_set.getNumInstances()
        self.ninputs = self.attributes.getInputNumAttributes()
        n_outputs = self.attributes.getOutputNumAttributes()
        self.nvars = self.ninputs + n_outputs

        # Exactly one output variable is supported.
        if n_outputs > 1:
            print("This algorithm can not process MIMO datasets")
            sys.exit(1)
        if n_outputs < 1:
            print("noOutputs = True, exit 1 !!!!!")
            sys.exit(1)

        # Initialise our own tables
        self.x_array = [[0.0 for _ in range(self.ninputs)]
                        for _ in range(self.ndata)]
        self.missing_array = [[False for _ in range(self.ninputs)]
                              for _ in range(self.ndata)]
        # The real-valued outputs are written by index below, so the list
        # has to exist with one slot per example.
        self.output_real_array = [0.0 for _ in range(self.ndata)]
        self.output_integer_array = [0 for _ in range(self.ndata)]

        # Maximum and minimum of inputs
        self.emax_array = []
        self.emin_array = []
        for i in range(self.ninputs):
            attribute = self.attributes.getAttributeByPos(i)
            self.emax_array.append(attribute.getMaxAttribute())
            self.emin_array.append(attribute.getMinAttribute())

        self.nclasses = 0
        for i in range(self.ndata):
            inst = self.instance_set.getInstance(i)
            for j in range(self.ninputs):
                self.x_array[i][j] = self.instance_set.getInputNumericValue(i, j)
                # NOTE(review): read_classification_set uses
                # ``getInputMissingValuesWithPos``; confirm this spelling.
                self.missing_array[i][j] = inst.getInputMissingValues(j)
                if self.missing_array[i][j]:
                    # Missing inputs are stored just below the minimum.
                    self.x_array[i][j] = self.emin_array[j] - 1
            print("noOutputs else part:")
            self.output_real_array[i] = self.instance_set.getOutputNumericValue(i, 0)
            print("self.output_real_array[i]" + str(i) + str(self.output_real_array[i]))
            self.output_integer_array[i] = int(self.output_real_array[i])
    except OSError as error:
        print("OS error: {0}".format(error))
    except Exception as otherException:
        print(" In readRegressionSet other Exception is :" + str(otherException))
    self.compute_statistics()


# *It copies the header of the dataset
# * @ return String A string containing all the data - set information
def copy_header(self):
    """Return the data-set header text built from ``self.attributes``.

    The pieces are, in order: the ``@relation`` line, the input attribute
    header, the output attribute header, the inputs header, the outputs
    header and a final ``@data`` line.
    """
    pieces = [
        "@relation " + self.attributes.getRelationName() + "\n",
        self.attributes.getInputAttributesHeader(),
        self.attributes.getOutputAttributesHeader(),
        self.attributes.getInputHeader() + "\n",
        self.attributes.getOutputHeader() + "\n",
        "@data\n",
    ]
    return "".join(pieces)


# * It transform the input space into the [0,1] range
def normalize(self):
    """Rescale every non-missing input value with its attribute bounds.

    Each value becomes ``(x - emin_array[j]) / (emax_array[j] - emin_array[j])``.
    Missing values are left untouched.  An attribute whose minimum equals
    its maximum gets a scale factor of 0.0 instead of dividing by zero, so
    its non-missing values become 0.0.
    """
    n_inputs = self.get_ninputs()
    scales = []
    for j in range(n_inputs):
        span = self.emax_array[j] - self.emin_array[j]
        scales.append(1.0 / span if span != 0 else 0.0)
    for i in range(self.get_ndata()):
        row = self.x_array[i]
        for j in range(n_inputs):
            if not self.is_missing(i, j):  # this process ignores missing values
                row[j] = (row[j] - self.emin_array[j]) * scales[j]


# * It checks if the data-set has any real value
# * @return boolean True if it has some real values, else false.
def has_real_attributes(self):
    """Return the result of ``Attributes.hasRealAttributes(self)``."""
    # NOTE(review): the data-set object itself is passed as the receiver;
    # confirm this is what ``Attributes.hasRealAttributes`` expects.
    return Attributes.hasRealAttributes(self)


# * It checks if the data-set has any numerical (integer or real) value
# * @return boolean True if it has some integer or real values, else false.
def has_numerical_attributes(self):
    """Return whether ``Attributes`` reports integer or real attributes."""
    # NOTE(review): same receiver question as in has_real_attributes.
    return Attributes.hasIntegerAttributes(self) or Attributes.hasRealAttributes(self)


# * It checks if the data-set has any missing value
# * @return boolean True if it has some missing values, else false.
def has_missing_attributes(self):
    """Return True when fewer examples are complete than ``get_ndata()``."""
    return self.size_without_missing() < self.get_ndata()


# * It return the size of the data-set without having account the missing values
# * @return int the size of the data-set without having account the missing values
def size_without_missing(self):
    """Count the examples that have no missing input value.

    Every input column, including column 0, is checked through
    ``is_missing``.

    Returns:
        The number of examples for which no input is flagged missing.
    """
    complete_examples = 0
    for i in range(self.ndata):
        if not any(self.is_missing(i, j) for j in range(self.ninputs)):
            complete_examples += 1
    return complete_examples


# * It returns the number of examples
# *
# * @return the number of examples
def size(self):
    """Return ``self.ndata``."""
    return self.ndata


# * It computes the average and standard deviation of the input attributes
def compute_statistics(self):
    """Fill ``average_array`` and ``stdev_array`` for inputs and output.

    Slots ``0 .. get_ninputs() - 1`` hold the input statistics; the last
    slot holds the statistics of ``output_real_array``.  Missing input
    values are left out of the sums, but the divisor is always the total
    number of examples.  Any exception is printed and swallowed.
    """
    try:
        print("Begin computeStatistics......")
        var_num = self.get_nvars()
        print("varNum = " + str(var_num))
        self.stdev_array = [0.0 for _ in range(var_num)]
        self.average_array = [0.0 for _ in range(var_num)]
        input_num = self.get_ninputs()
        data_num = self.get_ndata()
        print("inputNum = " + str(input_num) + ",dataNum = " + str(data_num))

        # Averages of the input attributes.
        for i in range(input_num):
            self.average_array[i] = 0
            for j in range(data_num):
                if not self.is_missing(j, i):
                    self.average_array[i] = self.average_array[i] + self.x_array[j][i]
            if data_num != 0:
                self.average_array[i] = self.average_array[i] / data_num

        # Average of the output, kept in the last slot.
        average_length = len(self.average_array)
        print(" average_length is " + str(average_length))
        last = average_length - 1
        outputs = self.output_real_array
        self.average_array[last] = 0
        if len(outputs) == 0:
            print("len(self.output_real_array) is 0")
        else:
            for value in outputs:
                self.average_array[last] = self.average_array[last] + value
            self.average_array[last] = self.average_array[last] / len(outputs)

        # Standard deviations of the input attributes.
        print("before the loop for inputNum")
        for i in range(input_num):
            mean = self.average_array[i]
            sum_value = 0.0
            for j in range(data_num):
                if not self.is_missing(j, i):
                    deviation = self.x_array[j][i] - mean
                    sum_value = sum_value + deviation * deviation
            if data_num != 0:
                sum_value = sum_value / data_num
            self.stdev_array[i] = math.sqrt(sum_value)

        # Standard deviation of the output, kept in the last slot.
        output_mean = self.average_array[last]
        sum_value = 0.0
        for value in outputs:
            deviation = value - output_mean
            sum_value += deviation * deviation
        if len(outputs) != 0:
            sum_value /= len(outputs)
        self.stdev_array[len(self.stdev_array) - 1] = math.sqrt(sum_value)
        print("sum is :" + str(sum_value) + " self.stdev_array :" + str(self.stdev_array))
    except Exception as error:
        print("Exception in computeStatistics : " + str(error))


# * It return the standard deviation of an specific attribute
# * @param position int attribute id (position of the attribute)
# * @return double the standard deviation of the attribute
def std_dev(self, position):
    """Return ``self.stdev_array[position]``."""
    return self.stdev_array[position]


# * It return the average of an specific attribute
# * @param position int attribute id (position of the attribute)
# * @return double the average of the attribute
def average(self, position):
    """Return ``self.average_array[position]``."""
    return self.average_array[position]


# *It computes the number of examples per class
def compute_instances_per_class(self):
    """Fill ``instances_cl`` and ``frequent_class_array``.

    ``instances_cl[c]`` is the number of entries of ``output_integer_array``
    equal to ``c``; ``frequent_class_array[c]`` is that count divided by
    ``get_ndata()``, or 0 when there are no examples.
    """
    self.instances_cl = [0 for _ in range(self.nclasses)]
    data_num = self.get_ndata()
    for i in range(data_num):
        class_id = self.output_integer_array[i]
        self.instances_cl[class_id] = self.instances_cl[class_id] + 1
    # Compare by value: identity tests against an int literal are unreliable.
    if data_num == 0:
        self.frequent_class_array = [0 for _ in range(self.nclasses)]
    else:
        self.frequent_class_array = [
            1.0 * count / data_num for count in self.instances_cl
        ]


# *It returns the number of examples for a given class
# * @ param clas int the class label id
# * @ return int the number of examples for the class
def number_instances(self, clas):
    """Return ``self.instances_cl[clas]``."""
    return self.instances_cl[clas]


# * It returns the number of labels for a nominal attribute
# * @param attribute int the attribute position in the data-set
# * @return int the number of labels for the attribute
def number_values(self, attribute):
    """Return ``getNumNominalValues()`` of input attribute ``attribute``."""
    return self.attributes.getInputAttribute(attribute).getNumNominalValues()


# * It returns the class label (string) given a class id (int)
# * @param intValue int the class id
# * @return String the corresponding class label
def get_output_value(self, int_value):
    """Return ``getNominalValue(int_value)`` of the first output attribute."""
    # NOTE(review): this still goes through the ``Attributes`` class rather
    # than ``self.attributes`` as get_ranges does; confirm both hold the
    # same attribute definitions.
    att = Attributes.getOutputAttribute(Attributes, 0)
    return att.getNominalValue(int_value)


# * It returns the type of the variable
# * @param variable int the variable id
# * @return int a code for the type of the variable (INTEGER, REAL or NOMINAL)
def get_type(self, variable):
    """Map the type of attribute ``variable`` to this object's type codes.

    Returns ``self.INTEGER``, ``self.REAL`` or ``self.NOMINAL`` when the
    attribute type equals the matching ``Attribute`` constant, else 0.
    """
    attribute_type = self.attributes.getAttributeByPos(variable).getType()
    if attribute_type == Attribute.INTEGER:
        return self.INTEGER
    if attribute_type == Attribute.REAL:
        return self.REAL
    if attribute_type == Attribute.NOMINAL:
        return self.NOMINAL
    return 0


def set_nvars(self, nvar_value):
    """Store ``nvar_value`` as the number of variables (``self.nvars``)."""
    self.nvars = nvar_value


# * It returns the discourse universe for the input and output variables
# * @return double[][] The minimum [0] and maximum [1] range of each variable
# modified at 2020-08-14
def get_ranges(self):
    """Return ``[min, max]`` pairs for every input and for the output.

    Inputs with nominal values get ``[0.0, count - 1]``; the other inputs
    get their attribute's min/max.  The last row holds the min/max of the
    first output attribute.
    """
    rangos = [[0.0 for _ in range(2)] for _ in range(self.get_nvars())]
    for i in range(self.get_ninputs()):
        att_here = self.attributes.getInputAttribute(i)
        nominal_count = att_here.getNumNominalValues()
        if nominal_count > 0:
            rangos[i][0] = 0.0
            rangos[i][1] = nominal_count - 1
        else:
            rangos[i][0] = att_here.getMinAttribute()
            rangos[i][1] = att_here.getMaxAttribute()
    # save the output range in the last row
    output_attribute = self.attributes.getOutputAttribute(0)
    rangos[self.get_nvars() - 1][0] = output_attribute.getMinAttribute()
    rangos[self.get_nvars() - 1][1] = output_attribute.getMaxAttribute()
    return rangos


def get_granularity_zone_ranges(self, data_set_x_array):
    """Return ``[min, max]`` pairs computed against ``data_set_x_array``.

    Inputs with nominal values get ``[0.0, count - 1]``; the other inputs
    get ``get_min_granularity_attribute`` / ``get_max_granularity_attribute``
    evaluated with ``(data_set_x_array, i)``.  The last row holds the
    min/max of the first output attribute.
    """
    rangos = [[0.0 for _ in range(2)] for _ in range(self.get_nvars())]
    for i in range(self.get_ninputs()):
        # NOTE(review): inputs are read through the ``Attributes`` class but
        # the output below through ``self.attributes``; confirm the mix is
        # intended.
        att_here = Attributes.getInputAttribute(Attributes, i)
        nominal_count = att_here.getNumNominalValues()
        if nominal_count > 0:
            rangos[i][0] = 0.0
            rangos[i][1] = nominal_count - 1
        else:
            rangos[i][0] = att_here.get_min_granularity_attribute(data_set_x_array, i)
            rangos[i][1] = att_here.get_max_granularity_attribute(data_set_x_array, i)
    last_min_value = self.attributes.getOutputAttribute(0).getMinAttribute()
    last_max_value = self.attributes.getOutputAttribute(0).getMaxAttribute()
    rangos[self.get_nvars() - 1][0] = last_min_value
    rangos[self.get_nvars() - 1][1] = last_max_value
    return rangos


# * It returns the attribute labels for the input
# features
# * @return String[] the attribute labels for the input features
def get_names(self):
    """Return the names of the input attributes, printing each one.

    The name of input ``i`` is ``self.attributes.getInputAttribute(i).getName()``.
    """
    labels = []
    for position in range(self.ninputs):
        label = self.attributes.getInputAttribute(position).getName()
        labels.append(label)
        print(" attributes' names[" + str(position) + "]:" + label)
    return labels


# * It returns the class labels
# * @return String[] the class labels
def get_classes(self):
    """Return the nominal value of the first output attribute per class id."""
    return [
        self.attributes.getOutputAttribute(0).getNominalValue(class_id)
        for class_id in range(self.nclasses)
    ]


def is_nominal(self, index_i):
    """Return ``self.nominal_array[index_i]``."""
    nominal_flags = self.nominal_array
    return nominal_flags[index_i]


def is_integer(self, index_i):
    """Return ``self.integer_array[index_i]``."""
    integer_flags = self.integer_array
    return integer_flags[index_i]


def get_frequent_class(self, class_value):
    """Print ``class_value`` and return ``self.frequent_class_array[class_value]``."""
    print("class_value" + str(class_value))
    ratios = self.frequent_class_array
    return ratios[class_value]


def get_ninputs(self):
    """It gets the number of input attributes of the data-set.

    Returns:
        ``self.ninputs``.
    """
    input_count = self.ninputs
    return input_count


def frecuent_class(self, class_value):
    """It returns the ratio of instances of the given class in the dataset.

    Args:
        class_value: the index of the class.

    Returns:
        ``self.frequent_class_array[class_value]``.
    """
    ratios = self.frequent_class_array
    return ratios[class_value]


def get_X(self):
    """Return ``self.x_array`` converted with ``np.array``."""
    inputs = np.array(self.x_array)
    return inputs


def get_y(self, type_name='integer'):
    """Return one of the output arrays converted with ``np.array``.

    Args:
        type_name: ``"real"`` selects ``output_real_array``, ``"integer"``
            selects ``output_integer_array``; any other value selects
            ``output_array``.
    """
    if type_name == "real":
        return np.array(self.output_real_array)
    if type_name == "integer":
        return np.array(self.output_integer_array)
    return np.array(self.output_array)


def copyHeader(self):
    """Return the data-set header text built from ``self.attributes``.

    The pieces are, in order: the ``@relation`` line, the input attribute
    header, the output attribute header, the inputs header, the outputs
    header and a final ``@data`` line.
    """
    attributes = self.attributes
    relation_line = "@relation " + attributes.getRelationName() + "\n"
    attribute_lines = relation_line + attributes.getInputAttributesHeader()
    attribute_lines = attribute_lines + attributes.getOutputAttributesHeader()
    inputs_line = attributes.getInputHeader() + "\n"
    outputs_line = attributes.getOutputHeader() + "\n"
    return attribute_lines + inputs_line + outputs_line + "@data\n"
def readRegressionSet(self, datasetFile, train, file_path):
    """Load a regression data-set into the local arrays.

    Attribute metadata is read through the ``Attributes`` class (called
    with the class itself as receiver).  Fills ``ndata``, ``ninputs``,
    ``nvars``, ``x_array``, ``missing_array``, ``output_real_array``,
    ``output_integer_array``, ``emax_array``, ``emin_array`` and
    ``nclasses`` (set to 0), then calls ``compute_statistics``.  The
    process exits with status 1 when the data-set does not declare exactly
    one output attribute.  Other exceptions raised while loading are
    printed and swallowed.

    Args:
        datasetFile: forwarded to ``self.instance_set.readSet``.
        train: forwarded to ``self.instance_set.readSet``.
        file_path: forwarded to ``self.instance_set.readSet``.
    """
    # Function-scope import: ``sys.exit`` is always available, unlike the
    # ``exit`` helper injected by the ``site`` module.
    import sys

    try:
        # Load in memory a dataset that contains a regression problem
        self.instance_set.readSet(datasetFile, train, file_path)
        self.ndata = self.instance_set.getNumInstances()
        self.ninputs = Attributes.getInputNumAttributes(Attributes)
        n_outputs = Attributes.getOutputNumAttributes(Attributes)
        self.nvars = self.ninputs + n_outputs

        # Exactly one output variable is supported.
        if n_outputs > 1:
            print("This algorithm can not process MIMO datasets")
            sys.exit(1)
        if n_outputs < 1:
            print("noOutputs = True, exit 1 !!!!!")
            sys.exit(1)

        # Initialise our own tables
        self.x_array = [[0.0 for _ in range(self.ninputs)]
                        for _ in range(self.ndata)]
        self.missing_array = [[False for _ in range(self.ninputs)]
                              for _ in range(self.ndata)]
        # The real-valued outputs are written by index below, so the list
        # has to exist with one slot per example.
        self.output_real_array = [0.0 for _ in range(self.ndata)]
        self.output_integer_array = [0 for _ in range(self.ndata)]

        # Maximum and minimum of inputs
        self.emax_array = []
        self.emin_array = []
        for i in range(self.ninputs):
            attribute = Attributes.getAttributeByPos(Attributes, i)
            self.emax_array.append(attribute.getMaxAttribute())
            self.emin_array.append(attribute.getMinAttribute())

        self.nclasses = 0
        for i in range(self.ndata):
            inst = self.instance_set.getInstance(i)
            for j in range(self.ninputs):
                self.x_array[i][j] = self.instance_set.getInputNumericValue(i, j)
                self.missing_array[i][j] = inst.getInputMissingValues(j)
                if self.missing_array[i][j]:
                    # Missing inputs are stored just below the minimum.
                    self.x_array[i][j] = self.emin_array[j] - 1
            print("noOutputs else part:")
            self.output_real_array[i] = self.instance_set.getOutputNumericValue(i, 0)
            print("self.output_real_array[i]" + str(i) + str(self.output_real_array[i]))
            self.output_integer_array[i] = int(self.output_real_array[i])
    except OSError as error:
        print("OS error: {0}".format(error))
    except Exception as otherException:
        print(" In readRegressionSet other Exception is :" + str(otherException))
    # NOTE(review): the statistics method visible in this file is spelled
    # ``compute_statistics``; confirm no camelCase alias is expected here.
    self.compute_statistics()
def insertAttribute(self, line):
    """Parse one ``@attribute`` header line and register the attribute.

    The attribute name is the first word after ``@attribute``.  Then:

    * no ``{`` / ``[`` definition (and no tab) in the line: the attribute
      is NOMINAL with no values;
    * a ``{a, b, ...}`` list: NOMINAL with fixed bounds, one nominal value
      per comma-separated item (stripped);
    * otherwise the second word selects INTEGER or REAL (any other word
      leaves the type unset) and, when both ``[`` and ``]`` are present,
      the two comma-separated numbers between them become the bounds.

    The attribute is finally added with
    ``Attributes.addAttribute(Attributes, at)``.

    Args:
        line: the header line to parse.

    Raises:
        IndexError: if no attribute name follows ``@attribute``, or fewer
            than two comma-separated bounds are given.
        ValueError: if a ``{`` list has no closing ``}``, or a bound is not
            a number.
    """
    # Put a tab in front of the definition delimiter so the line splits
    # into "name [type]" and "definition" parts.  A line with neither
    # delimiter is left as it is.
    if "{" in line:
        token_str = "{"
    elif "[" in line:
        token_str = "["
    else:
        token_str = None
    if token_str is not None:
        line = line.replace(token_str, "\t" + token_str)

    st = line.split("\t")
    # Disregarding the leading keyword. It is @attribute
    st[0] = st[0].replace("@attribute", "").strip()
    first_part = st[0].split()

    at = Attribute()
    at.setName(first_part[0])
    print("att set name as first_part[0] is:" + first_part[0])

    # Next action depends on the type of attribute: continuous or nominal
    if len(st) == 1:
        # Nominal attribute with no definition of values
        at.setType(Attribute.NOMINAL)
    elif "{" in line:
        # Nominal attribute with an explicit list of values
        at.setType(Attribute.NOMINAL)
        at.setFixedBounds(True)
        indexL = line.index("{") + 1
        indexR = line.index("}")
        print("indexL : " + str(indexL) + "indexR : " + str(indexR))
        for nominalStr in line[indexL:indexR].split(","):
            at.addNominalValue(nominalStr.strip())
    else:
        # Integer or real attribute
        attType = first_part[1].lower() if len(first_part) > 1 else ""
        if attType == "integer":
            at.setType(Attribute.INTEGER)
        if attType == "real":
            at.setType(Attribute.REAL)
        # ``find`` returns -1 when the bracket is absent, which is what the
        # check below relies on (``index`` would raise instead).
        indexL = line.find("[")
        indexR = line.find("]")
        if indexL != -1 and indexR != -1:
            st2 = line[indexL + 1:indexR].split(",")
            minBound = float(st2[0].strip())
            maxBound = float(st2[1].strip())
            at.setBounds(minBound, maxBound)

    Attributes.addAttribute(Attributes, at)