def insertInputOutput(self, line, lineCount, collection, type, isTrain):
    """Collect the attribute names listed on an ``@inputs``/``@outputs`` line.

    The comma-separated names in ``line`` are looked up through
    ``Attributes.getAttributeByName``.  Known names are appended to
    ``collection`` (names already present are skipped); unknown names are
    reported to ``InstanceSet.errorLogger`` and otherwise ignored.

    Args:
        line: Text that follows the ``@inputs``/``@outputs`` keyword.
        lineCount: Header line number, passed to ``ErrorInfo``.
        collection: List of attribute names, extended in place.
        type: Label used in the messages.  It shadows the builtin, but it is
            part of the signature, so the name is kept.
        isTrain: Passed to ``ErrorInfo``.
    """
    print(" processing insertInputOutput: " + line)
    for rawName in line.split(","):
        attName = str(rawName.strip())
        print("attrName: " + attName)
        attrItem = Attributes.getAttributeByName(Attributes, attName)

        # Debug dump of every attribute currently registered.
        for att in Attributes.getAttributes(Attributes):
            print("att name is :" + str(att.getName()))

        # Identity test: ``== None`` could be hijacked by a custom __eq__.
        if attrItem is None:
            print("Attributes.getAttribute == None")
            # If this attribute has not been declared, generate error
            er = ErrorInfo(
                ErrorInfo.InputTestAttributeNotDefined, 0, lineCount, 0, 0,
                isTrain,
                ("The attribute " + attName + " defined in @" + type
                 + " in test, it has not been defined in @inputs in its train dataset. It will be ignored"))
            InstanceSet.errorLogger.setError(er)
        else:
            for itemCollection in collection:
                print("Item in collection is " + itemCollection)
            print("Attributes.getAttribute != None")
            print(" > " + str(type) + " attribute considered: " + attName)
            if attName not in collection:
                print("attName:" + attName + " is not in collection")
                collection.append(attName)
def getNewHeader(self):
    """Return the header text rebuilt from the current attribute definitions.

    The text contains, one per line: the ``@relation`` line, every input
    attribute, the first output attribute, and the ``@inputs`` / ``@outputs``
    lines.  The instance-owned ``self.attributes`` is used when
    ``self.storeAttributesAsNonStatic`` is set and ``self.attributes`` is not
    None; otherwise the shared ``Attributes`` class is used.

    Returns:
        str: The header, each line terminated by ``"\\n"``.

    Raises:
        IndexError: If there is no output attribute.
    """
    # NOTE(review): attributes are rendered with ``toString()`` in some places
    # and ``str()`` in others, exactly as before -- confirm which one the
    # Attribute class actually provides.
    if self.storeAttributesAsNonStatic and self.attributes is not None:
        own = self.attributes
        line = "@relation " + own.getRelationName() + "\n"
        # Iterate directly: Python lists have no Java-style ``.length``.
        for attr in own.getInputAttributes():
            line += attr.toString() + "\n"
        # Only the first output attribute is written.
        line += own.getOutputAttributes()[0].toString() + "\n"
        # Getting @inputs and @outputs
        line += own.getInputHeader() + "\n"
        line += own.getOutputHeader() + "\n"
    else:
        # Shared definitions: the class itself is passed as the receiver,
        # matching the other ``Attributes.<method>(Attributes, ...)`` calls
        # in this file.
        line = "@relation " + Attributes.getRelationName(Attributes) + "\n"
        for attr in Attributes.getInputAttributes(Attributes):
            line += attr.toString() + "\n"
        line += str(Attributes.getOutputAttributes(Attributes)[0]) + "\n"
        # Getting @inputs and @outputs
        line += Attributes.getInputHeader(Attributes) + "\n"
        line += Attributes.getOutputHeader(Attributes) + "\n"
    return line
def getRanges(self):
    """Return one ``[low, high]`` pair per variable.

    For each input attribute the pair is ``[0.0, numNominalValues - 1]`` when
    the attribute reports nominal values, otherwise
    ``[getMinAttribute(), getMaxAttribute()]``.  The last row is filled with
    the min/max of output attribute 0.

    Returns:
        list[list]: ``self.getnVars()`` rows of two values each.
    """
    print("self.getnVars()" + str(self.getnVars()))
    total_vars = int(self.getnVars())
    bounds = [[0.0, 0.0] for _ in range(total_vars)]
    print("rangos has two dimensions, first is self.getnVars()=="
          + str(self.getnVars()) + ",second is 2")

    for idx in range(self.getnInputs()):
        print("self.getnInputs()" + str(self.getnInputs()) + " i = " + str(idx))
        attribute = Attributes.getInputAttribute(Attributes, idx)
        print("attHere.getNumNominalValues()== "
              + str(attribute.getNumNominalValues()))
        if attribute.getNumNominalValues() > 0:
            # Nominal attribute: range over the value indices.
            bounds[idx][0] = 0.0
            bounds[idx][1] = attribute.getNumNominalValues() - 1
            label = " attHere.getNumNominalValues() > 0,rangos["
        else:
            # Numeric attribute: use its recorded min / max.
            bounds[idx][0] = attribute.getMinAttribute()
            bounds[idx][1] = attribute.getMaxAttribute()
            label = " attHere.getNumNominalValues() <= 0, rangos["
        print(label + str(idx) + "][0]==" + str(bounds[idx][0])
              + ",rangos[i][1]== " + str(bounds[idx][1]))

    # The last row holds the range of the (single) output attribute.
    bounds[self.getnVars() - 1][0] = Attributes.getOutputAttribute(
        Attributes, 0).getMinAttribute()
    bounds[self.getnVars() - 1][1] = Attributes.getOutputAttribute(
        Attributes, 0).getMaxAttribute()
    return bounds
def printAsOriginal(self, out, int):
    """Print the data set to stdout in its original file layout.

    Prints ``self.header``, then the ``@inputs`` and/or ``@outputs`` line
    depending on ``self.printInOut`` (1 = inputs, 2 = outputs, 3 = both),
    then ``@data``, and finally asks every instance to print itself.

    Args:
        out: Forwarded unchanged to each instance's ``printAsOriginal``.
            This method itself writes with ``print``.
        int: Not used by this method.  The name shadows the builtin but is
            part of the signature, so it is kept.
    """
    use_own = self.storeAttributesAsNonStatic and self.attributes is not None
    show_inputs = self.printInOut == 1 or self.printInOut == 3
    show_outputs = self.printInOut == 2 or self.printInOut == 3

    # Printing the header as the original one
    print(self.header)
    if use_own:
        if show_inputs:
            print(self.attributes.getInputHeader())
        if show_outputs:
            print(self.attributes.getOutputHeader())
    else:
        # Previously ``out.println(...)`` (a Java leftover); now printed the
        # same way as the branch above, with ``Attributes`` passed as the
        # receiver like the other shared-attribute calls in this file.
        if show_inputs:
            print(Attributes.getInputHeader(Attributes))
        if show_outputs:
            print(Attributes.getOutputHeader(Attributes))

    print("@data")
    for instance in self.instanceSet:
        print()
        if use_own:
            instance.printAsOriginal(self.attributes, out)
        else:
            instance.printAsOriginal(out)
def getClasses(self):
    """Return the nominal values of output attribute 0, one per class index.

    Returns:
        list: ``self.__nClasses`` entries, entry ``k`` being
        ``getNominalValue(k)`` of output attribute 0.
    """
    class_indices = range(self.__nClasses)
    print(" getClasses,self.__nClasses: " + str(self.__nClasses))
    return [
        Attributes.getOutputAttribute(Attributes, 0).getNominalValue(k)
        for k in class_indices
    ]
def printInsSet(self):
    """Print the attribute definitions followed by every instance.

    The instance-owned ``self.attributes`` is used when
    ``self.storeAttributesAsNonStatic`` is set and ``self.attributes`` is not
    None; otherwise the shared ``Attributes`` class is used.
    """
    use_own = self.storeAttributesAsNonStatic and self.attributes is not None

    print("------------- ATTRIBUTES --------------")
    if use_own:
        self.attributes.printAttributes()
    else:
        # The class is passed as the receiver, like the other shared
        # ``Attributes`` calls in this file.
        Attributes.printAttributes(Attributes)

    print("-------------- INSTANCES --------------")
    # enumerate instead of ``self.instanceSet.length``: Python lists have no
    # ``length`` attribute.
    for i, instance in enumerate(self.instanceSet):
        print("\n> Instance " + str(i) + ":")
        if use_own:
            instance.printInsSet(self.attributes)
        else:
            instance.printInsSet()
def getOriginalHeaderWithoutInOut(self):
    """Return the ``@relation`` line plus one line per attribute.

    No ``@inputs`` / ``@outputs`` lines are included.  The instance-owned
    ``self.attributes`` is used when ``self.storeAttributesAsNonStatic`` is
    set and ``self.attributes`` is not None; otherwise the shared
    ``Attributes`` class is used.

    Returns:
        str: The header text, each line terminated by ``"\\n"``.
    """
    # Getting the relation name and the attributes
    if self.storeAttributesAsNonStatic and self.attributes is not None:
        relation = self.attributes.getRelationName()
        attrs = self.attributes.getAttributes()
    else:
        # The class is passed as the receiver, like the other shared
        # ``Attributes`` calls in this file.
        relation = Attributes.getRelationName(Attributes)
        attrs = Attributes.getAttributes(Attributes)

    body = "".join(str(attr) + "\n" for attr in attrs)
    return "@relation " + relation + "\n" + body
def copyHeader(self):
    """Build the full header text from the shared ``Attributes`` definitions.

    The result is the ``@relation`` line, the input and output attribute
    headers, the ``@inputs`` and ``@outputs`` lines and a final ``@data``
    line.  The partial text is printed after each step.

    Returns:
        str: The assembled header.
    """
    text = ""
    print("copyHeader begin...., P is :" + text)
    text = "@relation " + Attributes.getRelationName(Attributes) + "\n"
    print(" after relation P is :" + text)

    # (Attributes method to call, text appended after its result)
    steps = (
        ("getInputAttributesHeader", ""),
        ("getOutputAttributesHeader", ""),
        ("getInputHeader", "\n"),
        ("getOutputHeader", "\n"),
    )
    for method_name, terminator in steps:
        text += getattr(Attributes, method_name)(Attributes) + terminator
        print(" after " + method_name + " P is :" + text)

    text += "@data\n"
    print("P is :" + text)
    return text
def removeAttribute(self, tSet, inputAtt, whichAtt):
    """Remove one attribute from the definitions and from every instance.

    Args:
        tSet: Optional second data set whose definitions and instances are
            updated as well; may be None.
        inputAtt: True to remove an input attribute, False for an output.
        whichAtt: Position of the attribute among the inputs/outputs.

    Returns:
        bool: False when an attribute-definition removal reports False (the
        instances are then left untouched), True otherwise.
    """
    use_own = self.storeAttributesAsNonStatic and self.attributes is not None

    # Getting a reference to the attribute to del
    if use_own:
        if inputAtt:
            attToDel = self.attributes.getInputAttribute(whichAtt)
        else:
            attToDel = self.attributes.getOutputAttribute(whichAtt)
    else:
        # The class is passed as the receiver, like the other shared
        # ``Attributes`` calls in this file.
        if inputAtt:
            attToDel = Attributes.getInputAttribute(Attributes, whichAtt)
        else:
            attToDel = Attributes.getOutputAttribute(Attributes, whichAtt)

    # Results are compared with ``== False`` on purpose, so a removal that
    # returns None is still treated as success, as it was before.
    if use_own:
        print("Removing the attribute")
        if self.attributes.removeAttribute(inputAtt, whichAtt) == False:
            return False
        # Previously ``(tSet != None and ...) == False``, which was True for
        # every call without a second set and made the method return False
        # after a successful removal.
        if (tSet is not None
                and tSet.attributes.removeAttribute(inputAtt, whichAtt) == False):
            return False
    else:
        if Attributes.removeAttribute(Attributes, inputAtt, whichAtt) == False:
            return False

    for instance in self.instanceSet:
        if use_own:
            instance.removeAttribute(self.attributes, attToDel, inputAtt, whichAtt)
        else:
            instance.removeAttribute(attToDel, inputAtt, whichAtt)

    if tSet is not None:
        # Plain iteration: ``tSet.instanceSet.length`` does not exist on a list.
        for instance in tSet.instanceSet:
            if use_own:
                instance.removeAttribute(self.attributes, attToDel, inputAtt, whichAtt)
            else:
                instance.removeAttribute(attToDel, inputAtt, whichAtt)
    return True
def getType(self, variable):
    """Translate the type of attribute ``variable`` into this class's code.

    Args:
        variable: Position of the attribute in ``Attributes``.

    Returns:
        ``self.INTEGER``, ``self.REAL`` or ``self.NOMINAL`` when the
        attribute's type equals the matching constant, otherwise 0.
    """
    # The attribute is looked up on ``Attributes`` itself.  The lookup used to
    # pass ``self`` as the receiver, while the constants on the same line
    # (and every other getAttributeByPos call in this file) pass ``Attributes``.
    attr_type = Attributes.getAttributeByPos(Attributes, variable).getType()
    # The type constants are read through the attribute at position 0.
    constants = Attributes.getAttributeByPos(Attributes, 0)

    if attr_type == constants.INTEGER:
        return self.INTEGER
    if attr_type == constants.REAL:
        return self.REAL
    if attr_type == constants.NOMINAL:
        return self.NOMINAL
    return 0
def processInputsAndOutputs(self, isTrain, inputsDef, outputsDef,
                            outputAttrNames, inputAttrNames):
    """Decide which attributes are inputs and outputs after header parsing.

    For a training set, whichever of the two name lists was not declared in
    the header is derived from the other (with neither declared, the last
    attribute becomes the output), and the result is handed to
    ``Attributes.setOutputInputAttributes``.  ``self.outputInfered`` records
    whether the outputs had to be derived.

    Args:
        isTrain: True when the header belongs to the training set.
        inputsDef: True when the header had an ``@inputs`` line.
        outputsDef: True when the header had an ``@outputs`` line.
        outputAttrNames: Output names read from the header; may be appended
            to in place.
        inputAttrNames: Input names read from the header.
    """
    # After parsing the header, the inputs and the outputs are prepared.
    print("Processing inputs and outputs")
    self.outputInfered = False  # set default value
    if not isTrain:
        # parseHeader in this file collects names only when isTrain is set,
        # so for a test header both lists are empty and there is nothing
        # to apply.
        return

    print("isTrain == True")
    if not inputsDef and not outputsDef:
        print("is neither inputAtt no outputAtt")
        # ``Attributes`` is the receiver here too, matching the
        # getAttributesExcept calls below (``self`` was passed before).
        posHere = Attributes.getNumAttributes(Attributes) - 1
        outputAttrNames.append(
            Attributes.getAttributeByPos(Attributes, posHere).getName())
        inputAttrNames = Attributes.getAttributesExcept(Attributes, outputAttrNames)
        self.outputInfered = True
    elif not inputsDef and outputsDef:
        print("inputsDef == False and outputsDef == True")
        inputAttrNames = Attributes.getAttributesExcept(Attributes, outputAttrNames)
    elif inputsDef and not outputsDef:
        print("inputsDef == True and outputsDef == False")
        outputAttrNames = Attributes.getAttributesExcept(Attributes, inputAttrNames)
        self.outputInfered = True

    print("setOutputInputAttributes begin: ")
    Attributes.setOutputInputAttributes(Attributes, inputAttrNames, outputAttrNames)
def readRegressionSet(self, datasetFile, train):
    """Load a regression data set and copy it into this object's tables.

    Reads the file through ``self.__instanceSet``, then fills ``__X``,
    ``__missing``, ``__outputReal``, ``__outputInteger``, ``__emax`` and
    ``__emin``.  The process exits with status 1 when the data set has more
    than one output attribute or none.  Any other exception is printed and
    swallowed, after which ``self.computeStatistics()`` is still called.

    Args:
        datasetFile: File name handed to ``readSet``.
        train: True when the file is the training set.
    """
    try:
        # Load in memory a dataset that contains a regression problem
        self.__instanceSet.readSet(datasetFile, train)
        self.__nData = self.__instanceSet.getNumInstances()
        self.__nInputs = Attributes.getInputNumAttributes(Attributes)
        self.__nVars = self.__nInputs + Attributes.getOutputNumAttributes(
            Attributes)
        print("In readRegressionSet , self.__nData is : " + str(self.__nData))
        print("In readRegressionSet , self.__nInputs is : " + str(self.__nInputs))
        print("In readRegressionSet , self.__nVars is : " + str(self.__nVars))

        # Check that there is only one output variable
        if Attributes.getOutputNumAttributes(Attributes) > 1:
            print("Out put attribute: ")
            outPutAttHeader = Attributes.getOutputAttributesHeader(Attributes)
            print(outPutAttHeader)
            print("This algorithm can not process MIMO datasets")
            print("All outputs but the first one will be removed")
            exit(1)
        noOutputs = False
        if Attributes.getOutputNumAttributes(Attributes) < 1:
            print("This algorithm can not process datasets without outputs")
            print("Zero-valued output generated")
            noOutputs = True
            exit(1)

        # Initialice and fill our own tables
        self.__X = [[0.0 for _ in range(self.__nInputs)]
                    for _ in range(self.__nData)]
        self.__missing = [[False for _ in range(self.__nInputs)]
                          for _ in range(self.__nData)]
        self.__outputInteger = [0 for _ in range(self.__nData)]
        # The real-valued outputs are written by index below, so the list
        # must be allocated here (it was not before).
        self.__outputReal = [0.0 for _ in range(self.__nData)]

        # Maximum and minimum of inputs
        self.__emax = [None for _ in range(self.__nInputs)]
        self.__emin = [None for _ in range(self.__nInputs)]
        for i in range(self.__nInputs):
            self.__emax[i] = Attributes.getAttributeByPos(
                Attributes, i).getMaxAttribute()
            self.__emin[i] = Attributes.getAttributeByPos(
                Attributes, i).getMinAttribute()

        # All values are casted into double / integer
        self.__nClasses = 0
        for i in range(self.__nData):
            inst = self.__instanceSet.getInstance(i)
            for j in range(self.__nInputs):
                self.__X[i][j] = self.__instanceSet.getInputNumericValue(i, j)
                self.__missing[i][j] = inst.getInputMissingValues(j)
                if self.__missing[i][j]:
                    # Missing inputs are encoded just below the attribute minimum.
                    self.__X[i][j] = self.__emin[j] - 1
            if noOutputs:
                self.__outputReal[i] = 0
                self.__outputInteger[i] = 0
            else:
                self.__outputReal[i] = self.__instanceSet.getOutputNumericValue(i, 0)
                self.__outputInteger[i] = int(self.__outputReal[i])
    except OSError as error:
        print("OS error: {0}".format(error))
    except Exception as otherException:
        print("DBG: Exception in readSet:", sys.exc_info()[0])
        print(" In readRegressionSet other Exception is :" + str(otherException))
    self.computeStatistics()
def insertAttribute(self, line):
    """Parse one ``@attribute`` header line and register the attribute.

    Handled forms:

    * ``@attribute name``                    -> nominal, no value list
    * ``@attribute name {a, b, c}``          -> nominal with fixed values
    * ``@attribute name integer|real [lo, hi]`` -> numeric; the bounds are
      optional

    The resulting ``Attribute`` is added through ``Attributes.addAttribute``.

    Args:
        line: The header line, starting with ``@attribute``.

    Raises:
        IndexError: If the name is missing, or a bracketed declaration has
            no type or fewer than two bounds.
        ValueError: If a ``{`` has no closing ``}`` or a bound is not a number.
    """
    print("Insert attribute begin :")

    # Put a tab in front of the value-list / bounds delimiter so the line can
    # be split into "<name> [<type>]" and "<values or bounds>".
    if "{" in line:
        token_str = "{"
    elif "[" in line:
        token_str = "["
    else:
        # Neither a value list nor bounds.  Previously ``token_str`` stayed
        # unbound here and the next statement raised.
        token_str = None
    if token_str is not None:
        token_withT = "\t" + token_str
        line = line.replace(token_str, token_withT)
        print("token_double is:" + token_withT + ", line is :" + line)

    # First we need to split the attribute line into two parts:
    # attribute name (and type) / attribute values.
    st = line.split("\t")
    print("word in st are as below: ")
    for word in st:
        print(word)

    # Disregarding the first token. It is @attribute
    st[0] = st[0].replace("@attribute", "").strip()
    print("st[0] is:" + st[0])
    first_part = st[0].split()

    at = Attribute()
    at.setName(first_part[0])
    print("att set name as first_part[0] is:" + first_part[0])

    # Next action depends on the type of attribute: continuous or nominal
    if len(st) == 1 and len(first_part) < 2:
        # Parsing a nominal attribute with no definition of values
        print("Parsing nominal attribute without values: setType=0")
        at.setType(Attribute.NOMINAL)
    elif "{" in line:
        # Nominal attribute with an explicit value list
        print("Parsing nominal attribute with values: " + line)
        print("Before setType = 0")
        at.setType(Attribute.NOMINAL)
        print("after setType= 0")
        at.setFixedBounds(True)
        indexL = line.index("{") + 1
        indexR = line.index("}")
        print("indexL : " + str(indexL) + "indexR : " + str(indexR))
        lineSub = line[indexL:indexR]
        print("The lineSub : " + lineSub)
        for nominalStr in lineSub.split(","):
            at.addNominalValue(nominalStr.strip())
    else:
        # Parsing an integer or real
        attType = first_part[1].lower()
        print("attribute Name : " + str(first_part[0])
              + ", attribute type = " + str(attType))
        if attType == "integer":
            at.setType(Attribute.INTEGER)
            print("set integer type")
        if attType == "real":
            at.setType(Attribute.REAL)
            print("set real type")
        # ``find`` returns -1 when the bracket is absent; ``index`` raised
        # instead, so the -1 check below could never be false.
        indexL = line.find("[")
        indexR = line.find("]")
        print("indexL is: " + str(indexL) + " indexR: " + str(indexR))
        if indexL != -1 and indexR != -1:
            lineSub = line[indexL + 1:indexR]
            print("lineSub: " + lineSub)
            st2 = lineSub.split(",")
            print("st2[0].strip() :" + st2[0])
            print("st2[1].strip() :" + st2[1])
            minBound = float(st2[0].strip())
            maxBound = float(st2[1].strip())
            print("Before at.setBounds(minBound, maxBound): ( "
                  + str(minBound) + " , " + str(maxBound) + " )")
            at.setBounds(minBound, maxBound)

    print("Before add attribute :::: ")
    Attributes.addAttribute(Attributes, at)
    print("insertAttribute is finished :::: ")
def hasRealAttributes(self):
    """Delegate to ``Attributes.hasRealAttributes`` and return its result."""
    # NOTE(review): most calls in this file pass the ``Attributes`` class as
    # the first argument (e.g. ``Attributes.getInputAttribute(Attributes, i)``);
    # here the dataset instance is passed instead -- confirm this is intended.
    has_real = Attributes.hasRealAttributes(self)
    return has_real
def readSet(self, fileName, isTrain):
    """Read a data file: parse its header, then build one Instance per row.

    The file is looked up under ``self.data_folder`` / ``self.path_name``.
    The header is parsed with ``self.parseHeader``; every remaining line
    that contains no header keyword becomes an ``Instance`` stored in
    ``self.instanceSet``.  For a training set with exactly one output
    attribute the ``Attributes`` statistics are initialised before the rows
    are read and finished afterwards.  Any exception is printed and
    swallowed.

    Args:
        fileName: Name of the file inside the data folder.
        isTrain: True when the file is the training set.
    """
    # Local import: the file's import block is outside this method.
    import os

    print("Before try in readSet of InstanceSet, fileName is :" + str(fileName) + ".")
    print("Opening the file in readSet of InstanceSet: " + str(fileName) + ".")
    try:
        # os.path.join replaces the hard-coded "\\" separators; ``str()``
        # lets path-like components (e.g. pathlib paths) be combined too.
        self.file_to_open = os.path.join(
            str(self.data_folder), str(self.path_name), str(fileName))
        print("In readSet,file_to_open is:" + str(self.file_to_open))

        # Declaring an instance parser
        instance_parser = InstanceParser(self.file_to_open, isTrain)
        try:
            # Reading information in the header, i.e., @relation, @attribute,
            # @inputs and @outputs
            print("In readSet finished read file " + str(self.file_to_open))
            self.parseHeader(instance_parser, isTrain)
            print(" The number of output attributes is: "
                  + str(Attributes.getOutputNumAttributes(Attributes)))

            # The attributes statistics are init if we are in train mode.
            print("In readSet, isTrain is " + str(isTrain))
            if isTrain and Attributes.getOutputNumAttributes(Attributes) == 1:
                print("Begin Attributes.initStatistics......")
                Attributes.initStatistics(Attributes)

            # A temporal vector is used to store the instances read.
            print("Reading the data")
            tempSet = []
            print("begin instance_parser.getLines()...... ")
            lines = self.data_lines  # stored by parseHeader
            header_tags = ("@relation", "@attribute", "@inputs", "@outputs", "@data")
            new_data_lines = [
                row for row in lines
                if not any(tag in row for tag in header_tags)
            ]
            print("********* There are : " + str(len(lines))
                  + "In original Data lines ********* ")
            print("********* There are : " + str(len(new_data_lines))
                  + " In new Data lines ********* ")
            for line in new_data_lines:
                print("Data line: " + str(line))
                newInstance = Instance()
                print("tempSet that pass to setThreeParameters is: " + str(len(tempSet)))
                newInstance.setThreeParameters(line, isTrain, len(tempSet))
                tempSet.append(newInstance)

            # The vector of instances is converted to an array of instances.
            sizeInstance = len(tempSet)
            print(" Number of instances read: " + str(sizeInstance))
            self.instanceSet = list(tempSet)
            print("After converting all instances")

            # Report from the same logger that is checked.  A freshly created
            # local FormatErrorKeeper used to be listed here, so no error was
            # ever printed.
            # NOTE(review): confirm whether the shared logger should also be
            # reset at the start of every read.
            if self.errorLogger.getNumErrors() > 0:
                errorNumber = len(self.errorLogger.getAllErrors())
                print("There has been " + str(errorNumber)
                      + "errors in the Dataset format.")
                for k in range(self.errorLogger.getNumErrors()):
                    self.errorLogger.getError(k).printErrorInfo()

            print("Finishing the statistics: (isTrain)" + str(isTrain)
                  + ", (# out attributes)"
                  + str(Attributes.getOutputNumAttributes(Attributes)))
            # If being on a train dataset, the statistics are finished
            if isTrain and Attributes.getOutputNumAttributes(Attributes) == 1:
                Attributes.finishStatistics(Attributes)
        finally:
            # Close the stream even when parsing fails.
            instance_parser.close()
        print("File LOADED CORRECTLY!!")
    except Exception as e:
        print("Unexpected error in readSet of InstanceSet class :" + str(e))
def parseHeader(self, parser, isTrain):
    """Parse the header lines of a data file up to the ``@data`` marker.

    Every line returned by ``parser.getLines()`` is kept in
    ``self.data_lines``.  Lines before ``@data`` are appended to
    ``self.header``; ``self.attHeader`` is the header text seen before the
    first ``@inputs``/``@outputs`` line (or the whole header when there is
    none).  For a training set the relation name, the attributes and the
    input/output name lists are registered as they are read.  Finally
    ``self.processInputsAndOutputs`` is called with what was found.

    Args:
        parser: Object whose ``getLines()`` returns the lines of the file.
        isTrain: True when the header belongs to the training set.
    """
    # 1. Declaration of variables
    inputAttrNames = []
    outputAttrNames = []
    inputsDef = False
    outputsDef = False
    self.header = ""
    lineCount = 0
    self.attHeader = None

    print("Begin to call the InstanceParser.getLines(),parser.getLines(), in InstanceSet.")
    lines = parser.getLines()
    self.data_lines = lines

    for line in lines:
        line = str(line).strip()
        print("In parseHeader method of InstanceSet, the line is:" + line)
        # Case-insensitive end-of-header test.  ``.lower()`` used to be
        # applied to the literal, so "@DATA" was not recognised.
        if line.lower() == "@data":
            break

        print(" Line read: " + line + ".")
        lineCount += 1
        if "@relation" in line:
            if isTrain:
                relationName = str(line.replace("@relation", "")).strip()
                print("set Relation name :" + str(relationName))
                # Stored on ``Attributes``: the header builders in this file
                # read it back with Attributes.getRelationName(Attributes).
                Attributes.setRelationName(Attributes, relationName)
        elif "@attribute" in line:
            if isTrain:
                print("Begin insertAttribute ......")
                self.insertAttribute(line)
        elif "@inputs" in line:
            print("@inputs in " + str(line))
            self.attHeader = self.header
            inputsDef = True
            aux = line[8:]  # text after "@inputs "
            if isTrain:
                print("Has @inputs, aux is :" + aux)
                self.insertInputOutput(aux, lineCount, inputAttrNames,
                                       "inputs", isTrain)
        elif "@outputs" in line:
            if self.attHeader is None:
                self.attHeader = self.header
            outputsDef = True
            print("Defining the output in line :" + line)
            # To get the output attribute name: the first token after the keyword
            sub_line = line.split()
            aux = sub_line[1]
            if isTrain:
                print("Has @outputs, aux is :" + aux)
                self.insertInputOutput(aux, lineCount, outputAttrNames,
                                       "outputs", isTrain)
            print("Size of the output is: " + str(len(outputAttrNames)))
        self.header += line + "\n"

    if self.attHeader is None:
        self.attHeader = self.header
    self.processInputsAndOutputs(isTrain, inputsDef, outputsDef,
                                 outputAttrNames, inputAttrNames)
def readClassificationSet(self, datasetFile, train):
    """Load a classification data set and copy it into this object's tables.

    Reads the file through ``self.__instanceSet``, then fills ``__X``,
    ``__y``, ``__missing``, ``__outputInteger``, ``__output``, ``emax`` and
    ``emin``, and sets ``__nClasses`` to the largest output index plus one.
    The process exits with status 1 when the data set has more than one
    output attribute or none.  Any other exception is printed and swallowed,
    after which ``computeStatistics()`` and ``computeInstancesPerClass()``
    are still called.

    Args:
        datasetFile: File name handed to ``readSet``.
        train: True when the file is the training set.
    """
    try:
        # Load in memory a dataset that contains a classification problem
        print("Inside readClassificationSet, datasetFile :" + str(datasetFile))
        print("train is :" + str(train))
        print("object instanceSet is :" + str(self.__instanceSet))
        if self.__instanceSet is None:
            print("self.__instanceSet is Null")
        else:
            print("self.__instanceSet is not None, train = " + str(train))
            self.__instanceSet.readSet(datasetFile, train)
        print("begin getNumInstances ...... in readClassificationSet ")
        self.__nData = self.__instanceSet.getNumInstances()
        print("In readClassificationSet , self.__nData is : " + str(self.__nData))
        self.__nInputs = Attributes.getInputNumAttributes(Attributes)
        print("In readClassificationSet , self.__nInputs is : " + str(self.__nInputs))
        self.__nVars = self.__nInputs + Attributes.getOutputNumAttributes(
            Attributes)
        print("In readClassificationSet , self.__nVars is : " + str(self.__nVars))

        # Check that there is only one output variable
        if Attributes.getOutputNumAttributes(Attributes) > 1:
            outAttrs = Attributes.getOutputAttributes(Attributes)
            print("Output Attributes number is bigger than 1")
            # enumerate keeps the counter running; it used to be reset to 1
            # on every iteration.
            for i, outAtt in enumerate(outAttrs, start=1):
                print("Att" + str(i) + str(outAtt.getName()))
            print("" + Attributes.getOutputAttributesHeader(Attributes))
            print("This algorithm can not process MIMO datasets")
            print("All outputs but the first one will be removed")
            exit(1)
        noOutputs = False
        if Attributes.getOutputNumAttributes(Attributes) < 1:
            print("This algorithm can not process datasets without outputs")
            print("Zero-valued output generated")
            noOutputs = True
            exit(1)

        print("define all the array in MyDataSet class......")
        # Initialice and fill our own tables
        print("The two dimension array X, dimension 1 is :" + str(self.__nData)
              + " ,Dimension 2 is :" + str(self.__nInputs))
        nDataLength = self.__nData
        nInputLength = self.__nInputs
        print("nDataLength = " + str(nDataLength))
        print("nInputLength = " + str(nInputLength))
        # One row per instance, one column per input.
        self.__X = [[None for _ in range(nInputLength)]
                    for _ in range(nDataLength)]
        self.__y = [None for _ in range(nDataLength)]
        self.__missing = [[None for _ in range(nInputLength)]
                          for _ in range(nDataLength)]
        self.__outputInteger = [None for _ in range(nDataLength)]
        self.__outputReal = [None for _ in range(nDataLength)]
        self.__output = ["" for _ in range(nDataLength)]

        # Maximum and minimum of inputs
        self.emax = [0.0 for _ in range(nInputLength)]
        self.emin = [0.0 for _ in range(nInputLength)]
        for n in range(nInputLength):
            self.emax[n] = Attributes.getAttributeByPos(
                Attributes, n).getMaxAttribute()
            self.emin[n] = Attributes.getAttributeByPos(
                Attributes, n).getMinAttribute()
            print("self.emax[n]:" + str(self.emax[n]))
            print("self.emin[n]:" + str(self.emin[n]))

        # All values are casted into double/integer
        self.__nClasses = 0
        for i in range(nDataLength):
            inst = self.__instanceSet.getInstance(i)
            # add class y from instance to y array here
            self.__y[i] = inst.y_class
            for j in range(nInputLength):
                input_Numeric_Value = self.__instanceSet.getInputNumericValue(i, j)
                print("self.__X [i] = " + str(i) + ",[j] = " + str(j)
                      + ",input_Numeric_Value:" + str(input_Numeric_Value))
                self.__X[i][j] = input_Numeric_Value
                print("after get self.__X[i][j]")
                self.__missing[i][j] = inst.getInputMissingValuesWithPos(j)
                print("after self.__missing[i][j]")
                if self.__missing[i][j]:
                    # Missing inputs are encoded just below the attribute minimum.
                    self.__X[i][j] = self.emin[j] - 1

            if noOutputs:
                print("noOutputs==True")
                self.__outputInteger[i] = 0
                self.__output[i] = ""
            else:
                print("noOutputs==False")
                # Cast to int so that the class count derived below stays an
                # integer and can be used with range().
                self.__outputInteger[i] = int(
                    self.__instanceSet.getOutputNumericValue(i, 0))
                print("self.__outputInteger[" + str(i) + "] = "
                      + str(self.__outputInteger[i]))
                self.__output[i] = self.__instanceSet.getOutputNominalValue(i, 0)
            if self.__outputInteger[i] > self.__nClasses:
                self.__nClasses = self.__outputInteger[i]

        # Class indices start at 0, so the count is the largest index plus one.
        self.__nClasses = self.__nClasses + 1
        print('Number of classes=' + str(self.__nClasses))
    except Exception as error:
        print("readClassificationSet: Exception in readSet, in readClassificationSet:"
              + str(error))
    self.computeStatistics()
    self.computeInstancesPerClass()
def getNames(self):
    """Return the names of the input attributes, in input order.

    Returns:
        list: ``self.__nInputs`` names, taken from
        ``Attributes.getInputAttribute(Attributes, i).getName()``.
    """
    return [
        Attributes.getInputAttribute(Attributes, position).getName()
        for position in range(self.__nInputs)
    ]
def hasNumericalAttributes(self):
    """Return the integer-attribute check if truthy, else the real-attribute check.

    The real-attribute check is made only when the integer check is falsy.
    """
    # NOTE(review): most calls in this file pass the ``Attributes`` class as
    # the first argument (e.g. ``Attributes.getInputAttribute(Attributes, i)``);
    # here the dataset instance is passed instead -- confirm this is intended.
    has_integer = Attributes.hasIntegerAttributes(self)
    if has_integer:
        return has_integer
    return Attributes.hasRealAttributes(self)
def getOutputValue(self, intValue):
    """Return the nominal value of output attribute 0 at index ``intValue``.

    Args:
        intValue: Index passed to ``getNominalValue``.
    """
    print("Before att get ")
    output_attribute = Attributes.getOutputAttribute(Attributes, 0)
    print("After att get ")
    nominal_value = output_attribute.getNominalValue(intValue)
    return nominal_value
def numberValues(self, attribute):
    """Return how many nominal values input attribute ``attribute`` has.

    Args:
        attribute: Position of the input attribute.
    """
    # Same call shape as the lookup in getRanges:
    # ``Attributes.getInputAttribute(Attributes, i).getNumNominalValues()``.
    # Previously the dataset instance was passed as the receiver and the
    # ``Attributes`` class was passed to ``getNumNominalValues``.
    input_attribute = Attributes.getInputAttribute(Attributes, attribute)
    return input_attribute.getNumNominalValues()