def readRegressionSet(self, datasetFile, train):
    """Load a regression dataset and copy it into this object's tables.

    The file is read through the instance set's ``readSet``; afterwards the
    input matrix, the missing-value mask, the per-input max/min bounds and
    the real/integer output vectors are (re)built from the loaded instances.

    Args:
        datasetFile: dataset file name, passed through to ``readSet``.
        train: training-mode flag, passed through to ``readSet``.

    The process is terminated with exit status 1 when the dataset declares
    more than one output attribute, or none at all. Any other exception is
    reported on stdout and swallowed. ``computeStatistics()`` is called
    afterwards in every non-exiting case.
    """
    try:
        # Load in memory a dataset that contains a regression problem
        self.__instanceSet.readSet(datasetFile, train)
        self.__nData = self.__instanceSet.getNumInstances()
        self.__nInputs = Attributes.getInputNumAttributes(Attributes)
        n_outputs = Attributes.getOutputNumAttributes(Attributes)
        self.__nVars = self.__nInputs + n_outputs

        print("In readRegressionSet , self.__nData is : " + str(self.__nData))
        print("In readRegressionSet , self.__nInputs is : " +
              str(self.__nInputs))
        print("In readRegressionSet , self.__nVars is : " + str(self.__nVars))

        # Check that there is only one output variable
        if n_outputs > 1:
            print("Out put attribute: ")
            print(Attributes.getOutputAttributesHeader(Attributes))
            print("This algorithm can not process MIMO datasets")
            print("All outputs but the first one will be removed")
            exit(1)

        noOutputs = False
        if n_outputs < 1:
            print("This algorithm can not process datasets without outputs")
            print("Zero-valued output generated")
            noOutputs = True
            exit(1)

        n_data = self.__nData
        n_inputs = self.__nInputs

        # Initialize and fill our own tables
        self.__X = [[0.0] * n_inputs for _ in range(n_data)]
        self.__missing = [[False] * n_inputs for _ in range(n_data)]
        # The real-valued output vector is written by index below, so it has
        # to be sized here just like the integer one.
        self.__outputReal = [0.0] * n_data
        self.__outputInteger = [0] * n_data

        # Maximum and minimum of inputs
        self.__emax = [None] * n_inputs
        self.__emin = [None] * n_inputs
        for j in range(n_inputs):
            attribute = Attributes.getAttributeByPos(Attributes, j)
            self.__emax[j] = attribute.getMaxAttribute()
            self.__emin[j] = attribute.getMinAttribute()

        # All values are cast into float / integer
        self.__nClasses = 0
        for i in range(n_data):
            inst = self.__instanceSet.getInstance(i)
            for j in range(n_inputs):
                self.__X[i][j] = self.__instanceSet.getInputNumericValue(i, j)
                # NOTE(review): readClassificationSet calls
                # getInputMissingValuesWithPos(j) for the same purpose;
                # confirm which accessor the Instance class really exposes.
                self.__missing[i][j] = inst.getInputMissingValues(j)
                if self.__missing[i][j]:
                    # Missing inputs are encoded as one below the minimum.
                    self.__X[i][j] = self.__emin[j] - 1

            if noOutputs:
                # Not reachable while the exit(1) above is in place.
                self.__outputReal[i] = 0
                self.__outputInteger[i] = 0
            else:
                self.__outputReal[i] = \
                    self.__instanceSet.getOutputNumericValue(i, 0)
                self.__outputInteger[i] = int(self.__outputReal[i])
    except OSError as error:
        print("OS error: {0}".format(error))
    except Exception as otherException:
        print("DBG: Exception in readSet:", sys.exc_info()[0])
        print(" In readRegressionSet other Exception is :" +
              str(otherException))

    self.computeStatistics()
def readSet(self, fileName, isTrain):
    """Read a dataset file and store its instances in ``self.instanceSet``.

    The file path is built from ``self.data_folder``, ``self.path_name`` and
    ``fileName``. The header is parsed with ``parseHeader``; every line of
    ``self.data_lines`` that carries no header tag is then turned into an
    ``Instance``.

    Args:
        fileName: name of the dataset file inside the configured folder.
        isTrain: when true (and exactly one output attribute is declared)
            the attribute statistics are initialised before, and finished
            after, the instances are read.

    Exceptions are reported on stdout and swallowed. The instance parser is
    closed whether or not reading succeeds.
    """
    # Function-scope import so the block does not depend on the file header.
    import os

    print("Before try in readSet of InstanceSet, fileName is :" +
          str(fileName) + ".")
    print("Opening the file in readSet of InstanceSet: " + str(fileName) + ".")
    try:
        # os.path.join uses the platform separator and accepts str as well as
        # path-like components, always handing a plain str to the parser.
        self.file_to_open = os.path.join(self.data_folder, self.path_name,
                                         fileName)
        print("In readSet,file_to_open is:" + str(self.file_to_open))

        # Declaring an instance parser
        instance_parser = InstanceParser(self.file_to_open, isTrain)
        try:
            # Reading information in the header, i.e., @relation, @attribute,
            # @inputs and @outputs
            print("In readSet finished read file " + str(self.file_to_open))
            self.parseHeader(instance_parser, isTrain)
            print(" The number of output attributes is: " +
                  str(Attributes.getOutputNumAttributes(Attributes)))

            # The attributes statistics are init if we are in train mode.
            print("In readSet, isTrain is " + str(isTrain))
            if isTrain and Attributes.getOutputNumAttributes(Attributes) == 1:
                print("Begin Attributes.initStatistics......")
                Attributes.initStatistics(Attributes)

            # A temporary list is used to store the instances read.
            print("Reading the data")
            tempSet = []
            print("begin instance_parser.getLines()...... ")
            lines = self.data_lines
            print("********* There are : " + str(len(lines)) +
                  "In original Data lines ********* ")

            # Keep only the lines that carry no header tag.
            header_tags = ("@relation", "@attribute", "@inputs", "@outputs",
                           "@data")
            new_data_lines = [
                line for line in lines
                if not any(tag in line for tag in header_tags)
            ]
            print("********* There are : " + str(len(new_data_lines)) +
                  " In new Data lines ********* ")

            for line in new_data_lines:
                print("Data line: " + str(line))
                newInstance = Instance()
                print("tempSet that pass to setThreeParameters is: " +
                      str(len(tempSet)))
                # The third argument is the number of instances read so far.
                newInstance.setThreeParameters(line, isTrain, len(tempSet))
                tempSet.append(newInstance)

            sizeInstance = len(tempSet)
            print(" Number of instances read: " + str(sizeInstance))
            self.instanceSet = list(tempSet)
            print("After converting all instances")

            # Report from the same logger that is checked for errors.
            # NOTE(review): the previous code created a fresh local
            # FormatErrorKeeper here and reported from it, so the report was
            # always empty; confirm whether self.errorLogger should also be
            # reset at the start of each read.
            error_logger = self.errorLogger
            num_errors = error_logger.getNumErrors()
            if num_errors > 0:
                errorNumber = len(error_logger.getAllErrors())
                print("There has been " + str(errorNumber) +
                      "errors in the Dataset format.")
                for k in range(num_errors):
                    error_logger.getError(k).printErrorInfo()

            print("Finishing the statistics: (isTrain)" + str(isTrain) +
                  ", (# out attributes)" +
                  str(Attributes.getOutputNumAttributes(Attributes)))
            # If being on a train dataset, the statistics are finished
            if isTrain and Attributes.getOutputNumAttributes(Attributes) == 1:
                Attributes.finishStatistics(Attributes)
        finally:
            # Close the stream even when parsing failed part-way through.
            instance_parser.close()
        print("File LOADED CORRECTLY!!")
    except Exception as e:
        print("Unexpected error in readSet of InstanceSet class :" + str(e))
def readClassificationSet(self, datasetFile, train):
    """Load a classification dataset and copy it into this object's tables.

    The file is read through the instance set's ``readSet``; afterwards the
    input matrix, the class vector, the missing-value mask, the per-input
    max/min bounds and the integer/nominal output vectors are rebuilt from
    the loaded instances. The number of classes is set to the largest
    integer output value plus one.

    Args:
        datasetFile: dataset file name, passed through to ``readSet``.
        train: training-mode flag, passed through to ``readSet``.

    The process is terminated with exit status 1 when the dataset declares
    more than one output attribute, or none at all. Any other exception is
    reported on stdout and swallowed. ``computeStatistics()`` and
    ``computeInstancesPerClass()`` are called afterwards in every
    non-exiting case.
    """
    try:
        # Load in memory a dataset that contains a classification problem
        print("Inside readClassificationSet, datasetFile :" +
              str(datasetFile))
        print("train is :" + str(train))
        print("object instanceSet is :" + str(self.__instanceSet))
        if self.__instanceSet is None:
            # The getNumInstances() call below then fails and is reported by
            # the except clause at the bottom.
            print("self.__instanceSet is Null")
        else:
            print("self.__instanceSet is not None, train = " + str(train))
            self.__instanceSet.readSet(datasetFile, train)

        print("begin getNumInstances ...... in readClassificationSet ")
        self.__nData = self.__instanceSet.getNumInstances()
        print("In readClassificationSet , self.__nData is : " +
              str(self.__nData))
        self.__nInputs = Attributes.getInputNumAttributes(Attributes)
        print("In readClassificationSet , self.__nInputs is : " +
              str(self.__nInputs))
        n_outputs = Attributes.getOutputNumAttributes(Attributes)
        self.__nVars = self.__nInputs + n_outputs
        print("In readClassificationSet , self.__nVars is : " +
              str(self.__nVars))

        # Check that there is only one output variable
        if n_outputs > 1:
            outAttrs = Attributes.getOutputAttributes(Attributes)
            print("Output Attributes number is bigger than 1")
            # Number the attributes 1..k in the listing.
            for position, outAtt in enumerate(outAttrs, start=1):
                print("Att" + str(position) + str(outAtt.getName()))
            print("" + Attributes.getOutputAttributesHeader(Attributes))
            print("This algorithm can not process MIMO datasets")
            print("All outputs but the first one will be removed")
            exit(1)

        noOutputs = False
        if n_outputs < 1:
            print("This algorithm can not process datasets without outputs")
            print("Zero-valued output generated")
            noOutputs = True
            exit(1)

        print("define all the array in MyDataSet class......")
        # Initialize and fill our own tables
        print("The two dimension array X, dimension 1 is :" +
              str(self.__nData) + " ,Dimension 2 is :" + str(self.__nInputs))
        nDataLength = self.__nData
        nInputLength = self.__nInputs
        print("nDataLength = " + str(nDataLength))
        print("nInputLength = " + str(nInputLength))

        # One row per instance, one column per input attribute.
        self.__X = [[None] * nInputLength for _ in range(nDataLength)]
        self.__y = [None] * nDataLength
        self.__missing = [[None] * nInputLength for _ in range(nDataLength)]
        self.__outputInteger = [None] * nDataLength
        self.__outputReal = [None] * nDataLength
        self.__output = [""] * nDataLength

        # Maximum and minimum of inputs
        self.emax = [0.0] * nInputLength
        self.emin = [0.0] * nInputLength
        for n in range(nInputLength):
            attribute = Attributes.getAttributeByPos(Attributes, n)
            self.emax[n] = attribute.getMaxAttribute()
            self.emin[n] = attribute.getMinAttribute()
            print("self.emax[n]:" + str(self.emax[n]))
            print("self.emin[n]:" + str(self.emin[n]))

        # All values are cast into float / integer
        self.__nClasses = 0
        for i in range(nDataLength):
            inst = self.__instanceSet.getInstance(i)
            # Copy the class stored on the instance into the y array.
            self.__y[i] = inst.y_class
            for j in range(nInputLength):
                input_Numeric_Value = \
                    self.__instanceSet.getInputNumericValue(i, j)
                print("self.__X [i] = " + str(i) + ",[j] = " + str(j) +
                      ",input_Numeric_Value:" + str(input_Numeric_Value))
                self.__X[i][j] = input_Numeric_Value
                print("after get self.__X[i][j]")
                self.__missing[i][j] = inst.getInputMissingValuesWithPos(j)
                print("after self.__missing[i][j]")
                if self.__missing[i][j]:
                    # Missing inputs are encoded as one below the minimum.
                    self.__X[i][j] = self.emin[j] - 1

            if noOutputs:
                # Not reachable while the exit(1) above is in place.
                print("noOutputs==True")
                self.__outputInteger[i] = 0
                self.__output[i] = ""
            else:
                print("noOutputs==False")
                # Cast to int (as readRegressionSet does) so the class count
                # derived from it below is an integer too.
                self.__outputInteger[i] = int(
                    self.__instanceSet.getOutputNumericValue(i, 0))
                print("self.__outputInteger[" + str(i) + "] = " +
                      str(self.__outputInteger[i]))
                self.__output[i] = \
                    self.__instanceSet.getOutputNominalValue(i, 0)

            if self.__outputInteger[i] > self.__nClasses:
                self.__nClasses = self.__outputInteger[i]

        # The largest integer output value seen, plus one.
        self.__nClasses = self.__nClasses + 1
        print('Number of classes=' + str(self.__nClasses))
    except Exception as error:
        print(
            "readClassificationSet: Exception in readSet, in readClassificationSet:"
            + str(error))

    self.computeStatistics()
    self.computeInstancesPerClass()