Esempio n. 1
0
    def readRegressionSet(self, datasetFile, train, file_path):

        try:
            # Load in memory a dataset that contains a regression problem
            self.instance_set.readSet(datasetFile, train, file_path)
            self.ndata = self.instance_set.getNumInstances()
            self.ninputs = Attributes.getInputNumAttributes(Attributes)
            self.nvars = self.ninputs + Attributes.getOutputNumAttributes(
                Attributes)
            # print("In readRegressionSet , self.ndata is : " + str(self.ndata))
            # print("In readRegressionSet , self.ninputs is : " + str(self.ninputs))
            # print("In readRegressionSet , self.nvars is : " + str(self.nvars))

            # outputIntegerheck that there is only one output variable
            if Attributes.getOutputNumAttributes(Attributes) > 1:
                # print("Out put attribute: ")
                outPutAttHeader = Attributes.getOutputAttributesHeader(
                    Attributes)
                # print(outPutAttHeader)
                # print("This algorithm can not process MIMO datasets")
                # print("All outputs but the first one will be removed")
                exit(1)

            noOutputs = False
            if Attributes.getOutputNumAttributes(Attributes) < 1:
                # print("This algorithm can not process datasets without outputs")
                # print("Zero-valued output generated")
                noOutputs = True
                print("noOutputs = True, exit 1 !!!!!")
                exit(1)
            # Initialice and fill our own tables
            self.x_array = [[0.0 for y in range(self.ninputs)]
                            for x in range(self.ndata)]
            self.missing_array = [[False for y in range(self.ninputs)]
                                  for x in range(self.ndata)]
            self.output_integer_array = [0 for x in range(self.ndata)]

            # Maximum and minimum of inputs
            self.emax_array = [None for x in range(self.ninputs)]
            self.emin_array = [None for x in range(self.ninputs)]
            for i in range(0, self.ninputs):
                self.emax_array[i] = Attributes.getAttributeByPos(
                    Attributes, i).getMaxAttribute()
                self.emin_array[i] = Attributes.getAttributeByPos(
                    Attributes, i).getMinAttribute()

            # All values are casted into double / integer
            self.nclasses = 0

            for i in range(0, self.ndata):
                inst = self.instance_set.getInstance(i)
                for j in range(0, self.ninputs):
                    self.x_array[i][
                        j] = self.instance_set.getInputNumericValue(
                            i, j)  # inst.getInputRealValues(j);
                    self.missing_array[i][j] = inst.getInputMissingValues(j)
                    if self.missing_array[i][j]:
                        self.x_array[i][j] = self.emin_array[j] - 1

                if noOutputs:
                    print("noOutputs self.output_real_array[i]" + str(i) +
                          "is 0 ")
                    self.output_real_array[i] = 0

                    self.output_integer_array[i] = 0

                else:
                    print("noOutputs else part:")

                    self.output_real_array[
                        i] = self.instance_set.getOutputNumericValue(i, 0)
                    print("self.output_real_array[i]" + str(i) +
                          str(self.output_real_array[i]))
                    self.output_integer_array[i] = int(
                        self.output_real_array[i])
        except OSError as error:
            print("OS error: {0}".format(error))
        except Exception as otherException:
            # print("DBG: Exception in readSet:", sys.exc_info()[0])
            print(" In readRegressionSet other Exception  is :" +
                  str(otherException))

        self.computeStatistics()
Esempio n. 2
0
    def read_classification_set(self, dataset_file, train, file_path):
        try:
            # Load in memory a dataset that contains a classification problem
            print("Inside read_classification_set, datasetFile :" +
                  str(dataset_file))
            # print("train is :" + str(train))
            # print("object instanceSet is :" + str(self.instance_set))

            if self.instance_set is None:
                print("self.instance_set is Null")
            else:
                no_outputs = None
                print("self.instance_set is not None, train = " + str(train))
                self.instance_set.read_set(dataset_file, train, file_path)
                print(
                    "begin getNumInstances ...... in read_classification_set ")
                self.ndata = self.instance_set.getNumInstances()
                print(
                    "In readCread_classification_setlassificationSet , self.ndata is : "
                    + str(self.ndata))
                self.ninputs = Attributes.getInputNumAttributes(Attributes)
                print("In read_classification_set , self.ninputs is : " +
                      str(self.ninputs))
                self.nvars = self.ninputs + Attributes.getOutputNumAttributes(
                    Attributes)
                print("In read_classification_set , self.nvars is : " +
                      str(self.nvars))

                # outputInteger check that there is only one output variable
                if Attributes.getOutputNumAttributes(Attributes) > 1:
                    outAttrs = Attributes.getOutputAttributes(Attributes)
                    # print("Output Attributes number is bigger than 1")
                    i = 1
                    for outAtt in outAttrs:
                        # print("Att" + str(i) + str(outAtt.getName()))
                        i = i + 1
                    # print("" + Attributes.getOutputAttributesHeader(Attributes))
                    print(
                        "This algorithm can not process MIMO datasets !!! exit 1"
                    )
                    # print("All outputs but the first one will be removed")
                    exit(1)
                no_outputs = False
                if Attributes.getOutputNumAttributes(Attributes) < 1:
                    print(
                        "This algorithm can not process datasets without outputs !!!!!!"
                    )
                    # print("Zero-valued output generated")
                    no_outputs = True
                    exit(1)

                # print("define all the array in MyDataSet class......")
                # Initialice and fill our own tables
                # print("The two dimension array X, dimension 1 is :" + str(self.ndata) + " ,Dimension 2 is :" + str(self.ninputs))

                ndata_length = self.ndata
                ninput_length = self.ninputs
                print("nDataLength = " + str(ndata_length))
                # print("nInputLength = " + str(nInputLength))
                # [[0 for j in range(m)] for i in range(n)] first column, then row

                self.x_array = [[0.0 for y in range(ninput_length)]
                                for x in range(ndata_length)]

                self.missing_array = [[True for y in range(ninput_length)]
                                      for x in range(ndata_length)]

                self.nominal_array = [True for x in range(ninput_length)]
                self.integer_array = [True for x in range(ninput_length)]

                self.output_integer_array = [0 for x in range(ndata_length)]

                self.output_real_array = [0.0 for x in range(ndata_length)]
                self.output_array = ["" for x in range(ndata_length)]

                # Maximum and minimum of inputs
                self.emax = [0.0 for x in range(ninput_length)]
                self.emin = [0.0 for x in range(ninput_length)]

                for i in range(0, ninput_length):

                    attribute_instance: Attribute = Attributes.getInputAttribute(
                        Attributes, i)

                    if attribute_instance.getNumNominalValues() > 0:
                        self.emin[i] = 0
                        self.emax[i] = Attributes.getInputAttribute(
                            i).getNumNominalValues() - 1
                    else:
                        self.emax[i] = Attributes.getAttributeByPos(
                            Attributes, i).getMaxAttribute()
                        self.emin[i] = Attributes.getAttributeByPos(
                            Attributes, i).getMinAttribute()

                    if attribute_instance.getType() == Attribute.NOMINAL:
                        self.nominal_array[i] = True
                        self.integer_array[i] = False
                    elif attribute_instance.getType() == Attribute.INTEGER:
                        self.nominal_array[i] = False
                        self.integer_array[i] = True
                    else:
                        self.nominal_array[i] = False
                        self.integer_array[i] = False

                    # print("self.emax[n]:" + str(self.emax[n]))
                    # print("self.emin[n]:" + str(self.emin[n]))
                # All values are casted into double/integer

                self.nclasses = 0
                for i in range(0, ndata_length):
                    inst = self.instance_set.getInstance(i)
                    for j in range(0, ninput_length):
                        input_Numeric_Value = self.instance_set.getInputNumericValue(
                            i, j)
                        # # print("self.x_array [i] = " + str(i) + ",[j] = " + str(j) + ",input_Numeric_Value:" + str(
                        #  input_Numeric_Value))

                        self.x_array[i][
                            j] = input_Numeric_Value  # inst.getInputRealValues(j);
                        # # print("after get self.x_array[i][j]")
                        self.missing_array[i][
                            j] = inst.getInputMissingValuesWithPos(j)
                        # # print("after self.missing_array[i][j]")
                        if self.missing_array[i][j]:
                            self.x_array[i][j] = self.emin[j] - 1

                    if no_outputs:
                        # print("no_outputs==True")
                        self.output_integer_array[i] = 0
                        # elf.output_real_array[i] = 0.0
                        self.output_array[i] = ""
                    else:
                        # print("no_outputs==False")
                        self.output_integer_array[
                            i] = self.instance_set.getOutputNumericValue(i, 0)
                        # print(" 202001-1 self.output_integer_array[ "+str(i)+"]"+ str( self.output_integer_array[i]))
                        # self.output_real_array[i] = self.instance_set.getOutputNumericValue(i, 0)
                        # print("self.output_integer_array[" + str(i) + "] = " + str(self.output_integer_array[i]))
                        self.output_array[
                            i] = self.instance_set.getOutputNominalValue(i, 0)
                    # print(" 202001-1 self.output_integer_array[ " + str(i) + "]" + str(self.output_integer_array[i]))
                    if self.output_integer_array[i] > self.nclasses:
                        self.nclasses = self.output_integer_array[i]

                self.nclasses = self.nclasses + 1
                print('Number of classes=' + str(self.nclasses))
        except Exception as error:
            print(
                "read_classification_set: Exception in readSet, in read_classification_set:"
                + str(error))

        # self.computeStatistics()
        self.compute_instances_per_class()
Esempio n. 3
0
    def readClassificationSet(self, datasetFile, train):
        try:
            # Load in memory a dataset that contains a classification problem
            print("Inside readClassificationSet, datasetFile :" +
                  str(datasetFile))
            print("train is :" + str(train))
            print("object instanceSet is :" + str(self.__instanceSet))
            if (self.__instanceSet is None):
                print("self.__instanceSet is Null")
            else:
                print("self.__instanceSet is not None, train = " + str(train))
                self.__instanceSet.readSet(datasetFile, train)
                print("begin getNumInstances ...... in readClassificationSet ")
                self.__nData = self.__instanceSet.getNumInstances()
                print("In readClassificationSet , self.__nData is : " +
                      str(self.__nData))
                self.__nInputs = Attributes.getInputNumAttributes(Attributes)
                print("In readClassificationSet , self.__nInputs is : " +
                      str(self.__nInputs))
                self.__nVars = self.__nInputs + Attributes.getOutputNumAttributes(
                    Attributes)
                print("In readClassificationSet , self.__nVars is : " +
                      str(self.__nVars))

                # outputIntegerheck that there is only one output variable
                if (Attributes.getOutputNumAttributes(Attributes) > 1):
                    outAttrs = Attributes.getOutputAttributes(Attributes)
                    print("Output Attributes number is bigger than 1")
                    for outAtt in outAttrs:
                        i = 1
                        print("Att" + str(i) + str(outAtt.getName()))
                        i = i + 1
                    print("" +
                          Attributes.getOutputAttributesHeader(Attributes))
                    print("This algorithm can not process MIMO datasets")
                    print("All outputs but the first one will be removed")
                    exit(1)
                noOutputs = False
                if (Attributes.getOutputNumAttributes(Attributes) < 1):
                    print(
                        "This algorithm can not process datasets without outputs"
                    )
                    print("Zero-valued output generated")
                    noOutputs = True
                    exit(1)

                print("define all the array in MyDataSet class......")
                #Initialice and fill our own tables
                print("The two dimension array X, dimension 1 is :" +
                      str(self.__nData) + " ,Dimension 2 is :" +
                      str(self.__nInputs))

                nDataLength = self.__nData
                nInputLength = self.__nInputs
                print("nDataLength = " + str(nDataLength))
                print("nInputLength = " + str(nInputLength))

                #[[0 for j in range(m)] for i in range(n)] first column, then row

                self.__X = [[None for y in range(nInputLength)]
                            for x in range(nDataLength)]
                self.__nominal = [None for x in range(nInputLength)]
                self.__integer = [None for x in range(nInputLength)]
                # boolean array
                self.__missing = [[None for y in range(nInputLength)]
                                  for x in range(nDataLength)]
                # boolean array
                self.__outputInteger = [None for x in range(nDataLength)]
                # boolean array
                self.__outputReal = [None for x in range(nDataLength)]

                self.__output = ["" for x in range(nDataLength)]

                # Maximum and minimum of inputs
                self.emax = [None for x in range(nInputLength)]
                self.emin = [None for x in range(nInputLength)]

                for i in range(0, nInputLength):
                    print("inside loop to get each min and max")
                    if Attributes.getInputAttribute(
                            Attributes, i).getNumNominalValues() > 0:
                        print(
                            "getInputAttribute(self,i).getNumNominalValues() > 0"
                        )
                        self.emin[i] = 0
                        self.emax[i] = Attributes.getInputAttribute(
                            Attributes, i).getNumNominalValues() - 1
                    else:
                        print(
                            "getInputAttribute(self,i).getNumNominalValues() < 0"
                        )
                        self.emin[i] = Attributes.getInputAttribute(
                            Attributes, i).getMinAttribute()
                        self.emax[i] = Attributes.getInputAttribute(
                            Attributes, i).getMaxAttribute()

                    if Attributes.getAttributes(
                            Attributes)[i].getType() == Attribute.NOMINAL:
                        print("Attribute.NOMINAL")
                        self.__nominal[i] = True
                        self.__integer[i] = False
                    elif Attributes.getInputAttribute(
                            Attributes, i).getType() == Attribute.INTEGER:
                        print("Attribute.INTEGER")
                        self.__nominal[i] = False
                        self.__integer[i] = True
                    else:
                        print("Attribute others begin ")
                        self.__nominal[i] = False
                        self.__integer[i] = False
                        print("Attribute others is finished")
                """
                    compare with chi algorithm
                    for n in range( 0,nInputLength):
                     self.emax[n] = Attributes.getAttributeByPos(Attributes,n).getMaxAttribute()
                     self.emin[n] = Attributes.getAttributeByPos(Attributes,n).getMinAttribute()
                     print("self.emax[n]:"+ str(self.emax[n]))
                     print("self.emin[n]:" + str(self.emin[n]))
                  """

                # All values are casted into double/integer
                self.__nClasses = 0
                print("Before loop of set X[][] value")
                for i in range(0, nDataLength):
                    inst = self.__instanceSet.getInstance(i)
                    for j in range(0, nInputLength):
                        input_Numeric_Value = self.__instanceSet.getInputNumericValue(
                            i, j)
                        #print("self.__X [i] = "+ str(i)+",[j] = "+ str(j)+",input_Numeric_Value:"+str(input_Numeric_Value))

                        self.__X[i][
                            j] = input_Numeric_Value  #inst.getInputRealValues(j);
                        #print("after get self.__X[i][j]")
                        self.__missing[i][
                            j] = inst.getInputMissingValuesWithPos(j)
                        #print("after self.__missing[i][j]")
                        if self.__missing[i][j]:
                            self.__X[i][j] = self.emin[j] - 1

                    if noOutputs:
                        print("noOutputs==True")
                        self.__outputInteger[i] = 0
                        self.__output[i] = ""
                    else:
                        #print("noOutputs==False")
                        self.__outputInteger[
                            i] = self.__instanceSet.getOutputNumericValue(
                                i, 0)
                        #print("self.__outputInteger["+str(i)+"] = "+str(self.__outputInteger[i]))
                        self.__output[
                            i] = self.__instanceSet.getOutputNominalValue(
                                i, 0)

                    if self.__outputInteger[i] > self.__nClasses:
                        self.__nClasses = self.__outputInteger[i]

                self.__nClasses = self.__nClasses + 1
                print('Number of classes=' + str(self.__nClasses))
        except Exception as error:
            print(
                "readClassificationSet: Exception in readSet, in readClassificationSet:"
                + str(error))

        self.computeInstancesPerClass()
class MyDataSet:
    # Number to represent type of variable real or double.
    REAL = 0
    # *Number to represent type of variable integer.*
    INTEGER = 1
    # *Number to represent type of variable nominal.*
    NOMINAL = 2

    x_array = []  # examples array
    missing_array = []  # possible missing values
    output_integer_array = [
    ]  # output of the data - set as integer values private
    output_real_array = []  # output of the data - set as double values
    output_array = []  # output of the data - set as string values
    emax_array = []  # max value of an attribute private
    emin_array = []  # min value of an attribute

    ndata = None  # Number of examples
    nvars = None  # Numer of variables
    ninputs = None  # Number of inputs
    nclasses = None  # Number of outputs

    instance_set = None  # The whole instance set
    stdev_array = []
    average_array = []  # standard deviation and average of each attribute
    instances_cl = []

    # nominal  attributes bool array
    nominal_array = []
    # integer   attributes int array
    integer_array = []

    frequent_class_array = []
    attributes = None

    #  *Init a new set of instances

    def __init__(self):
        self.instance_set = InstanceSet()
        self.attributes = Attributes()

    # '''
    #    * Outputs an array of examples with their corresponding attribute values.
    #    * @return double[][] an array of examples with their corresponding attribute values
    #  '''
    def get_x(self):
        return self.x_array

    def set_x(self, x_parameter):
        self.x_array = x_parameter

    # '''
    #    * Output a specific example
    #    * @param pos int position (id) of the example in the data-set
    #    * @return double[] the attributes of the given example
    # '''
    def get_example(self, pos):
        # # print(" In getExample, len(self.x_array) = " + str(len(self.x_array)) + ", pos = " + str(
        #   pos) + "  ," + "self.x_array[pos] ==" + str(self.x_array[pos]))
        return self.x_array[pos]

    # * Returns the output of the data-set as integer values
    # * @return int[] an array of integer values corresponding to the output values of the dataset

    def get_output_as_integer(self):
        size = len(self.output_integer_array)
        output = [0 for x in range(size)]
        for i in range(0, size):
            output[i] = self.output_integer_array[i]
        return output

    #    * Returns the output of the data-set as real values
    #    * @return double[] an array of real values corresponding to the output values of the dataset

    def get_output_as_real(self):
        output_length = len(self.output_real_array)
        output = [0.0 for x in range(output_length)]
        for i in range(0, len(self.output_real_array)):
            output[i] = self.output_integer_array[i]
        return output

    #    * Returns the output of the data-set as nominal values
    #    * @return String[] an array of nomianl values corresponding to the output values of the dataset
    #

    def get_output_as_string(self):
        output_length = len(self.output_array)
        output = ["" for x in range(output_length)]
        for i in range(0, output_length):
            output[i] = self.output_array[i]

        return output

    #    * It returns the output value of the example "pos"
    #    * @param pos int the position (id) of the example
    #    * @return String a string containing the output value

    def get_output_as_string_with_pos(self, pos):
        # # print("pos is in getOutputAsStringWithPos "+str(pos))
        # maybe the exception is here.
        return self.output_array[pos]

    #    * It returns the output value of the example "pos"
    #    * @param pos int the position (id) of the example
    #    * @return int an integer containing the output value

    def get_output_as_integer_with_pos(self, pos):
        return self.output_integer_array[pos]

    def set_output_integer_array(self, integer_array):
        self.output_integer_array = integer_array

    def set_output_array(self, output_array):
        self.output_array = output_array

    #    * It returns the output value of the example "pos"
    #    * @param pos int the position (id) of the example
    #    * @return double a real containing the output value

    def get_output_as_real_with_pos(self, pos):
        return self.output_real_array[pos]

        # *It returns an array with the maximum values of the attributes
        # * @ return double[] an array with the maximum values of the attributes
        #

    def get_emax(self):
        return self.emax_array

        # *It returns an array with the minimum values of the attributes
        # * @ return double[] an array with the minimum values of the attributes

    def get_emin(self):
        return self.emin_array

    # *It returns the maximum value of the given attribute
    # *
    # * @ param variable the index of the attribute
    # * @ return the maximum value of the given attribute

    def get_max(self, variable):
        return self.emax_array[variable]

    # *It returns the minimum value of the given attribute
    #
    # * @ param variable the index of the attribute
    # * @ return the minimum value of the given attribute

    def get_min(self, variable):
        return self.emin_array[variable]

    # *It gets the size of the data - set
    # * @ return int the number of examples in the data - set

    def get_ndata(self):

        return self.ndata

    def set_ndata(self, ndata):
        self.ndata = ndata

    # *It gets the number of variables of the data - set(including the output)
    # * @ return int the number of variables of the data - set(including the output)

    # modified at 2020-08-14
    def get_nvars(self):
        return self.nvars

    #    * It gets the number of input attributes of the data-set
    #    * @return int the number of input attributes of the data-set

    def get_ninputs(self):
        return self.ninputs

    def set_ninputs(self, ninputs_value):
        self.ninputs = ninputs_value

    #    * It gets the number of output attributes of the data-set (for example number of classes in classification)
    #    * @return int the number of different output values of the data-set

    def get_nclasses(self):
        return self.nclasses

    def set_nclasses(self, nclasses_value):
        self.nclasses = nclasses_value

    # added by rui for granularity rule generation
    def calculate_nclasses_for_small_granularity_zone(self,
                                                      output_integer_array):
        class_number = 0
        class_array = []
        has_class = False
        for i in range(0, len(output_integer_array)):
            # # print(" output_integer_array[i] " + str(output_integer_array[i]))
            if len(class_array) == 0:
                class_array.append(output_integer_array[i])
            else:
                has_class = False
                for j in range(0, len(class_array)):
                    if class_array[j] == output_integer_array[i]:
                        # # print(" class_array[j] " + str(class_array[j]))
                        has_class = True
                if not has_class:
                    class_array.append(output_integer_array[i])
        class_number = len(class_array)
        return class_number

    #  * This function checks if the attribute value is missing
    #  * @param i int Example id
    #  * @param j int Variable id
    #  * @return boolean True is the value is missing, else it returns false

    def is_missing(self, i, j):
        return self.missing_array[i][j]

    #  * It reads the whole input data-set and it stores each example and its associated output value in
    #  * local arrays to ease their use.
    #  * @param datasetFile String name of the file containing the dataset
    #  * @param train boolean It must have the value "true" if we are reading the training data-set
    #  * @throws IOException If there ocurs any problem with the reading of the data-set

    def read_classification_set(self, dataset_file, train, file_path):
        try:
            # Load in memory a dataset that contains a classification problem
            print("Inside read_classification_set, datasetFile :" +
                  str(dataset_file))
            # print("train is :" + str(train))
            # print("object instanceSet is :" + str(self.instance_set))

            if self.instance_set is None:
                print("self.instance_set is Null")
            else:
                no_outputs = None
                print("self.instance_set is not None, train = " + str(train))
                self.instance_set.read_set(dataset_file, train, file_path)
                print(
                    "begin getNumInstances ...... in read_classification_set ")
                self.ndata = self.instance_set.getNumInstances()
                print(
                    "In readCread_classification_setlassificationSet , self.ndata is : "
                    + str(self.ndata))
                self.ninputs = self.attributes.getInputNumAttributes()
                print("In read_classification_set , self.ninputs is : " +
                      str(self.ninputs))
                self.nvars = self.ninputs + self.attributes.getOutputNumAttributes(
                )
                print("In read_classification_set , self.nvars is : " +
                      str(self.nvars))

                # outputInteger check that there is only one output variable
                if self.attributes.getOutputNumAttributes() > 1:
                    outAttrs = self.attributes.getOutputAttributes()
                    # print("Output Attributes number is bigger than 1")
                    i = 1
                    for outAtt in outAttrs:
                        # print("Att" + str(i) + str(outAtt.getName()))
                        i = i + 1
                    # print("" + Attributes.getOutputAttributesHeader(Attributes))
                    print(
                        "This algorithm can not process MIMO datasets !!! exit 1"
                    )
                    # print("All outputs but the first one will be removed")
                    exit(1)
                no_outputs = False
                if self.attributes.getOutputNumAttributes() < 1:
                    print(
                        "This algorithm can not process datasets without outputs !!!!!!"
                    )
                    # print("Zero-valued output generated")
                    no_outputs = True
                    exit(1)

                # print("define all the array in MyDataSet class......")
                # Initialice and fill our own tables
                # print("The two dimension array X, dimension 1 is :" + str(self.ndata) + " ,Dimension 2 is :" + str(self.ninputs))

                ndata_length = self.ndata
                ninput_length = self.ninputs
                print("nDataLength = " + str(ndata_length))
                # print("nInputLength = " + str(nInputLength))
                # [[0 for j in range(m)] for i in range(n)] first column, then row

                self.x_array = [[0.0 for y in range(ninput_length)]
                                for x in range(ndata_length)]

                self.missing_array = [[True for y in range(ninput_length)]
                                      for x in range(ndata_length)]

                self.nominal_array = [True for x in range(ninput_length)]
                self.integer_array = [True for x in range(ninput_length)]

                self.output_integer_array = [0 for x in range(ndata_length)]

                self.output_real_array = [0.0 for x in range(ndata_length)]
                self.output_array = ["" for x in range(ndata_length)]

                # Maximum and minimum of inputs
                self.emax = [0.0 for x in range(ninput_length)]
                self.emin = [0.0 for x in range(ninput_length)]

                for i in range(0, ninput_length):

                    attribute_instance = self.attributes.getInputAttribute(i)

                    if attribute_instance.getNumNominalValues() > 0:
                        self.emin[i] = 0
                        self.emax[i] = self.attributes.getInputAttribute(
                            i).getNumNominalValues() - 1
                    else:
                        self.emax[i] = self.attributes.getAttributeByPos(
                            i).getMaxAttribute()
                        self.emin[i] = self.attributes.getAttributeByPos(
                            i).getMinAttribute()

                    if attribute_instance.getType() == Attribute.NOMINAL:
                        self.nominal_array[i] = True
                        self.integer_array[i] = False
                    elif attribute_instance.getType() == Attribute.INTEGER:
                        self.nominal_array[i] = False
                        self.integer_array[i] = True
                    else:
                        self.nominal_array[i] = False
                        self.integer_array[i] = False

                    # print("self.emax[n]:" + str(self.emax[n]))
                    # print("self.emin[n]:" + str(self.emin[n]))
                # All values are casted into double/integer

                self.nclasses = 0
                for i in range(0, ndata_length):
                    inst = self.instance_set.getInstance(i)
                    for j in range(0, ninput_length):
                        input_Numeric_Value = self.instance_set.getInputNumericValue(
                            i, j)
                        # # print("self.x_array [i] = " + str(i) + ",[j] = " + str(j) + ",input_Numeric_Value:" + str(
                        #  input_Numeric_Value))

                        self.x_array[i][
                            j] = input_Numeric_Value  # inst.getInputRealValues(j);
                        # # print("after get self.x_array[i][j]")
                        self.missing_array[i][
                            j] = inst.getInputMissingValuesWithPos(j)
                        # # print("after self.missing_array[i][j]")
                        if self.missing_array[i][j]:
                            self.x_array[i][j] = self.emin[j] - 1

                    if no_outputs:
                        # print("no_outputs==True")
                        self.output_integer_array[i] = 0
                        # elf.output_real_array[i] = 0.0
                        self.output_array[i] = ""
                    else:
                        # print("no_outputs==False")
                        self.output_integer_array[
                            i] = self.instance_set.getOutputNumericValue(i, 0)
                        # print(" 202001-1 self.output_integer_array[ "+str(i)+"]"+ str( self.output_integer_array[i]))
                        # self.output_real_array[i] = self.instance_set.getOutputNumericValue(i, 0)
                        # print("self.output_integer_array[" + str(i) + "] = " + str(self.output_integer_array[i]))
                        self.output_array[
                            i] = self.instance_set.getOutputNominalValue(i, 0)
                    # print(" 202001-1 self.output_integer_array[ " + str(i) + "]" + str(self.output_integer_array[i]))
                    if self.output_integer_array[i] > self.nclasses:
                        self.nclasses = self.output_integer_array[i]

                self.nclasses = self.nclasses + 1
                print('Number of classes=' + str(self.nclasses))
        except Exception as error:
            print(
                "read_classification_set: Exception in readSet, in read_classification_set:"
                + str(error))

        # self.computeStatistics()
        self.compute_instances_per_class()

    #   * It reads the whole input data-set and it stores each example and its associated output value in
    #   * local arrays to ease their use.
    #   * @param datasetFile String name of the file containing the dataset
    #   * @param train boolean It must have the value "true" if we are reading the training data-set
    #   * @throws IOException If there ocurs any problem with the reading of the data-set

    # added by rui for granularity rule generation
    def read_classification_set_from_data_row_array(self, data_row_array):

        self.compute_statistics_data_row_array(data_row_array)
        self.compute_instances_perclass_data_row_array(data_row_array)

    def readRegressionSet(self, datasetFile, train, file_path):

        try:
            # Load in memory a dataset that contains a regression problem
            self.instance_set.readSet(datasetFile, train, file_path)
            self.ndata = self.instance_set.getNumInstances()
            self.ninputs = self.attributes.getInputNumAttributes()
            self.nvars = self.ninputs + self.attributes.getOutputNumAttributes(
            )
            # print("In readRegressionSet , self.ndata is : " + str(self.ndata))
            # print("In readRegressionSet , self.ninputs is : " + str(self.ninputs))
            # print("In readRegressionSet , self.nvars is : " + str(self.nvars))

            # outputIntegerheck that there is only one output variable
            if self.attributes.getOutputNumAttributes() > 1:
                # print("Out put attribute: ")
                outPutAttHeader = self.attributes.getOutputAttributesHeader()
                # print(outPutAttHeader)
                # print("This algorithm can not process MIMO datasets")
                # print("All outputs but the first one will be removed")
                exit(1)

            noOutputs = False
            if self.attributes.getOutputNumAttributes() < 1:
                # print("This algorithm can not process datasets without outputs")
                # print("Zero-valued output generated")
                noOutputs = True
                print("noOutputs = True, exit 1 !!!!!")
                exit(1)
            # Initialice and fill our own tables
            self.x_array = [[0.0 for y in range(self.ninputs)]
                            for x in range(self.ndata)]
            self.missing_array = [[False for y in range(self.ninputs)]
                                  for x in range(self.ndata)]
            self.output_integer_array = [0 for x in range(self.ndata)]

            # Maximum and minimum of inputs
            self.emax_array = [None for x in range(self.ninputs)]
            self.emin_array = [None for x in range(self.ninputs)]
            for i in range(0, self.ninputs):
                self.emax_array[i] = self.attributes.getAttributeByPos(
                    i).getMaxAttribute()
                self.emin_array[i] = self.attributes.getAttributeByPos(
                    i).getMinAttribute()

            # All values are casted into double / integer
            self.nclasses = 0

            for i in range(0, self.ndata):
                inst = self.instance_set.getInstance(i)
                for j in range(0, self.ninputs):
                    self.x_array[i][
                        j] = self.instance_set.getInputNumericValue(
                            i, j)  # inst.getInputRealValues(j);
                    self.missing_array[i][j] = inst.getInputMissingValues(j)
                    if self.missing_array[i][j]:
                        self.x_array[i][j] = self.emin_array[j] - 1

                if noOutputs:
                    print("noOutputs self.output_real_array[i]" + str(i) +
                          "is 0 ")
                    self.output_real_array[i] = 0

                    self.output_integer_array[i] = 0

                else:
                    print("noOutputs else part:")

                    self.output_real_array[
                        i] = self.instance_set.getOutputNumericValue(i, 0)
                    print("self.output_real_array[i]" + str(i) +
                          str(self.output_real_array[i]))
                    self.output_integer_array[i] = int(
                        self.output_real_array[i])
        except OSError as error:
            print("OS error: {0}".format(error))
        except Exception as otherException:
            # print("DBG: Exception in readSet:", sys.exc_info()[0])
            print(" In readRegressionSet other Exception  is :" +
                  str(otherException))

        self.computeStatistics()

    # *It copies the header of the dataset
    # * @ return String A string containing all the data - set information

    def copy_header(self):

        p = ""
        # # print("copyHeader begin...., P is :" + p)
        p = "@relation " + self.attributes.getRelationName() + "\n"
        # # print(" after relation P is :" + p)
        p += self.attributes.getInputAttributesHeader()
        # # print(" after getInputAttributesHeader P is :" + p)
        p += self.attributes.getOutputAttributesHeader()
        # # print(" after getOutputAttributesHeader P is :" + p)
        p += self.attributes.getInputHeader() + "\n"
        # # print(" after getInputHeader P is :" + p)
        p += self.attributes.getOutputHeader() + "\n"
        # # print(" after getOutputHeader P is :" + p)
        p += "@data\n"

        # print("P is :" + p)
        return p

    #    * It transform the input space into the [0,1] range

    def normalize(self):
        atts = self.getn_inputs()
        maxs = [0.0 for x in range(atts)]
        for j in range(0, atts):
            maxs[j] = 1.0 / (self.emax_array[j] - self.emin_array[j])

        for i in range(0, self.get_ndata()):
            for j in range(0, atts):
                if not self.isMissing(
                        i, j):  # this process ignores missing values
                    self.x_array[i][j] = (self.x_array[i][j] -
                                          self.__emin[j]) * maxs[j]

    # * It checks if the data-set has any real value
    # * @return boolean True if it has some real values, else false.

    def has_real_attributes(self):
        return Attributes.hasRealAttributes(self)

    #    * It checks if the data-set has any real value
    #    * @return boolean True if it has some real values, else false.

    def has_numerical_attributes(self):
        return Attributes.hasIntegerAttributes(
            self) or Attributes.hasRealAttributes(self)

    #    * It checks if the data-set has any missing value
    #    * @return boolean True if it has some missing values, else false.

    def has_missing_attributes(self):
        return self.size_without_missing() < self.get_ndata()

    #    * It return the size of the data-set without having account the missing values
    #    * @return int the size of the data-set without having account the missing values

    def size_without_missing(self):
        tam = 0
        # # print("self.ndata is :" + str(self.ndata) + ", self.ninputs :" + str(self.ninputs))
        for i in range(0, self.ndata):
            for j in range(1, self.ninputs):
                # changed the isMissing condition inside if
                if self.is_missing(i, j):
                    # print("It is missing value is i = " + str(i) + ",j==" + str(j))
                    break
            j = j + 1
            # # print("sizeWithoutMissing,  i = " + str(i) + ",j==" + str(j))
            if j == self.ninputs:
                tam = tam + 1
        # print("tam=" + str(tam))
        return tam

    #    * It returns the number of examples
    #    *
    #    * @return the number of examples

    def size(self):
        return self.ndata

    #    * It computes the average and standard deviation of the input attributes

    def compute_statistics(self):
        try:
            print("Begin computeStatistics......")
            var_num = self.get_nvars()
            print("varNum = " + str(var_num))
            self.stdev_array = [
                0.0 for x in range(var_num)
            ]  # original was double ,changed into float in python
            self.average_array = [0.0 for x in range(var_num)]

            input_num = self.getn_inputs()
            data_num = self.get_ndata()
            print("inputNum = " + str(input_num) + ",dataNum = " +
                  str(data_num))
            for i in range(0, input_num):
                self.average_array[i] = 0
                for j in range(0, data_num):
                    if not self.isMissing(j, i):
                        self.average_array[
                            i] = self.average_array[i] + self.x_array[j][i]
                if data_num != 0:
                    self.average_array[i] = self.average_array[i] / data_num
            average_length = len(self.average_array)
            print(" average_length is " + str(average_length))
            self.average_array[average_length - 1] = 0
            if len(self.output_real_array) == 0:
                print("len(self.output_real_array) is  0")

            else:
                # print("len(self.output_real_array) is " + str(len(self.output_real_array)))
                for j in range(0, len(self.output_real_array)):
                    # print("self.output_real_array[j] is : "+str(self.output_real_array[j]) + " ,j is :"+str(j))
                    self.average_array[average_length - 1] = self.average_array[average_length - 1] + \
                                                             self.output_real_array[j]
            if len(self.output_real_array) != 0:
                self.average_array[average_length - 1] = self.average_array[
                    average_length - 1] / len(self.output_real_array)
                print("before the loop for inputNum")
                for i in range(0, input_num):
                    sum_value = 0.0
                    for j in range(0, data_num):
                        if not self.isMissing(j, i):
                            # print("self.isMissing(j, i)==False")
                            sum_value = sum_value + (
                                self.x_array[j][i] - self.average_array[i]) * (
                                    self.x_array[j][i] - self.average_array[i])

                    if data_num != 0:
                        # print("dataNum != 0" + " , dataNum=" + str(data_num))
                        sum_value = sum_value / data_num
                    self.stdev_array[i] = math.sqrt(sum_value)

                sum_value = 0.0
                for j in range(0, len(self.output_real_array)):
                    sum_value += (self.output_real_array[j] -
                                  self.average_array[average_length - 1]) * (
                                      self.output_real_array[j] -
                                      self.average_array[average_length - 1])
                if len(self.output_real_array) != 0:
                    sum_value /= len(self.output_real_array)
                self.stdev_array[len(self.stdev_array) -
                                 1] = math.sqrt(sum_value)
                print("sum is :" + str(sum_value) + "  self.stdev_array :" +
                      str(self.stdev_array))
        except Exception as error:
            print("Exception in computeStatistics : " + str(error))

    #    * It return the standard deviation of an specific attribute
    #    * @param position int attribute id (position of the attribute)
    #    * @return double the standard deviation  of the attribute

    def std_dev(self, position):
        return self.stdev_array[position]

    #    * It return the average of an specific attribute
    #    * @param position int attribute id (position of the attribute)
    #    * @return double the average of the attribute

    def average(self, position):
        return self.average_array[position]

    #     *It computes the number of examples per class

    def compute_instances_per_class(self):
        # print("compute_instances_per_class begin..., self.nclasses = " + str(self.nclasses))
        self.instances_cl = [0 for x in range(self.nclasses)]
        self.frequent_class_array = [
            Decimal(0.0) for x in range(self.nclasses)
        ]
        data_num = self.get_ndata()
        # print("dataNum = " + str(dataNum))

        for i in range(0, data_num):
            integer_in_loop = self.output_integer_array[i]
            # # print("outputInteger[" + str(i) + "]" + str(integerInLoop))
            self.instances_cl[
                integer_in_loop] = self.instances_cl[integer_in_loop] + 1

        for i in range(0, self.nclasses):
            if data_num is 0:
                self.frequent_class_array[i] = 0
            else:
                self.frequent_class_array[i] = (1.0 * self.instances_cl[i] /
                                                data_num)

    #     *It returns the number of examples for a given class
    #     * @ param clas int the class label id
    #     * @ return int the number of examples
    #     for the class

    def number_instances(self, clas):
        return self.instances_cl[clas]

    # /**
    #  * It returns the number of labels for a nominal attribute
    #  * @param attribute int the attribute position in the data-set
    #  * @return int the number of labels for the attribute
    #  */
    #

    def number_values(self, attribute):
        return Attributes.getInputAttribute(attribute).getNumNominalValues(
            Attributes)

    #    * It returns the class label (string) given a class id (int)
    #    * @param intValue int the class id
    #    * @return String the corrresponding class label
    #

    #    * It returns the class label (string) given a class id (int)
    #    * @param intValue int the class id
    #    * @return String the corrresponding class label

    def get_output_value(self, int_value):
        # # print("Before att get ")
        att = Attributes.getOutputAttribute(Attributes, 0)
        # # print("After att get ")
        return att.getNominalValue(int_value)

    #  * It returns the type of the variable
    #  * @param variable int the variable id
    #  * @return int a code for the type of the variable (INTEGER, REAL or NOMINAL)

    def get_type(self, variable):
        if self.attributes.getAttributeByPos(
                variable).getType() == Attributes.getAttributeByPos(0).INTEGER:
            return self.INTEGER

        if self.attributes.getAttributeByPos(
                variable).getType() == Attributes.getAttributeByPos(0).REAL:
            return self.REAL

        if self.attributes.getAttributeByPos(
                variable).getType() == Attributes.getAttributeByPos(0).NOMINAL:
            return self.NOMINAL

        return 0

    #  * It returns the discourse universe for the input and output variables
    #  * @return double[][] The minimum [0] and maximum [1] range of each variable
    def set_nvars(self, nvar_value):
        self.nvars = nvar_value

    # modified at 2020-08-14
    def get_ranges(self):

        # print("self.get_nvars()" + str(self.get_nvars()))
        rangos = [[0.0 for y in range(2)] for x in range(self.get_nvars())]
        # print("rangos has two dimensions, first is self.get_nvars()==" + str(self.getn_inputs()) + ",second is 2")
        ninputs = self.get_ninputs()
        for i in range(0, ninputs):
            # print("self.getn_inputs() is :" + str(nInputs) + " i = " + str(i))
            attHere = self.attributes.getInputAttribute(i)
            # print("attHere.getNumNominalValues()== " + str(attHere.getNumNominalValues()))
            if attHere.getNumNominalValues() > 0:
                rangos[i][0] = 0.0
                rangos[i][1] = attHere.getNumNominalValues() - 1
                # print(" attHere.getNumNominalValues() > 0,rangos[" + str(i) + "][0]==" + str(rangos[i][0]) + ",rangos[i][1]== " + str(rangos[i][1]))

            else:
                rangos[i][0] = attHere.getMinAttribute()
                rangos[i][1] = attHere.getMaxAttribute()
                # print(" attHere.getNumNominalValues() <= 0, rangos[" + str(i) + "][0]==" + str(rangos[i][0]) + ",rangos[i][1]== " + str(rangos[i][1]))
        # save the output rango in the last range array
        rangos[self.get_nvars() -
               1][0] = self.attributes.getOutputAttribute(0).getMinAttribute()
        rangos[self.get_nvars() -
               1][1] = self.attributes.getOutputAttribute(0).getMaxAttribute()
        return rangos

    def get_granularity_zone_ranges(self, data_set_x_array):

        # print("self.get_nvars()" + str(self.get_nvars()))
        rangos = [[0.0 for y in range(2)] for x in range(self.get_nvars())]
        # print("rangos has two dimensions, first is self.get_nvars()==" + str(self.get_nvars()) + ",second is 2")
        nInputs = self.getn_inputs()
        for i in range(0, nInputs):
            # print("self.getn_inputs() is :" + str(nInputs) + " i = " + str(i))
            attHere = Attributes.getInputAttribute(Attributes, i)
            # print("attHere.getNumNominalValues()== " + str(attHere.getNumNominalValues()))
            if attHere.getNumNominalValues() > 0:
                rangos[i][0] = 0.0
                rangos[i][1] = attHere.getNumNominalValues() - 1
                # print(" attHere.getNumNominalValues() > 0,rangos[" + str(i) + "][0]==" + str(rangos[i][0]) + ",rangos[i][1]== " + str(rangos[i][1]))

            else:
                rangos[i][0] = attHere.get_min_granularity_attribute(
                    data_set_x_array, i)
                rangos[i][1] = attHere.get_max_granularity_attribute(
                    data_set_x_array, i)
                # print(" attHere.getNumNominalValues() <= 0, rangos[" + str(i) + "][0]==" + str(rangos[i][0]) + ",rangos[i][1]== " + str(rangos[i][1]))
        last_min_value = self.attributes.getOutputAttribute(
            0).getMinAttribute()
        last_max_value = self.attributes.getOutputAttribute(
            0).getMaxAttribute()
        # print("The last_min_value is " + str(last_min_value)+" The last_max_value is " + str(last_max_value))
        rangos[self.get_nvars() - 1][0] = last_min_value
        rangos[self.get_nvars() - 1][1] = last_max_value
        return rangos

    #    * It returns the attribute labels for the input features
    #    * @return String[] the attribute labels for the input features

    def get_names(self):
        names = ["" for x in range(self.ninputs)]
        for i in range(0, self.ninputs):
            names[i] = self.attributes.getInputAttribute(i).getName()
            print(" attributes' names[" + str(i) + "]:" + names[i])
        return names

    #    * It returns the class labels
    #    * @return String[] the class labels

    def get_classes(self):
        clases = ["" for x in range(self.nclasses)]
        # print(" getClasses,self.nclasses: " + str(self.nclasses))
        for i in range(0, self.nclasses):
            # print(" getClasses method i is "+str(i))
            clases[i] = self.attributes.getOutputAttribute(0).getNominalValue(
                i)
        return clases

    def is_nominal(self, index_i):
        return self.nominal_array[index_i]

    def is_integer(self, index_i):

        return self.integer_array[index_i]

    def get_frequent_class(self, class_value):
        print("class_value" + str(class_value))
        return self.frequent_class_array[class_value]

    """

     * It gets the number of input attributes of the data-set
     * @return int the number of input attributes of the data-set
    """

    def get_ninputs(self):
        return self.ninputs

    """
     * It returns the ratio of instances of the given class in the dataset
     *
     * @param clas the index of the class
     * @return the ratio of instances of the given class in the dataset
    """

    def frecuent_class(self, class_value):
        return self.frequent_class_array[class_value]

    def get_X(self):

        return np.array(self.x_array)

    def get_y(self, type_name='integer'):

        if type_name == "real":
            return np.array(self.output_real_array)
        elif type_name == "integer":
            return np.array(self.output_integer_array)
        else:
            return np.array(self.output_array)

    def copyHeader(self):

        p = ""
        # # print("copyHeader begin...., P is :" + p)
        p = "@relation " + self.attributes.getRelationName() + "\n"
        # # print(" after relation P is :" + p)
        p += self.attributes.getInputAttributesHeader()
        # # print(" after getInputAttributesHeader P is :" + p)
        p += self.attributes.getOutputAttributesHeader()
        # # print(" after getOutputAttributesHeader P is :" + p)
        p += self.attributes.getInputHeader() + "\n"
        # # print(" after getInputHeader P is :" + p)
        p += self.attributes.getOutputHeader() + "\n"
        # # print(" after getOutputHeader P is :" + p)
        p += "@data\n"

        # print("P is :" + p)
        return p
Esempio n. 5
0
    def readRegressionSet(self, datasetFile, train):

        try:
            #Load in memory a dataset that contains a regression problem
            self.__instanceSet.readSet(datasetFile, train)
            self.__nData = self.__instanceSet.getNumInstances()
            self.__nInputs = Attributes.getInputNumAttributes(Attributes)
            self.__nVars = self.__nInputs + Attributes.getOutputNumAttributes(
                Attributes)
            print("In readRegressionSet , self.__nData is : " +
                  str(self.__nData))
            print("In readRegressionSet , self.__nInputs is : " +
                  str(self.__nInputs))
            print("In readRegressionSet , self.__nVars is : " +
                  str(self.__nVars))

            #outputIntegerheck that there is only one output variable
            if (Attributes.getOutputNumAttributes(Attributes) > 1):

                print("Out put attribute: ")
                outPutAttHeader = Attributes.getOutputAttributesHeader(
                    Attributes)
                print(outPutAttHeader)
                print("This algorithm can not process MIMO datasets")
                print("All outputs but the first one will be removed")
                exit(1)

            noOutputs = False
            if (Attributes.getOutputNumAttributes(Attributes) < 1):
                print(
                    "This algorithm can not process datasets without outputs")
                print("Zero-valued output generated")
                noOutputs = True
                exit(1)
            # Initialice and fill our own tables
            self.__X = [[0.0 for y in range(self.__nInputs)]
                        for x in range(self.__nData)]
            self.__missing = [[False for y in range(self.__nInputs)]
                              for x in range(self.__nData)]
            self.__outputInteger = [0 for x in range(self.__nData)]

            # Maximum and minimum of inputs
            self.__emax = [None for x in range(self.__nInputs)]
            self.__emin = [None for x in range(self.__nInputs)]
            for i in range(0, self.__nInputs):
                self.__emax[i] = Attributes.getAttributeByPos(
                    Attributes, i).getMaxAttribute()
                self.__emin[i] = Attributes.getAttributeByPos(
                    Attributes, i).getMinAttribute()

            # All values are casted into double / integer
            self.__nClasses = 0

            for i in range(0, self.__nData):
                inst = self.__instanceSet.getInstance(i)
                for j in range(0, self.__nInputs):
                    self.__X[i][j] = self.__instanceSet.getInputNumericValue(
                        i, j)  #inst.getInputRealValues(j);
                    self.__missing[i][j] = inst.getInputMissingValues(j)
                    if (self.__missing[i][j]):
                        self.__X[i][j] = self.__emin[j] - 1

                if (noOutputs):
                    self.__outputReal[i] = 0
                    self.__outputInteger[i] = 0

                else:
                    self.__outputReal[
                        i] = self.__instanceSet.getOutputNumericValue(i, 0)
                    self.__outputInteger[i] = int(self.__outputReal[i])
        except OSError as error:
            print("OS error: {0}".format(error))
        except Exception as otherException:
            print("DBG: Exception in readSet:", sys.exc_info()[0])
            print(" In readRegressionSet other Exception  is :" +
                  str(otherException))

        self.computeStatistics()