def open_data(name, flags=0):
    """Load the data set called ``name`` and optionally preprocess it.

    ``flags`` is treated as a bit mask. When ``CONTINUIZE_DOMAIN`` is set the
    table is passed through ``preprocess.Continuize``; otherwise, when
    ``DISCRETIZE_DOMAIN`` is set, it is passed through
    ``preprocess.Discretize`` using ``orange.EquiNDiscretization`` with
    ``discretize_class=False``.  The returned table has its ``name``
    attribute set to ``name``.
    """
    table = orange.ExampleTable(name)

    # Pick at most one preprocessor; continuization wins if both bits are set.
    transform = None
    if flags & CONTINUIZE_DOMAIN:
        transform = preprocess.Continuize()
    elif flags & DISCRETIZE_DOMAIN:
        transform = preprocess.Discretize(
            method=orange.EquiNDiscretization(), discretize_class=False)

    if transform is not None:
        table = transform(table)

    table.name = name
    return table
Exemple #2
0
def discretizeDomain(data, removeUnusedValues=1, numberOfIntervals=2):
    """Return a table derived from ``data`` with continuous variables discretized.

    A continuous class is first discretized with ``EquiNDiscretization``
    (``numberOfIntervals`` intervals).  Continuous attributes are then
    discretized with ``EntropyDiscretization`` when a discrete class is
    available and with ``EquiNDiscretization`` otherwise.  When
    ``removeUnusedValues`` is true each attribute is additionally passed
    through ``orange.RemoveUnusedValues``.

    Attributes that raise ``orange.KernelException`` are skipped with a
    warning.  If the class cannot be discretized a warning is issued and the
    class is left out of the result.

    Returns ``None`` when ``data`` is empty or false.
    """
    # Nothing to discretize: bail out before building any discretizers.
    if not data or len(data) == 0:
        return None

    entroDisc = orange.EntropyDiscretization()
    equiDisc = orange.EquiNDiscretization(numberOfIntervals=numberOfIntervals)
    discAttrs = []

    classVar = data.domain.classVar
    className = (classVar.name or None) if classVar else None

    # if we have a continuous class we have to discretize it before we can discretize the attributes
    if className and data.domain.classVar.varType == orange.VarTypes.Continuous:
        try:
            newClass = equiDisc(data.domain.classVar.name, data)
            newClass.name = className
        except orange.KernelException as ex:
            # Format the exception itself: ``ex.message`` is deprecated and
            # does not exist on Python 3 exceptions.
            warnings.warn("Could not discretize class variable '%s'. %s" %
                          (data.domain.classVar.name, ex))
            newClass = None
            className = None
        newDomain = orange.Domain(data.domain.attributes, newClass)
        data = orange.ExampleTable(newDomain, data)

    for attr in data.domain.attributes:
        try:
            name = attr.name
            if attr.varType == orange.VarTypes.Continuous:
                # Entropy discretization needs a discrete class; fall back to
                # equal-frequency intervals otherwise.
                if data.domain.classVar and data.domain.classVar.varType == orange.VarTypes.Discrete:
                    new_attr = entroDisc(attr, data)
                else:
                    new_attr = equiDisc(attr, data)
            else:
                new_attr = attr
            if removeUnusedValues:
                new_attr = orange.RemoveUnusedValues(new_attr, data)
                if new_attr is None:
                    raise orange.KernelException("No values")

            # Keep the original attribute name on the derived variable.
            new_attr.name = name
            discAttrs.append(new_attr)
        except orange.KernelException as ex:
            # e.g. all values missing: skip the attribute but tell the user.
            warnings.warn("Could not discretize %s attribute. %s" %
                          (attr.name, ex))

    if className:
        discAttrs.append(data.domain.classVar)
    d2 = data.translate(discAttrs, True)
    return d2
Exemple #3
0
def cforange_attribute_distance(input_dict):
    """Fill a symmetric attribute-distance matrix for ``input_dict['dataset']``.

    ``input_dict['classInteractions']`` (converted with ``int``) selects the
    measure handled here: 0 uses the Chi2 matrix, 1 the dependency
    ("2-interactions") dissimilarity, 2 the "3-interactions" dissimilarity.

    Returns ``None`` when the data set has fewer than two attributes, when
    discretization raises ``orange.KernelException``, or when the data has no
    class and the selected measure is not 0.

    NOTE(review): the visible body never returns ``matrix`` and has no branch
    for ``classInteractions >= 3``; the function may be truncated in this
    listing -- confirm against the complete source.
    """
    import orange
    import orngInteract
    inputdata = input_dict['dataset']
    classInteractions = int(input_dict['classInteractions'])
    atts = inputdata.domain.attributes
    if len(atts) < 2:
        return None
    matrix = orange.SymMatrix(len(atts))
    matrix.setattr('items', atts)
    if classInteractions < 3:
        if inputdata.domain.hasContinuousAttributes():
            # The interaction measures work on discretized data.
            try:
                data = orange.Preprocessor_discretize(
                    inputdata,
                    method=orange.EquiNDiscretization(numberOfIntervals=4))
            except orange.KernelException:
                return None
        else:
            data = inputdata

        # A class variable is needed below; for the Chi2 measure a dummy
        # two-valued class is attached, other measures give up without one.
        if not data.domain.classVar:
            if classInteractions == 0:
                classedDomain = orange.Domain(
                    data.domain.attributes,
                    orange.EnumVariable("foo", values=["0", "1"]))
                data = orange.ExampleTable(classedDomain, data)
            else:
                return None

        im = orngInteract.InteractionMatrix(data, dependencies_too=1)
        off = 1
        if classInteractions == 0:
            diss, labels = im.exportChi2Matrix()
            off = 0
        elif classInteractions == 1:
            (diss, labels) = im.depExportDissimilarityMatrix(
                jaccard=1)  # 2-interactions
        else:
            (diss, labels) = im.exportDissimilarityMatrix(
                jaccard=1)  # 3-interactions

        # Copy the exported lower triangle into the matrix; ``off`` shifts the
        # rows by one for the exports other than Chi2.
        for i in range(len(atts) - off):
            for j in range(i + 1):
                matrix[i + off, j] = diss[i][j]
Exemple #4
0
    def discretizeClass(self):
        """Discretize the class variable of ``self.originalData``.

        ``self.classDiscretization`` selects the method: 0 uses
        ``EquiNDiscretization``, 1 uses ``EquiDistDiscretization`` (both with
        ``self.classIntervals`` intervals), and 2 uses the split points typed
        into ``self.customClassSplits`` (numbers separated by ``:``, ``,``,
        ``-`` or whitespace).  If no custom split can be parsed, method 0 is
        used instead.

        On success the discretized table is stored in ``self.discClassData``
        (and in ``self.data`` when that is already set), the splits label is
        updated and ``True`` is returned.  On failure a warning or error is
        shown, the label is cleared and ``False`` is returned.  Returns
        ``None`` when there is no ``self.originalData``.
        """
        if not self.originalData:
            return None

        discType = self.classDiscretization
        classVar = self.originalData.domain.classVar

        if discType == 2:
            try:
                content = self.customClassSplits.replace(":", " ").replace(",", " ").replace("-", " ").split()
                # Unique split points in ascending order; comparing floats
                # also merges spellings such as "8.0" and "8.000".
                customs = sorted(set(float(x) for x in content))
            except Exception:
                # Unparsable input is treated as "no custom splits".
                customs = []

            if not customs:
                discType = 0

        try:
            if discType == 0:
                discretizer = orange.EquiNDiscretization(classVar, self.originalData, numberOfIntervals = self.classIntervals)
            elif discType == 1:
                discretizer = orange.EquiDistDiscretization(classVar, self.originalData, numberOfIntervals = self.classIntervals)
            else:
                discretizer = orange.IntervalDiscretizer(points = customs).constructVariable(classVar)

            self.discClassData = orange.ExampleTable(orange.Domain(self.originalData.domain.attributes, discretizer), self.originalData)
            if self.data:
                self.data = self.discClassData
            # else, the data has no continuous attributes other than the class

            self.classIntervalsLabel.setText("Current splits: " + ", ".join([str(classVar(x)) for x in discretizer.getValueFrom.transformer.points]))
            self.error(0)
            self.warning(0)
            return True
        except Exception:
            if self.data:
                self.warning(0, "Cannot discretize the class; using previous class")
            else:
                self.error(0, "Cannot discretize the class")
            self.classIntervalsLabel.setText("")
            return False
def create_dataset(file_base, num_bins):
    """Bin the columns of a train/test CSV pair and write the binned files.

    Reads ``data/<file_base>_train.csv`` and ``data/<file_base>_test.csv``,
    asks Orange for ``num_bins`` equal-frequency cut-offs computed from the
    training data, replaces every column of both arrays by its
    ``np.digitize`` bin index, and writes the results to
    ``data/<file_base>_bin_<num_bins>_train.csv`` and
    ``data/<file_base>_bin_<num_bins>_test.csv`` (``;``-separated integers).
    """
    file_prefix = "data/"
    file_suffix = ".csv"
    train_in_file = file_prefix + file_base + "_train" + file_suffix
    train_out_file = file_prefix + file_base + "_bin_%s_train" % (
        num_bins) + file_suffix
    test_in_file = file_prefix + file_base + "_test" + file_suffix
    test_out_file = file_prefix + file_base + "_bin_%s_test" % (
        num_bins) + file_suffix

    train_data = np.genfromtxt(train_in_file, delimiter=',', skip_header=0)
    # The test array must come from the test file; it was previously read
    # from the training file, so the "test" output duplicated the train data.
    test_data = np.genfromtxt(test_in_file, delimiter=',', skip_header=0)

    # Two header rows (attribute ids and the "continuous" type tag) are
    # stacked on top of the training data before handing it to Orange.
    # NOTE(review): np.ndarray(..., buffer=...) reinterprets the buffer with
    # the default dtype rather than converting it -- confirm this produces
    # the intended header rows.
    num_features = train_data.shape[1]
    attributes = np.ndarray((1, num_features),
                            buffer=np.array(range(1, num_features + 1)))
    classes = np.ndarray(
        (1, num_features),
        buffer=np.array(["continuous" for i in range(num_features)]))
    orange_data = np.concatenate((attributes, classes, train_data))

    # find cutoffs from orange
    data_binned = orange.Preprocessor_discretize(
        orange_data,
        method=orange.EquiNDiscretization(numberOfIntervals=num_bins))

    for i in range(num_features):
        # The cut-off points print as "<a, b, ...>"; strip the brackets and
        # parse the numbers.
        cutoffs_string = str(data_binned.domain.attributes[i].getValueFrom.
                             transformer.points).lstrip('<').rstrip('>')
        bins = [float(ele) for ele in cutoffs_string.split(", ")]

        # Apply the training cut-offs to both sets.
        train_digitized = np.digitize(train_data[:, i], bins)
        train_data[:, i] = train_digitized

        test_digitized = np.digitize(test_data[:, i], bins)
        test_data[:, i] = test_digitized

    np.savetxt(train_out_file, train_data, fmt="%d", delimiter=";")
    np.savetxt(test_out_file, test_data, fmt="%d", delimiter=";")
Exemple #6
0
def discretizeDomain(data, removeUnusedValues = 1, numberOfIntervals = 2):
    """Return ``data`` restricted to discretized versions of its variables.

    A continuous class is first discretized with ``EquiNDiscretization``
    (``numberOfIntervals`` intervals).  Continuous attributes are discretized
    with ``EntropyDiscretization`` when the class is discrete and with
    ``EquiNDiscretization`` otherwise.  When ``removeUnusedValues`` is true
    each attribute is also passed through ``orange.RemoveUnusedValues``.

    An attribute whose processing raises is left out of the result and a
    warning is issued.  Returns ``None`` when ``data`` is empty or false.
    """
    import warnings

    # Nothing to discretize: bail out before building any discretizers.
    if not data or len(data) == 0:
        return None

    entroDisc = orange.EntropyDiscretization()
    equiDisc = orange.EquiNDiscretization(numberOfIntervals = numberOfIntervals)
    discAttrs = []

    classVar = data.domain.classVar
    className = (classVar.name or None) if classVar else None

    # if we have a continuous class we have to discretize it before we can discretize the attributes
    if className and data.domain.classVar.varType == orange.VarTypes.Continuous:
        newClass = equiDisc(data.domain.classVar.name, data)
        newClass.name = className
        newDomain = orange.Domain(data.domain.attributes, newClass)
        data = orange.ExampleTable(newDomain, data)

    for attr in data.domain.attributes:
        try:
            name = attr.name
            new_attr = attr
            if attr.varType == orange.VarTypes.Continuous:
                # Entropy discretization needs a discrete class; fall back to
                # equal-frequency intervals otherwise.
                if data.domain.classVar and data.domain.classVar.varType == orange.VarTypes.Discrete:
                    new_attr = entroDisc(attr, data)
                else:
                    new_attr = equiDisc(attr, data)
            if removeUnusedValues:
                new_attr = orange.RemoveUnusedValues(new_attr, data)
            # Keep the original attribute name on the derived variable.
            new_attr.name = name
            discAttrs.append(new_attr)
        except Exception as ex:
            # e.g. all values missing: still skip the attribute (best effort),
            # but report it instead of dropping it silently.
            warnings.warn("Could not discretize %s attribute. %s" %
                          (attr.name, ex))

    if className:
        discAttrs.append(data.domain.classVar)
    return data.select(discAttrs)
Exemple #7
0
    def computeMatrix(self):
        """Build the attribute-distance matrix for ``self.data``.

        ``self.classInteractions`` selects the measure: 0 -> Chi2 matrix,
        1 -> dependency ("2-interactions") dissimilarity, 2 -> "3-interactions"
        dissimilarity, 3 -> Pearson correlation ``p``, anything else ->
        the second element of ``statc.spearmanr``.

        Returns an ``orange.SymMatrix`` whose ``items`` are the attributes, or
        ``None`` when there is no data or when the measure needs a class
        attribute that the data does not have.
        """
        self.error()
        if not self.data:
            return None

        atts = self.data.domain.attributes
        n_atts = len(atts)
        matrix = orange.SymMatrix(n_atts)
        matrix.setattr('items', atts)

        if self.classInteractions < 3:
            # Interaction-based measures operate on discretized data; the
            # discretized table is cached on the instance.
            if self.data.domain.hasContinuousAttributes():
                if self.discretizedData is None:
                    equal_freq = orange.EquiNDiscretization(numberOfIntervals=4)
                    self.discretizedData = orange.Preprocessor_discretize(
                        self.data, method=equal_freq)
                data = self.discretizedData
            else:
                data = self.data

            # The Chi2 code needs a class attribute because it prepares some
            # common stuff for all measures, so a fake one is attached when
            # the data has none.  The other measures cannot do without it.
            if not data.domain.classVar:
                if self.classInteractions != 0:
                    self.error(
                        "The selected distance measure requires a data set with a class attribute"
                    )
                    return None
                fake_class = orange.EnumVariable("foo", values=["0", "1"])
                classedDomain = orange.Domain(data.domain.attributes,
                                              fake_class)
                data = orange.ExampleTable(classedDomain, data)

            im = orngInteract.InteractionMatrix(data, dependencies_too=1)
            if self.classInteractions == 0:
                diss, labels = im.exportChi2Matrix()
                off = 0
            elif self.classInteractions == 1:
                # 2-interactions
                diss, labels = im.depExportDissimilarityMatrix(jaccard=1)
                off = 1
            else:
                # 3-interactions
                diss, labels = im.exportDissimilarityMatrix(jaccard=1)
                off = 1

            # Copy the exported lower triangle, shifted down by ``off`` rows.
            for row in range(off, n_atts):
                src = row - off
                for col in range(src + 1):
                    matrix[row, col] = diss[src][col]

        elif self.classInteractions == 3:
            for a1 in range(n_atts):
                for a2 in range(a1):
                    correlation = orange.PearsonCorrelation(
                        a1, a2, self.data, 0)
                    matrix[a1, a2] = correlation.p
        else:
            import numpy
            import statc
            m = self.data.toNumpyMA("A")[0]
            averages = numpy.ma.average(m, axis=0)
            # Masked entries of each column are filled with the column average.
            filleds = [
                list(numpy.ma.filled(m[:, col], averages[col]))
                for col in range(n_atts)
            ]
            for a1, f1 in enumerate(filleds):
                for a2, f2 in enumerate(filleds[:a1]):
                    matrix[a1, a2] = statc.spearmanr(f1, f2)[1]

        return matrix
Exemple #8
0
    def _prepare(self, t):
        """Return a version of table ``t`` without continuous attributes or
        missing values.

        Step 1 discretizes attributes whose ``varType`` is 2 (entropy
        discretization, falling back to two equal-frequency intervals when
        that yields fewer than two values).  Step 2 replaces every variable
        that has at least one special (missing) value by a new
        ``EnumVariable`` named ``'M_' + old name`` with an extra ``'.'``
        value that stands for "missing".
        """
        # prepares an Orange table so that it doesn't contain continuous
        # attributes or missing values

        ### DISCRETIZE VARIABLES ###

        newatt = []  # discretized replacements for continuous attributes
        oldatt = []  # attributes kept as they are
        entroD = orange.EntropyDiscretization()
        equiD = orange.EquiNDiscretization(numberOfIntervals=2)
        for i in t.domain.attributes:
            # presumably 2 is orange.VarTypes.Continuous -- TODO confirm
            if i.varType == 2:
                d = entroD(i, t)
                if len(d.values) < 2:
                    # prevent discretization into a single value
                    d = equiD(i, t)
                    d.name = 'E' + d.name
                warnings.warn('Discretizing %s into %s with %d values.' %
                              (i.name, d.name, len(d.values)))
                newatt.append(d)
            else:
                oldatt.append(i)
        if len(newatt) > 0:
            # Note the reordering: untouched attributes first, then the
            # discretized ones, then the class.
            # NOTE(review): classVar is appended unconditionally here, while
            # the code below explicitly allows for a missing class -- confirm
            # that select() copes with a None entry.
            t = t.select(oldatt + newatt + [t.domain.classVar])

        ### FIX MISSING VALUES ###

        # indices (into all_attributes) of variables with a missing value
        special_attributes = []

        # 2006-08-23: fixed by PJ: append classVar only if it exists
        ##        all_attributes = [i for i in t.domain.attributes]+[t.domain.classVar]
        all_attributes = [i for i in t.domain.attributes]
        if t.domain.classVar:
            all_attributes += [t.domain.classVar]

        for i in range(len(all_attributes)):
            for j in t:
                if j[i].isSpecial():
                    special_attributes.append(i)
                    break  # one missing value is enough to flag the variable
        # create new attributes
        if len(special_attributes) > 0:
            # prepare attributes
            newatts = []
            for i in range(len(all_attributes)):
                old = all_attributes[i]
                if i in special_attributes:
                    oldv = [v for v in old.values]
                    # '.' is reserved as the marker for a missing value
                    assert ('.' not in oldv)
                    new = orange.EnumVariable(name='M_' + old.name,
                                              values=oldv + ['.'])
                    warnings.warn('Removing special values from %s into %s.' %
                                  (old.name, new.name))
                    newatts.append(new)
                else:
                    newatts.append(old)
            # convert table
            exs = []

            # 2006-08-23: added by PJ: add a class variable (if not already existing)
            if not t.domain.classVar:
                newatts.append(orange.EnumVariable("class", values=["."]))
                t = orange.ExampleTable(
                    orange.Domain(t.domain.attributes, newatts[-1]), t)

            newd = orange.Domain(newatts)
            for ex in t:
                nex = []
                for i in range(len(newatts)):
                    if ex[i].isSpecial():
                        # missing -> the explicit '.' value
                        v = newatts[i]('.')
                    else:
                        # carry the value over by its integer index
                        v = newatts[i](int(ex[i]))
                    nex.append(v)
                exs.append(orange.Example(newd, nex))
            t = orange.ExampleTable(exs)
        return t
Exemple #9
0
# Description: Entropy based discretization compared to discretization with equal-frequency
#              of instances in intervals
# Category:    preprocessing
# Uses:        iris.tab
# Classes:     Preprocessor_discretize, EntropyDiscretization
# Referenced:  o_categorization.htm

import orange


def show_values(data, heading):
    """Print ``heading``, then one ``name: value, value, ...`` line for each
    attribute in ``data.domain.attributes``.

    An attribute without values prints as ``name: `` with nothing after it.
    """
    # Single-argument, parenthesised prints behave the same on Python 2 and 3.
    print(heading)
    for a in data.domain.attributes:
        print("%s: %s" % (a.name, ', '.join(a.values)))


# Load the data set named "iris".
data = orange.ExampleTable("iris")

# Discretize it with the entropy-based method and list the resulting values
# of every attribute.
data_ent = orange.Preprocessor_discretize(
    data, method=orange.EntropyDiscretization())
show_values(data_ent, "Entropy based discretization")
print

# Same data, discretized into three equal-frequency intervals for comparison.
data_n = orange.Preprocessor_discretize(
    data, method=orange.EquiNDiscretization(numberOfIntervals=3))
show_values(data_n, "Equal-frequency intervals")
Exemple #10
0
# Description: Attribute-based discretization. Shows how different attributes may be discretized with different categorization methods and how the default attribute values names used by these methods may be simply replaced by the list of user-defined names.
# Category:    preprocessing
# Uses:        iris
# Classes:     EquiNDiscretization, EntropyDiscretization
# Referenced:  o_categorization.htm


def printexamples(data, inxs, msg="%i examples"):
    """Print ``msg`` formatted with ``len(inxs)``, then ``index example`` for
    every index in ``inxs``, then an empty line.
    """
    # Single-argument, parenthesised prints give the same output on
    # Python 2 and Python 3.
    print(msg % len(inxs))
    for i in inxs:
        print("%s %s" % (i, data[i]))
    print("")


import orange
iris = orange.ExampleTable("iris")

# Two discretization methods: four equal-frequency intervals and
# entropy-based.
equiN = orange.EquiNDiscretization(numberOfIntervals=4)
entropy = orange.EntropyDiscretization()

# Discretize two attributes with equiN and replace the default value names
# of both with user-defined labels.
pl = equiN("petal length", iris)
sl = equiN("sepal length", iris)
pl.values = sl.values = ["very low", "low", "high", "very high"]
sl_ent = entropy("sepal length", iris)

inxs = [0, 15, 35, 50, 98]
# Original and discretized attributes side by side, followed by the class.
d_iris = iris.select(
    ["sepal width", pl, "sepal length", sl, sl_ent, iris.domain.classVar])
printexamples(iris, inxs, "%i examples before discretization")
# d_iris is the discretized table, so label it "after" (it was "before").
printexamples(d_iris, inxs, "%i examples after discretization")
# Test case for Preprocessor_discretize configured with a default-constructed
# orange.EquiNDiscretization.  The test methods themselves are presumably
# inherited from testing.PreprocessorTestCase and driven by PREPROCESSOR --
# TODO confirm against that base class.
class TestDiscretizeEquiN(testing.PreprocessorTestCase):
    PREPROCESSOR = Preprocessor_discretize(method=orange.EquiNDiscretization())
# Script: bin every column of data/features.csv into num_bins intervals using
# cut-offs obtained from Orange, and write the bin indices to
# data/features_bin_<num_bins>.csv.
num_bins = 5
file_prefix = "data/"
file_suffix = ".csv"
file_base = "features"
in_file = file_prefix + file_base + file_suffix
out_file = file_prefix + file_base + "_bin_%s" % (num_bins) + file_suffix

# The first line of the input is skipped (skip_header=1).
data = np.genfromtxt(in_file, delimiter=',', skip_header=1)

# Two header rows (attribute ids 1..n and the "continuous" type tag) are
# stacked on top of the data before it is handed to Orange.
# NOTE(review): np.ndarray(..., buffer=...) reinterprets the buffer with the
# default dtype rather than converting it -- confirm this yields the intended
# header rows.
num_features = data.shape[1]
attributes = np.ndarray((1, num_features),
                        buffer=np.array(range(1, num_features + 1)))
classes = np.ndarray(
    (1, num_features),
    buffer=np.array(["continuous" for i in range(num_features)]))
orange_data = np.concatenate((attributes, classes, data))

data_binned = orange.Preprocessor_discretize(orange_data,\
  method=orange.EquiNDiscretization(numberOfIntervals=num_bins)) #find cutoffs from orange

for i in range(num_features):
    # The cut-off points are read from their string form with the
    # surrounding '<' and '>' stripped, then parsed as floats.
    cutoffs_string = str(data_binned.domain.attributes[i].getValueFrom.
                         transformer.points).lstrip('<').rstrip('>')
    bins = [float(ele) for ele in cutoffs_string.split(", ")]

    # Replace the column in place by the index of the bin each value falls in.
    digitized = np.digitize(data[:, i], bins)
    data[:, i] = digitized

np.savetxt(out_file, data, fmt="%d", delimiter=",")
Exemple #13
0
    def computeDiscretizer(self, i, idx, onlyDefaults=False):
        """Compute and store the discretizer for the attribute at domain
        index ``idx`` and refresh its label at list position ``i``.

        The per-attribute settings come from ``self.indiData[idx]``; a
        discretization type of 0 there means "use the widget-wide default"
        (``self.discretization`` / ``self.intervals``).  With
        ``onlyDefaults`` set, attributes that do not end up using the default
        are left untouched.

        The result is written to ``self.discretizers[idx]``: ``None`` for
        "leave continuous", ``False`` for "remove" or when discretization
        failed, otherwise the constructed discretized variable.
        """
        attr = self.data.domain[idx]
        indiData = self.indiData[idx]

        discType, intervals = indiData[:2]
        discName = self.shortDiscNames[discType]

        # Type 0 means no individual setting: fall back to the default method.
        defaultUsed = not discType

        if defaultUsed:
            discType = self.discretization+1
            intervals = self.intervals

        # Types from D_N_METHODS + 1 upwards select a set of custom split
        # points stored further along in indiData.
        if discType >= self.D_N_METHODS + 1:

            try:
                customs = [float(r) for r in indiData[discType-self.D_N_METHODS+1]]
            except:
                # NOTE(review): bare except -- any failure is treated as
                # "no custom points".
                customs = []

            if not customs:
                # No usable custom points: use the default method instead and
                # show both the requested and the effective method in the name.
                discType = self.discretization+1
                intervals = self.intervals
                discName = "%s ->%s)" % (self.shortDiscNames[indiData[0]][:-1], self.shortDiscNames[discType][2:-1])
                defaultUsed = True

        if onlyDefaults and not defaultUsed:
            return

        # From here on discType is compared against the D_* constants, which
        # are one lower than the values stored in indiData.
        discType -= 1
        try:
            if discType == self.D_LEAVE: # leave continuous
                discretizer = None
            elif discType == self.D_ENTROPY:
                discretizer = orange.EntropyDiscretization(attr, self.data)
            elif discType == self.D_FREQUENCY:
                discretizer = orange.EquiNDiscretization(attr, self.data, numberOfIntervals = intervals)
            elif discType == self.D_WIDTH:
                discretizer = orange.EquiDistDiscretization(attr, self.data, numberOfIntervals = intervals)
            elif discType == self.D_REMOVE:
                discretizer = False
            else:
                discretizer = orange.IntervalDiscretizer(points = customs).constructVariable(attr)
        except:
            # NOTE(review): bare except -- every failure is reported to the
            # user only as "<can't discretize>" below.
            discretizer = False


        self.discretizers[idx] = discretizer

        # Build the text shown next to the attribute: the split points, or a
        # marker when nothing could be or was discretized.
        if discType == self.D_LEAVE:
            discInts = ""
        elif discType == self.D_REMOVE:
            discInts = ""
        elif not discretizer:
            discInts = ": "+"<can't discretize>"
        else:
            points = discretizer.getValueFrom.transformer.points
            # An empty list of points is shown as "<removed>".
            discInts = points and (": " + ", ".join([str(attr(x)) for x in points])) or ": "+"<removed>"
        self.indiLabels[i] = discInts + discName
        self.attrList.reset()

        # Keep the graph in sync when this is the currently selected attribute.
        if i == self.selectedAttr:
            self.graph.setSplits(discretizer and discretizer.getValueFrom.transformer.points or [])