Ejemplo n.º 1
0
def standardizeData(dataMatrix):
    """Scale ``dataMatrix`` by the standard deviation taken along axis 0.

    For a 2-D matrix this divides every column by that column's standard
    deviation. The data is only scaled here, not centred.

    Returns
    -------
    (deviations, scaled) : the axis-0 standard deviations and the scaled data.

    Raises
    ------
    orange.KernelException : if any of the standard deviations equals zero.
    """
    deviations = numpy.std(dataMatrix, axis=0)
    has_constant = 0. in deviations
    if has_constant:
        raise orange.KernelException("Constant variable, cannot standardize!")
    scaled = dataMatrix * 1. / deviations
    return deviations, scaled
Ejemplo n.º 2
0
    def __call__(self, dataset):
        """Project ``dataset`` onto the stored principal components.

        The examples are restricted to ``self.domain``, passed through
        ``self.imputer`` and ``self.continuizer``, shifted by ``self.center``
        and, when ``self.deviation`` is set, scaled by ``1 / self.deviation``
        before being multiplied with ``self.loadings``.

        Side effects: the projected matrix is stored in ``self._dataMatrix``;
        when ``dataset`` has a class variable the class values are stored in
        ``self._classArray`` (it is left untouched otherwise).

        Returns a new ``orange.ExampleTable`` with one ``PC<n>`` column per
        entry of ``self.evalues``, joined with the class column of
        ``dataset`` when it has one.

        Raises ``orange.KernelException`` if preparing the data raises a
        ``TypeError``.
        """
        try:
            # Restrict the examples to the domain stored on this object, then
            # impute and continuize them with the stored preprocessors.
            attrDataset = dataset.select(self.domain)
            imputer = self.imputer(attrDataset)
            attrDataset = imputer(attrDataset)
            domain = self.continuizer(attrDataset)
            attrDataset = attrDataset.translate(domain)
        except TypeError:
            raise orange.KernelException("One or more attributes form training set are missing!")

        # toNumpy() yields three values; only the attribute matrix is used.
        dataMatrix, _, _ = attrDataset.toNumpy()

        dataMatrix -= self.center
        # Identity test: comparing an array with `!= None` is evaluated
        # elementwise and has no unambiguous truth value.
        if self.deviation is not None:
            dataMatrix *= 1. / self.deviation

        # Keep the projected data on the instance for later use.
        self._dataMatrix = numpy.dot(dataMatrix, self.loadings)

        attributes = [orange.FloatVariable("PC%d" % number)
                      for number in range(1, len(self.evalues) + 1)]
        new_domain = orange.Domain(attributes)
        new_table = orange.ExampleTable(new_domain, self._dataMatrix)

        if dataset.domain.classVar:
            # Suboptimal: the class column is re-selected from the input and
            # joined onto the projected table.
            classTable = dataset.select([dataset.domain.classVar.name])
            self._classArray = numpy.array([row.getclass() for row in classTable])
            new_table = orange.ExampleTable([new_table, classTable])

        return new_table
Ejemplo n.º 3
0
 def plot(self, title = 'Scree plot', filename = 'scree_plot.png'):
     """
     Draws a scree plot (bar chart of ``self.evalues``). Matplotlib is needed.

     Parameters
     ----------
     title : Title of the plot
     filename : File name under which plot will be saved (default: scree_plot.png)
         If None, plot will be shown

     Raises
     ------
     orange.KernelException : if ``mathlib_import`` is false.
     """

     if not mathlib_import:
         raise orange.KernelException("Matplotlib was not imported!")

     # plt.clf() is deliberately not used here: it opens two windows.
     fig = plt.figure()
     ax = fig.add_subplot(111)

     x_axis = list(range(len(self.evalues)))
     x_labels = ["PC%d" % (i + 1, ) for i in x_axis]

     ax.set_xticks(x_axis)
     ax.set_xticklabels(x_labels)
     ax.set_xlabel('Principal components')
     ax.set_ylabel('Variance')
     ax.set_title(title + "\n")
     # Bar positions and heights are passed positionally: the `left` keyword
     # was renamed to `x` in Matplotlib 2.0, so naming it breaks on newer
     # releases while the positional form works on old and new ones.
     ax.bar(x_axis, self.evalues, align = 'center')
     # Upper y limit is derived from the first eigenvalue
     # (presumably the largest one -- confirm ordering of self.evalues).
     ax.axis([-0.5, len(self.evalues) - 0.5, 0, self.evalues[0]*1.05])
     if filename:
         plt.savefig(filename)
     else:
         plt.show()
Ejemplo n.º 4
0
def discretizeDomain(data, removeUnusedValues=1, numberOfIntervals=2):
    """Return a copy of ``data`` in which continuous variables are discretized.

    Parameters
    ----------
    data : example table to discretize; ``None`` or an empty table yields
        ``None``.
    removeUnusedValues : when true, every attribute is additionally passed
        through ``orange.RemoveUnusedValues``; attributes for which that
        returns ``None`` are dropped with a warning.
    numberOfIntervals : number of intervals handed to
        ``orange.EquiNDiscretization``.

    A continuous class variable is discretized first with the equi-N
    discretizer. Continuous attributes use entropy discretization when the
    (possibly discretized) class is discrete and equi-N discretization
    otherwise. Attributes whose discretization raises
    ``orange.KernelException`` are skipped with a warning.

    Returns
    -------
    ``data`` translated to the list of resulting attributes (plus the class
    variable when one is kept), or ``None`` for empty input.
    """
    if not data or len(data) == 0:
        return None

    entroDisc = orange.EntropyDiscretization()
    equiDisc = orange.EquiNDiscretization(numberOfIntervals=numberOfIntervals)

    classVar = data.domain.classVar
    className = (classVar.name or None) if classVar else None

    # A continuous class has to be discretized before the attributes can be.
    if className and classVar.varType == orange.VarTypes.Continuous:
        try:
            newClass = equiDisc(classVar.name, data)
            newClass.name = className
        except orange.KernelException as ex:
            # Format the exception itself: `.message` does not exist on
            # Python 3 exceptions and would raise AttributeError here.
            warnings.warn("Could not discretize class variable '%s'. %s" %
                          (classVar.name, ex))
            newClass = None
            className = None
        newDomain = orange.Domain(data.domain.attributes, newClass)
        data = orange.ExampleTable(newDomain, data)

    # Re-read the class variable: `data` may have been rebuilt above.
    currentClass = data.domain.classVar
    hasDiscreteClass = bool(
        currentClass and currentClass.varType == orange.VarTypes.Discrete)

    discAttrs = []
    for attr in data.domain.attributes:
        try:
            name = attr.name
            if attr.varType == orange.VarTypes.Continuous:
                # Entropy discretization needs a discrete class; fall back to
                # equi-N discretization when there is none.
                if hasDiscreteClass:
                    new_attr = entroDisc(attr, data)
                else:
                    new_attr = equiDisc(attr, data)
            else:
                new_attr = attr
            if removeUnusedValues:
                new_attr = orange.RemoveUnusedValues(new_attr, data)
                if new_attr is None:
                    raise orange.KernelException("No values")

            new_attr.name = name
            discAttrs.append(new_attr)
        except orange.KernelException as ex:
            # E.g. entropy discretization raises when all values are missing;
            # such attributes are ignored.
            warnings.warn("Could not discretize %s attribute. %s" %
                          (attr.name, ex))

    if className:
        discAttrs.append(data.domain.classVar)
    return data.translate(discAttrs, True)
Ejemplo n.º 5
0
 def biplot(self, choices = [1,2], scale = 1., title = 'Biplot',
            filename = 'biplot.png'):
     """
     Draws biplot for PCA. Matplotlib is needed. Actual projection must be
     performed via pca(data) before biplot can be used.

     Parameters
     ----------
     choices : length 2 list-like object for choosing which 2 components
         should be used in biplot (1-based). Default is first and second.
         The first entry is drawn on the x axis, the second on the y axis.
     scale : scale factor (default is 1.). Should be inside [0, 1]
     title : title of biplot
     filename : File name under which plot will be saved (default: biplot.png)
         If None, plot will be shown

     Raises
     ------
     orange.KernelException : if ``mathlib_import`` is false, if no projected
         data is stored on the instance, if ``choices`` does not hold exactly
         two entries, or if an entry lies outside ``1..len(self.evalues)``.
     """

     if not mathlib_import:
         raise orange.KernelException("Matplotlib was not imported!")

     # Identity test: `== None` on an array is evaluated elementwise and has
     # no unambiguous truth value.
     if self._dataMatrix is None:
         raise orange.KernelException("No data available for biplot!")

     if len(choices) != 2:
         raise orange.KernelException('You have to choose exactly two components')

     if max(choices[0], choices[1]) > len(self.evalues) or min(choices[0], choices[1]) < 1:
         raise orange.KernelException('Invalid choices')

     # Ordered 0-based column indices. A boolean mask would always pick the
     # components in ascending order (mismatching the axis labels for e.g.
     # choices=[2, 1]) and would collapse to one column for equal choices.
     columns = [int(choices[0]) - 1, int(choices[1]) - 1]

     dataMatrix = self._dataMatrix[:, columns]
     loadings = self.loadings[:, columns]
     lam = numpy.array(self.evalues)[columns]
     lam *= sqrt(len(self._dataMatrix))

     if scale < 0. or scale > 1.:
         print("Warning: 'scale' is outside [0, 1]")
     lam = lam**scale

     #TO DO -> pc.biplot (maybe)
     trDataMatrix = dataMatrix / lam
     trLoadings = loadings * lam

     # Symmetric axis limits with some head room around the extreme values.
     max_data_value = numpy.max(abs(trDataMatrix)) * 1.05
     max_load_value = numpy.max(abs(trLoadings)) * 1.5

     #plt.clf()
     fig = plt.figure()
     ax1 = fig.add_subplot(111)
     ax1.set_title(title + "\n")
     ax1.set_xlabel("PC%s" % (choices[0]))
     ax1.set_ylabel("PC%s" % (choices[1]))
     ax1.xaxis.set_label_position('bottom')
     ax1.xaxis.set_ticks_position('bottom')
     ax1.yaxis.set_label_position('left')
     ax1.yaxis.set_ticks_position('left')

     if self._classArray is None:
         trDataMatrix = transpose(trDataMatrix)
         ax1.plot(trDataMatrix[0], trDataMatrix[1], Colors[0])
     else:
         #suboptimal: collect the distinct class values in first-seen order
         classValues = []
         for classValue in self._classArray:
             if classValue not in classValues:
                 classValues.append(classValue)
         # One plot call per class so each class gets its own colour/label.
         for i, classValue in enumerate(classValues):
             mask = numpy.array([classValue == cv for cv in self._classArray])
             partialDataMatrix = transpose(trDataMatrix[mask])
             ax1.plot(partialDataMatrix[0], partialDataMatrix[1],
                      Colors[i % len(Colors)], label = str(classValue))
         ax1.legend()

     ax1.set_xlim(-max_data_value, max_data_value)
     ax1.set_ylim(-max_data_value, max_data_value)

     #eliminate double axis on right
     ax0 = ax1.twinx()
     ax0.yaxis.set_visible(False)

     # Second pair of axes (top/right, red ticks) carries the loadings.
     ax2 = ax0.twiny()
     ax2.xaxis.set_label_position('top')
     ax2.xaxis.set_ticks_position('top')
     ax2.yaxis.set_label_position('right')
     ax2.yaxis.set_ticks_position('right')
     for tl in ax2.get_xticklabels():
         tl.set_color('r')
     for tl in ax2.get_yticklabels():
         tl.set_color('r')

     arrowprops = dict(facecolor = 'red', edgecolor = 'red', width = 1, headwidth = 4)
     #using annotations instead of arrows because there is a strange implementation
     #of arrows in matplotlib version 0.99
     for i in range(len(trLoadings)):
         x, y = trLoadings[i]
         ax2.annotate('', (x, y), (0, 0), arrowprops = arrowprops)
         ax2.text(x * 1.1, y * 1.2, self.domain[i], color = 'red')

     ax2.set_xlim(-max_load_value, max_load_value)
     ax2.set_ylim(-max_load_value, max_load_value)

     if filename:
         plt.savefig(filename)
     else:
         plt.show()