Esempio n. 1
0
def get_distributions_for_columns(data, columns):
    """Compute the distributions for the given columns of `data`.

    Parameters
    ----------
    data : data.Table
        Table (storage) whose columns are summarised.
    columns : list
        Columns to process. Each entry is either an :class:`int` index
        into `data.domain` or an object that `data.domain.index` can
        resolve to one (e.g. an instance of `Orange.data.Variable`).

    Returns
    -------
    list
        The result of `get_distribution` for each processed column.
    """
    domain = data.domain

    # Resolve every column specification to an integer index.
    indices = []
    for spec in columns:
        if not isinstance(spec, int):
            spec = domain.index(spec)
        indices.append(spec)

    try:
        # Fast path: ask the table (storage) to compute everything at once.
        precomputed = data._compute_distributions(indices)
    except NotImplementedError:
        # No optimized implementation available; go column by column.
        return [get_distribution(data, index) for index in indices]

    # `precomputed` holds one (values, unknowns) pair per requested column.
    results = []
    for index, (values, unknowns) in zip(indices, precomputed):
        results.append(get_distribution(values, domain[index], unknowns))
    return results
Esempio n. 2
0
    def from_data(cls, variable, data):
        """Build the distribution of `variable` from `data`.

        The variable is first resolved with `_get_variable`. The counts are
        requested from `data._compute_distributions`; if that raises
        `NotImplementedError`, they are assembled here from the column.

        Parameters
        ----------
        variable
            Variable specification, resolved through `_get_variable`.
        data
            Data storage. Must provide `_compute_distributions`,
            `has_weights`, column indexing (``data[:, variable]``) and,
            when weighted, the weights `W`.

        Returns
        -------
        A new instance of `cls` with the shape of the computed distribution,
        filled with it, and with the `unknowns` and `variable` attributes set.
        """
        variable = _get_variable(data, variable)
        try:
            dist, unknowns = data._compute_distributions([variable])[0]
        except NotImplementedError:
            col = data[:, variable]
            dtype = col.dtype
            if data.has_weights():
                weights = np.asarray(data.W)
                # Promote a non-float column to the weights' float dtype so
                # fractional weights are not truncated when written into the
                # second row. (The previous check compared the column dtype
                # with itself, so the promotion could never trigger.)
                if "float" not in dtype.name and "float" in weights.dtype.name:
                    dtype = weights.dtype
                # Row 0 holds the values, row 1 the matching weights.
                dist = np.empty((2, len(col)), dtype=dtype)
                dist[0, :] = col
                dist[1, :] = weights
            else:
                # Unweighted data: every value counts with weight 1.
                dist = np.ones((2, len(col)), dtype=dtype)
                dist[0, :] = col
            # NOTE(review): sort(axis=0) orders the two entries within each
            # (value, weight) column; it does not order the columns by value.
            # Confirm this is what `_orange.valuecount` expects.
            dist.sort(axis=0)
            dist = np.array(_orange.valuecount(dist))
            # Length of the column minus the number of columns left in the
            # counted result.
            unknowns = len(col) - dist.shape[1]

        self = super().__new__(cls, dist.shape)
        self[:] = dist
        self.unknowns = unknowns
        self.variable = variable
        return self
Esempio n. 3
0
 def from_data(cls, data, variable):
     """Build the distribution of `variable` from `data`.

     The variable is resolved with `_get_variable`. The counts come from
     `data._compute_distributions` when available; if that raises
     `NotImplementedError`, the instances of `data` are tallied one by one
     into an array with one slot per entry of `variable.values`.

     Parameters
     ----------
     data
         Data storage. Must provide `_compute_distributions`, `has_weights`,
         iteration over instances and, when weighted, the weights `W`.
     variable
         Variable specification, resolved through `_get_variable`.

     Returns
     -------
     A new instance of `cls` holding the counts, with the `unknowns` and
     `variable` attributes set. NaN values are accumulated in `unknowns`.
     """
     variable = _get_variable(data, variable)
     try:
         counts, n_missing = data._compute_distributions([variable])[0]
         self = super().__new__(cls, len(counts))
         self[:] = counts
         self.unknowns = n_missing
     except NotImplementedError:
         # Slow fallback: start from all-zero counts and tally each instance.
         size = len(variable.values)
         self = super().__new__(cls, size)
         self[:] = np.zeros(size)
         self.unknowns = 0
         if not data.has_weights():
             for row in data:
                 value = row[variable]
                 if not (value == value):
                     # A value unequal to itself is NaN, i.e. unknown.
                     self.unknowns += 1
                     continue
                 self[int(value)] += 1
         else:
             for row, weight in zip(data, data.W):
                 value = row[variable]
                 if np.isnan(value):
                     self.unknowns += weight
                 else:
                     self[int(value)] += weight
     self.variable = variable
     return self
Esempio n. 4
0
 def from_data(cls, data, variable):
     """Build the distribution of `variable` from `data`.

     The variable is resolved with `_get_variable`. The counts come from
     `data._compute_distributions` when available; if that raises
     `NotImplementedError`, the instances of `data` are tallied one by one
     into an array with one slot per entry of `variable.values`.

     Parameters
     ----------
     data
         Data storage. Must provide `_compute_distributions`, `has_weights`,
         iteration over instances and, when weighted, the weights `W`.
     variable
         Variable specification, resolved through `_get_variable`.

     Returns
     -------
     A new instance of `cls` holding the counts, with the `unknowns` and
     `variable` attributes set. NaN values are accumulated in `unknowns`.
     """
     variable = _get_variable(data, variable)
     try:
         dist, unknowns = data._compute_distributions([variable])[0]
         self = super().__new__(cls, len(dist))
         self[:] = dist
         self.unknowns = unknowns
     except NotImplementedError:
         # Allocate through the class constructor, as the fast path does.
         # A plain array from np.zeros is not a `cls` instance and rejects
         # the `unknowns` / `variable` attribute assignments below.
         self = super().__new__(cls, len(variable.values))
         self[:] = 0
         self.unknowns = 0
         if data.has_weights():
             # Read values per instance, exactly like the unweighted branch,
             # pairing each instance with its weight.
             for inst, w in zip(data, data.W):
                 val = inst[variable]
                 if math.isnan(val):
                     self.unknowns += w
                 else:
                     # Values are NaN-checkable numbers; array indices must
                     # be integers.
                     self[int(val)] += w
         else:
             for inst in data:
                 val = inst[variable]
                 if val == val:  # False only for NaN
                     self[int(val)] += 1
                 else:
                     self.unknowns += 1
     self.variable = variable
     return self
Esempio n. 5
0
    def from_data(cls, variable, data):
        """Build the distribution of `variable` from `data`.

        The variable is first resolved with `_get_variable`. The counts are
        requested from `data._compute_distributions`; if that raises
        `NotImplementedError`, they are assembled here from the column.

        Parameters
        ----------
        variable
            Variable specification, resolved through `_get_variable`.
        data
            Data storage. Must provide `_compute_distributions`,
            `has_weights`, column indexing (``data[:, variable]``) and,
            when weighted, the weights `W`.

        Returns
        -------
        A new instance of `cls` with the shape of the computed distribution,
        filled with it, and with the `unknowns` and `variable` attributes set.
        """
        variable = _get_variable(data, variable)
        try:
            dist, unknowns = data._compute_distributions([variable])[0]
        except NotImplementedError:
            col = data[:, variable]
            dtype = col.dtype
            if data.has_weights():
                weights = np.asarray(data.W)
                # Promote a non-float column to the weights' float dtype so
                # fractional weights are not truncated when written into the
                # second row. (The previous check compared the column dtype
                # with itself, so the promotion could never trigger.)
                if "float" not in dtype.name and "float" in weights.dtype.name:
                    dtype = weights.dtype
                # Row 0 holds the values, row 1 the matching weights.
                dist = np.empty((2, len(col)), dtype=dtype)
                dist[0, :] = col
                dist[1, :] = weights
            else:
                # Unweighted data: every value counts with weight 1.
                dist = np.ones((2, len(col)), dtype=dtype)
                dist[0, :] = col
            # NOTE(review): sort(axis=0) orders the two entries within each
            # (value, weight) column; it does not order the columns by value.
            # Confirm this is what `_orange.valuecount` expects.
            dist.sort(axis=0)
            dist = np.array(_orange.valuecount(dist))
            # Length of the column minus the number of columns left in the
            # counted result.
            unknowns = len(col) - dist.shape[1]

        self = super().__new__(cls, dist.shape)
        self[:] = dist
        self.unknowns = unknowns
        self.variable = variable
        return self
Esempio n. 6
0
def get_distributions_for_columns(data, columns):
    """Return the distributions of several columns of `data`.

    Parameters
    ----------
    data : data.Table
        Table (storage) the distributions are computed from.
    columns : list
        Column specifications. An :class:`int` is used as an index into
        `data.domain` directly; anything else (e.g. an instance of
        `Orange.data.Variable`) is converted with `data.domain.index`.

    Returns
    -------
    list
        The result of `get_distribution` for each processed column.
    """
    domain = data.domain

    def as_index(column):
        # Integers are already indices; everything else is looked up.
        return column if isinstance(column, int) else domain.index(column)

    column_ids = [as_index(column) for column in columns]

    try:
        # Optimized path: query the table (storage) directly.
        computed = data._compute_distributions(column_ids)
    except NotImplementedError:
        # Default, slower implementation: one column at a time.
        return [get_distribution(data, column_id) for column_id in column_ids]

    # `computed` is a list of (values, unknowns) pairs, one per column.
    distributions = []
    for column_id, (values, n_unknown) in zip(column_ids, computed):
        variable = domain[column_id]
        distributions.append(get_distribution(values, variable, n_unknown))
    return distributions