def select(data, rows, domain):
    """
    Select the data subset with specified rows and domain subsets.

    If either `rows` or `domain` is None it means "select all" along that
    axis.

    :param data: source table
    :type data: Orange.data.Table
    :param rows: row indices to keep, or None for all rows
    :param domain: target domain, or None to keep the current domain
    :return: Orange.data.Table
    """
    if rows is not None and domain is not None:
        return data.from_table(domain, data, rows)
    elif rows is not None:
        # BUG FIX: the source table must be passed explicitly; previously
        # `rows` was passed in the source-table position of from_table.
        return data.from_table(data.domain, data, rows)
    elif domain is not None:
        return data.from_table(domain, data)
    else:
        return data
def __call__(self, data):
    """
    Compute and apply discretization of the given data. Returns a new
    data table.

    :param data: data
    :type data: Orange.data.Table
    :return: Orange.data.Table
    """
    discretizer = self.method or discretize.EqualFreq()

    def discretized(var):
        # Non-continuous variables pass through unchanged; continuous
        # ones are discretized and dropped when fewer than two bins
        # result (i.e. the variable is effectively constant).
        if not is_continuous(var):
            return var
        new_var = discretizer(data, var)
        if new_var is not None and len(new_var.values) >= 2:
            return new_var
        return None

    attrs = []
    for var in data.domain.attributes:
        candidate = discretized(var)
        if candidate is not None:
            attrs.append(candidate)
    domain = Orange.data.Domain(
        attrs, data.domain.class_vars, data.domain.metas)
    return data.from_table(domain, data)
def __call__(self, data):
    """
    Compute and apply discretization of the given data. Returns a new
    data table.

    Parameters
    ----------
    data : Orange.data.Table
        A data table to be discretized.
    """
    discretizer = self.method or discretize.EqualFreq()

    def discretized(var):
        # Discretize continuous variables; others pass through unchanged.
        if not var.is_continuous:
            return var
        new_var = discretizer(data, var)
        if new_var is None:
            return None
        # Keep near-constant results only when remove_const is disabled.
        if len(new_var.values) >= 2 or not self.remove_const:
            return new_var
        return None

    attributes = []
    for var in data.domain.attributes:
        candidate = discretized(var)
        if candidate is not None:
            attributes.append(candidate)
    domain = Orange.data.Domain(
        attributes, data.domain.class_vars, data.domain.metas)
    return data.from_table(domain, data)
def __call__(self, data):
    """Replace all attributes with Gaussian-smoothed compute values."""
    shared = _GaussianCommon(self.sd, data.domain)
    smoothed = []
    for pos, attr in enumerate(data.domain.attributes):
        smoothed.append(
            attr.copy(compute_value=GaussianFeature(pos, shared)))
    new_domain = Orange.data.Domain(
        smoothed, data.domain.class_vars, data.domain.metas)
    return data.from_table(new_domain, data)
def __call__(self, data):
    """
    Compute a rubberband (convex-hull) baseline per spectrum and either
    subtract it from each row (self.sub == 0) or return the baseline
    itself.
    """
    if data.domain != self.domain:
        data = data.from_table(self.domain, data)
    xs, xsind, mon, X = _transform_to_sorted_features(data)
    x = xs[xsind]
    newd = np.zeros_like(data.X)
    for rowi, row in enumerate(X):
        # remove NaNs which ConvexHull can not handle
        source = np.column_stack((x, row))
        source = source[~np.isnan(source).any(axis=1)]
        try:
            v = ConvexHull(source).vertices
        except QhullError:
            # FIXME notify user
            # Hull computation failed (e.g. degenerate input); fall back
            # to a zero baseline for this row.
            baseline = np.zeros_like(row)
        else:
            # Keep only the lower (PeakPositive) or upper (PeakNegative)
            # part of the hull by rotating the vertex list so it starts
            # at the extreme x vertex.
            if self.peak_dir == RubberbandBaseline.PeakPositive:
                v = np.roll(v, -v.argmin())
                v = v[:v.argmax() + 1]
            elif self.peak_dir == RubberbandBaseline.PeakNegative:
                v = np.roll(v, -v.argmax())
                v = v[:v.argmin() + 1]
            # If there are NaN values at the edges of data then convex hull
            # does not include the endpoints. Because the same values are also
            # NaN in the current row, we can fill them with NaN (bounds_error
            # achieves this).
            baseline = interp1d(source[v, 0], source[v, 1],
                                bounds_error=False)(x)
        finally:
            # sub == 0 means "subtract the baseline"; otherwise the
            # baseline itself is the output row.
            if self.sub == 0:
                newd[rowi] = row - baseline
            else:
                newd[rowi] = baseline
    return _transform_back_to_features(xsind, mon, newd)
def __call__(self, data):
    """Center and/or scale continuous attributes; others pass through."""
    if self.center is None and self.scale is None:
        return data

    def scaled(var):
        dist = distribution.get_distribution(data, var)
        offset = 0
        if self.center != self.NoCentering:
            offset = self.center(dist)
            dist[0, :] -= offset
        span = 1
        if self.scale != self.NoScaling:
            span = self.scale(dist)
            if span < 1e-15:
                # guard against division by (near) zero
                span = 1
        new_var = var.copy(
            compute_value=transformation.Normalizer(var, offset, 1 / span))
        if span != 1:
            new_var.number_of_decimals = 3
        return new_var

    new_attrs = [scaled(var) if var.is_continuous else var
                 for var in data.domain.attributes]
    domain = Orange.data.Domain(new_attrs, data.domain.class_vars,
                                data.domain.metas)
    return data.from_table(domain, data)
def __call__(self, data):
    """
    Normalize each row of *data* by the configured method and return the
    resulting numpy array (data.X).
    """
    if data.domain != self.domain:
        data = data.from_table(self.domain, data)
    if data.X.shape[0] == 0:
        # nothing to normalize
        return data.X
    data = data.copy()
    if self.method == Normalize.Vector:
        nans = np.isnan(data.X)
        nan_num = nans.sum(axis=1, keepdims=True)
        ys = data.X
        if np.any(nan_num > 0):
            # interpolate nan elements for normalization
            x = getx(data)
            ys = interp1d_with_unknowns_numpy(x, ys, x)
            ys = np.nan_to_num(ys)  # edge elements can still be zero
        data.X = sknormalize(ys, norm='l2', axis=1, copy=False)
        if np.any(nan_num > 0):
            # keep nans where they were
            data.X[nans] = float("nan")
    elif self.method == Normalize.Area:
        norm_data = Integrate(methods=self.int_method,
                              limits=[[self.lower, self.upper]])(data)
        data.X /= norm_data.X
    elif self.method == Normalize.Attribute:
        if self.attr in data.domain and isinstance(
                data.domain[self.attr], Orange.data.ContinuousVariable):
            ndom = Orange.data.Domain([data.domain[self.attr]])
            factors = data.transform(ndom)
            data.X /= factors.X
            # FIX: removed the unused local `nd = data.domain[self.attr]`
        else:
            # invalid attribute for normalization
            data.X *= float("nan")
    return data.X
def __call__(self, data):
    """
    Build one integrated variable per limits pair and attach them as
    attributes (or as metas when self.metas is set).
    """
    common = _IntegrateCommon(data.domain)
    atts = []
    if self.limits:
        methods = self.methods
        if not isinstance(methods, Iterable):
            # a single method is applied to every limits pair
            methods = [methods] * len(self.limits)
        # BUG FIX: copy the names so that uniquifying them below does not
        # mutate a caller-supplied self.names list in place.
        names = list(self.names) if self.names else \
            [" - ".join("{0}".format(e) for e in l) for l in self.limits]
        # no names in data should be repeated
        used_names = [var.name
                      for var in data.domain.variables + data.domain.metas]
        for i, n in enumerate(names):
            n = get_next_name(used_names, n)
            names[i] = n
            used_names.append(n)
        for limits, method, name in zip(self.limits, methods, names):
            atts.append(Orange.data.ContinuousVariable(
                name=name, compute_value=method(limits, common)))
    if not self.metas:
        domain = Orange.data.Domain(atts, data.domain.class_vars,
                                    metas=data.domain.metas)
    else:
        domain = Orange.data.Domain(data.domain.attributes,
                                    data.domain.class_vars,
                                    metas=data.domain.metas + tuple(atts))
    return data.from_table(domain, data)
def __call__(self, data):
    """Center/scale continuous attributes via Normalizer compute values."""
    if self.center is None and self.scale is None:
        return data

    def normalized(var):
        dist = distribution.get_distribution(data, var)
        offset = 0
        if self.center:
            offset = self.center(dist)
            dist[0, :] -= offset
        span = 1
        if self.scale:
            span = self.scale(dist)
            if span < 1e-15:
                # avoid dividing by (near) zero
                span = 1
        return var.copy(
            compute_value=preprocess.transformation.Normalizer(
                var, offset, 1 / span))

    new_attrs = [normalized(var) if var.is_continuous else var
                 for var in data.domain.attributes]
    domain = Orange.data.Domain(new_attrs, data.domain.class_vars,
                                data.domain.metas)
    return data.from_table(domain, data)
def __call__(self, data):
    """Continuize *data*'s domain and convert the table to it."""
    # NOTE(review): "multinimial_treatment" looks like a typo for
    # "multinomial_treatment" — confirm the attribute name declared on
    # this class before renaming; callers may rely on the misspelling.
    dc = continuizer.DomainContinuizer(
        zero_based=self.zero_based,
        multinomial_treatment=self.multinimial_treatment,
    )
    domain = dc(data)
    return data.from_table(domain, data)
def __call__(self, data):
    """Normalize each row of *data*; returns the resulting numpy array."""
    if data.domain != self.domain:
        data = data.from_table(self.domain, data)
    if data.X.shape[0] == 0:
        # nothing to normalize
        return data.X
    data = data.copy()
    if self.method == Normalize.Vector:
        nans = np.isnan(data.X)
        nan_num = nans.sum(axis=1, keepdims=True)
        ys = data.X
        if np.any(nan_num > 0):
            # interpolate nan elements for normalization
            x = getx(data)
            ys = interp1d_with_unknowns_numpy(x, ys, x)
            ys = np.nan_to_num(ys)  # edge elements can still be zero
        data.X = sknormalize(ys, norm='l2', axis=1, copy=False)
        if np.any(nan_num > 0):
            # keep nans where they were
            data.X[nans] = float("nan")
    elif self.method == Normalize.Area:
        norm_data = Integrate(method=self.int_method,
                              limits=[[self.lower, self.upper]])(data)
        data.X /= norm_data.X
    elif self.method == Normalize.Attribute:
        # attr normalization applies to entire spectrum, regardless of limits
        # meta indices are -ve and start at -1
        if self.attr not in (None, "None", ""):
            # map Orange's negative meta index (-1, -2, ...) to the
            # 0-based column position inside data.metas
            attr_index = -1 - data.domain.index(self.attr)
            factors = data.metas[:, attr_index].astype(float)
            data.X /= factors[:, None]
    return data.X
def __call__(self, data):
    """Shift every attribute by self.amount via compute values."""
    shared = _CurveShiftCommon(self.amount, data.domain)
    shifted = [attr.copy(compute_value=CurveShiftFeature(pos, shared))
               for pos, attr in enumerate(data.domain.attributes)]
    new_domain = Orange.data.Domain(shifted, data.domain.class_vars,
                                    data.domain.metas)
    return data.from_table(new_domain, data)
def __call__(self, data):
    """Continuize *data*'s domain and convert the table to it."""
    # NOTE(review): "multinimial_treatment" looks like a typo for
    # "multinomial_treatment" — confirm the attribute name declared on
    # this class before renaming; callers may rely on the misspelling.
    dc = continuize.DomainContinuizer(
        zero_based=self.zero_based,
        multinomial_treatment=self.multinimial_treatment,
    )
    domain = dc(data)
    return data.from_table(domain, data)
def __call__(self, data):
    """Attach linear-baseline-corrected compute values to all attributes."""
    shared = _LinearBaselineCommon(self.peak_dir, self.sub, data.domain)
    corrected = []
    for pos, attr in enumerate(data.domain.attributes):
        corrected.append(
            attr.copy(compute_value=LinearBaselineFeature(pos, shared)))
    new_domain = Orange.data.Domain(corrected, data.domain.class_vars,
                                    data.domain.metas)
    return data.from_table(new_domain, data)
def __call__(self, data):
    """Attach Savitzky-Golay smoothed compute values to all attributes."""
    shared = _SavitzkyGolayCommon(self.window, self.polyorder,
                                  self.deriv, data.domain)
    smoothed = [attr.copy(compute_value=SavitzkyGolayFeature(pos, shared))
                for pos, attr in enumerate(data.domain.attributes)]
    new_domain = Orange.data.Domain(smoothed, data.domain.class_vars,
                                    data.domain.metas)
    return data.from_table(new_domain, data)
def __call__(self, data):
    """Attach row-normalizing compute values to all attributes."""
    shared = _NormalizeCommon(self.method, self.lower, self.upper,
                              self.int_method, self.attr, data.domain)
    normalized = []
    for pos, attr in enumerate(data.domain.attributes):
        normalized.append(
            attr.copy(compute_value=NormalizeFeature(pos, shared)))
    new_domain = Orange.data.Domain(normalized, data.domain.class_vars,
                                    data.domain.metas)
    return data.from_table(new_domain, data)
def __call__(self, data):
    """Attach zero-point linear-baseline compute values to all attributes."""
    shared = _LinearBaselineCommon(self.peak_dir, self.sub,
                                   self.zero_points, data.domain)
    corrected = [attr.copy(compute_value=LinearBaselineFeature(pos, shared))
                 for pos, attr in enumerate(data.domain.attributes)]
    new_domain = Orange.data.Domain(corrected, data.domain.class_vars,
                                    data.domain.metas)
    return data.from_table(new_domain, data)
def __call__(self, data):
    """Build a continuized domain and convert *data* into it."""
    from . import continuize
    domain_builder = continuize.DomainContinuizer(
        zero_based=self.zero_based,
        multinomial_treatment=self.multinomial_treatment)
    new_domain = domain_builder(data)
    return data.from_table(new_domain, data)
def __call__(self, data):
    """Attach Savitzky-Golay smoothed compute values to all attributes."""
    shared = _SavitzkyGolayCommon(self.window, self.polyorder,
                                  self.deriv, data.domain)
    smoothed = []
    for pos, attr in enumerate(data.domain.attributes):
        smoothed.append(
            attr.copy(compute_value=SavitzkyGolayFeature(pos, shared)))
    new_domain = Orange.data.Domain(smoothed, data.domain.class_vars,
                                    data.domain.metas)
    return data.from_table(new_domain, data)
def __call__(self, data, variable):
    """Fit a model predicting *variable* and wrap it as a compute value."""
    variable = data.domain[variable]
    class_domain = domain_with_class_var(data.domain, variable)
    training = data.from_table(class_domain, data)
    model = self.learner(training)
    assert model.domain.class_var == variable
    return variable.copy(
        compute_value=ReplaceUnknownsModel(variable, model))
def __call__(self, data):
    """Apply a Savitzky-Golay filter to every row of *data*."""
    if data.domain != self.domain:
        data = data.from_table(self.domain, data)
    spectra = data.X
    return savgol_filter(spectra,
                         window_length=self.window,
                         polyorder=self.polyorder,
                         deriv=self.deriv,
                         mode="nearest")
def __call__(self, data):
    """Replace each attribute with a derived continuous variable."""
    shared = self._cl_common(self.reference, data.domain)
    derived = []
    for pos, var in enumerate(data.domain.attributes):
        derived.append(Orange.data.ContinuousVariable(
            name=var.name, compute_value=self._cl_feature(pos, shared)))
    new_domain = Orange.data.Domain(
        derived, data.domain.class_vars, data.domain.metas)
    return data.from_table(new_domain, data)
def commit(self):
    """Normalize the data, optionally append meta columns, filter by
    z-score cutoff, and send both tables to the output channels."""
    if not self.data:
        self.send("Normalized expression array", None)
        self.send("Filtered expression array", None)
        return
    G, R = self.merged_splits
    Gc, Rc = self.centered
    ind1, ind2 = self.split_ind
    # normalization factor applied to the first split group
    gfactor = Gc / G
    domain = self.data.domain
    newmetas = []
    M = []  # columns of data for the appended meta variables
    _, _, axis = self.getSelectedGroup()
    if self.appendZScore and axis == 1:
        attr = Orange.data.ContinuousVariable("Z-Score")
        newmetas.append(attr)
        M.append(self.z_scores.filled(numpy.nan))
    if self.appendRIValues and axis == 1:
        r_attr = Orange.data.ContinuousVariable("Log Ratio")
        i_attr = Orange.data.ContinuousVariable("Intensity")
        ratio, intensity = expression.ratio_intensity(Gc, Rc)
        newmetas.extend([r_attr, i_attr])
        M.extend([ratio.filled(numpy.nan), intensity.filled(numpy.nan)])
    if newmetas:
        # extend the domain with the meta variables filled in below
        domain = Orange.data.Domain(
            self.data.domain.attributes, self.data.domain.class_vars,
            self.data.domain.metas + tuple(newmetas))
    data = Orange.data.Table.from_table(domain, self.data)
    data.ensure_copy()
    if axis == 0:
        data.X[ind1, :] *= gfactor.reshape((1, -1))
    else:
        data.X[:, ind1] *= gfactor.reshape((-1, 1))
    # fill the appended meta columns (they occupy the last len(M) slots)
    for i, mcol in enumerate(reversed(M)):
        data.metas[:, -i - 1] = mcol
    selected_indices = numpy.flatnonzero(
        numpy.abs(self.z_scores.filled(0)) >= self.zCutoff)
    if axis == 0:
        attrs = [data.domain[i] for i in selected_indices]
        domain = Orange.data.Domain(attrs, data.domain.class_vars,
                                    data.domain.metas)
        filtered_data = data.from_table(domain, data)
    else:
        filtered_data = data[selected_indices]
    self.send("Normalized expression array", data)
    self.send("Filtered expression array", filtered_data)
def test_meta_object_dtype(self):
    # gh-1875: Test on mixed string/discrete metas
    subset = self.data[::5]
    metas = [subset.domain["iris"], Orange.data.StringVariable("S")]
    mixed_domain = Orange.data.Domain(subset.domain.attributes, [], metas)
    subset = subset.from_table(mixed_domain, subset)
    self.send_signal(self.widget.Inputs.data, subset)
def __call__(self, data):
    """Build interpolated attributes at self.points and convert the table."""
    new_attrs = features_with_interpolation(
        self.points, self.kind, data.domain, self.handle_nans,
        interpfn=self.interpfn)
    new_domain = Orange.data.Domain(new_attrs, data.domain.class_vars,
                                    data.domain.metas)
    return data.from_table(new_domain, data)
def __call__(self, data):
    """Gaussian-smooth each row, preserving NaN positions."""
    if data.domain != self.domain:
        data = data.from_table(self.domain, data)
    xs, xsind, mon, X = _transform_to_sorted_features(data)
    X, nans = _nan_extend_edges_and_interpolate(xs[xsind], X)
    smoothed = gaussian_filter1d(X, sigma=self.sd, mode="nearest")
    if nans is not None:
        # restore NaNs at their original positions
        smoothed[nans] = np.nan
    return _transform_back_to_features(xsind, mon, smoothed)
def __call__(self, data):
    """Attach arPLS baseline-corrected compute values to all attributes."""
    shared = ARPLSCommon(self.lam, self.itermax, self.ratio, data.domain)
    corrected = []
    for pos, attr in enumerate(data.domain.attributes):
        corrected.append(attr.copy(compute_value=ARPLSFeature(pos, shared)))
    new_domain = Orange.data.Domain(corrected, data.domain.class_vars,
                                    data.domain.metas)
    return data.from_table(new_domain, data)
def partial_fit(self, data):
    """Incrementally fit the wrapped projection on *data*; returns self."""
    if not isinstance(data, Orange.data.Storage):
        # raw array input goes straight to the projection
        self.proj.partial_fit(data)
    else:
        if data.domain != self.pre_domain:
            data = data.from_table(self.pre_domain, data)
        self.proj.partial_fit(data.X)
    # mirror the fitted projection's state onto this wrapper
    self.__dict__.update(self.proj.__dict__)
    return self
def test_meta_object_dtype(self):
    # gh-1875: Test on mixed string/discrete metas
    sample = self.data[::5]
    meta_vars = [sample.domain["iris"], Orange.data.StringVariable("S")]
    new_domain = Orange.data.Domain(
        sample.domain.attributes, [], meta_vars)
    sample = sample.from_table(new_domain, sample)
    self.send_signal(self.widget.Inputs.data, sample)
def __call__(self, data):
    """Attach despiking compute values to all attributes."""
    shared = _DespikeCommon(self.threshold, self.cutoff,
                            self.dis, data.domain)
    despiked = [attr.copy(compute_value=DespikeFeature(pos, shared))
                for pos, attr in enumerate(data.domain.attributes)]
    new_domain = Orange.data.Domain(despiked, data.domain.class_vars,
                                    data.domain.metas)
    return data.from_table(new_domain, data)
def __call__(self, data):
    """Reconstruct spectra using only the selected PCA components."""
    if data.domain != self.pca.pre_domain:
        data = data.from_table(self.pca.pre_domain, data)
    scores = self.pca.transform(data.X)
    if self.components is not None:
        # zero out the scores of every component that was not selected
        mask = np.ones(scores.shape[1])
        mask[self.components] = 0
        dropped = np.extract(mask, np.arange(scores.shape[1]))
        scores[:, dropped] = 0
    return self.pca.proj.inverse_transform(scores)
def __call__(self, data, variable):
    """Train an imputation model for *variable*; raise if the learner
    cannot handle the domain."""
    variable = data.domain[variable]
    class_domain = domain_with_class_var(data.domain, variable)
    if not self.learner.check_learner_adequacy(class_domain):
        raise ValueError(
            "`{}` doesn't support domain type".format(self.learner.name))
    training = data.from_table(class_domain, data)
    model = self.learner(training)
    assert model.domain.class_var == variable
    return variable.copy(
        compute_value=ReplaceUnknownsModel(variable, model))
def __call__(self, data):
    """Keep attributes inside [lowlim, highlim], or outside when inverse."""
    x = getx(data)

    def inside(v):
        return ((self.lowlim is None or self.lowlim <= v) and
                (self.highlim is None or v <= self.highlim))

    def outside(v):
        return ((self.lowlim is not None and v <= self.lowlim) or
                (self.highlim is not None and self.highlim <= v))

    keep = outside if self.inverse else inside
    kept = [at for at, v in zip(data.domain.attributes, x) if keep(v)]
    domain = Orange.data.Domain(kept, data.domain.class_vars,
                                metas=data.domain.metas)
    return data.from_table(domain, data)
def __call__(self, data):
    """Compute absorbance from single-channel or transmittance data."""
    if data.domain != self.domain:
        data = data.from_table(self.domain, data)
    if self.ref:
        # Calculate from single-channel data: A = log10(ref / sample)
        result = np.log10(self.ref.X / data.X)
    else:
        # Calculate from transmittance data: A = -log10(T)
        result = -np.log10(data.X)
    return result
def __call__(self, data):
    """Savitzky-Golay smooth each row; NaNs are interpolated, then restored."""
    if data.domain != self.domain:
        data = data.from_table(self.domain, data)
    xs, xsind, mon, X = _transform_to_sorted_features(data)
    X, nans = _nan_extend_edges_and_interpolate(xs[xsind], X)
    smoothed = savgol_filter(X, window_length=self.window,
                             polyorder=self.polyorder,
                             deriv=self.deriv, mode="nearest")
    # set NaNs where there were NaNs in the original array
    if nans is not None:
        smoothed[nans] = np.nan
    return _transform_back_to_features(xsind, mon, smoothed)
def __call__(self, data):
    """Compute transmittance from single-channel or absorbance data."""
    if data.domain != self.domain:
        data = data.from_table(self.domain, data)
    if self.ref:
        # Calculate from single-channel data: T = sample / ref
        result = data.X / self.ref.X
    else:
        # Calculate from absorbance data: T = 10 ** (-A)
        result = np.power(10, -data.X)
    return result
def __call__(self, data):
    """Normalize XAS spectra and append an 'edge_jump' meta column."""
    shared = _XASnormalizationCommon(self.edge, self.preedge_params,
                                     self.postedge_params, data.domain)
    new_attrs = [
        ContinuousVariable(name=var.name,
                           compute_value=XASnormalizationFeature(i, shared))
        for i, var in enumerate(data.domain.attributes)]
    edge_jump = ContinuousVariable(
        name='edge_jump',
        compute_value=XASnormalizationFeature(len(new_attrs), shared))
    new_metas = data.domain.metas + (edge_jump,)
    domain = Orange.data.Domain(new_attrs, data.domain.class_vars,
                                new_metas)
    return data.from_table(domain, data)
def __call__(self, data):
    """
    Apply an imputation method to the given data set. Returns a new
    data table with missing values replaced by their imputations.

    Parameters
    ----------
    data : Orange.data.Table
        An input data table.
    """
    imputer = self.method or impute.Average()
    imputed = [imputer(data, var) for var in data.domain.attributes]
    domain = Orange.data.Domain(imputed, data.domain.class_vars,
                                data.domain.metas)
    return data.from_table(domain, data)
def __call__(self, data):
    """PCA-denoise: project onto at most self.components and reconstruct."""
    if data and len(data.domain.attributes):
        # PCA cannot use more components than attributes or instances
        max_components = min(len(data.domain.attributes), len(data))
        pca = Orange.projection.PCA(
            n_components=min(max_components, self.components))(data)
        shared = _PCAReconstructCommon(pca)
        new_attrs = [at.copy(compute_value=PCADenoisingFeature(i, shared))
                     for i, at in enumerate(data.domain.attributes)]
    else:
        # FIXME we should have a warning here
        new_attrs = [at.copy() for at in data.domain.attributes]  # unknown values
    domain = Orange.data.Domain(new_attrs, data.domain.class_vars,
                                data.domain.metas)
    return data.from_table(domain, data)
def __call__(self, data):
    """Discretize continuous attributes; drop near-constant results."""
    kept = []
    for var in data.domain.attributes:
        if not is_continuous(var):
            kept.append(var)
            continue
        new_var = self.method(data, var)
        # discard variables that end up with fewer than two bins
        if new_var is not None and len(new_var.values) >= 2:
            kept.append(new_var)
    domain = Orange.data.Domain(
        kept, data.domain.class_vars, data.domain.metas)
    return data.from_table(domain, data)
def __call__(self, data):
    """Interpolate each row of *data* onto self.points; returns an array."""
    # convert to data domain if any conversion is possible,
    # otherwise we use the interpolator directly to make domains compatible
    if self.domain is not None and data.domain != self.domain \
            and any(at.compute_value for at in self.domain.attributes):
        data = data.from_table(self.domain, data)
    x = getx(data)
    # removing whole NaN columns from the data will effectively replace
    # NaNs that are not on the edges with interpolated values
    ys = data.X
    if self.handle_nans:
        x, ys = remove_whole_nan_ys(x, ys)  # relatively fast
    if len(x) == 0:
        # nothing to interpolate from: every output value is unknown
        return np.ones((len(data), len(self.points))) * np.nan
    interpfn = self.interpfn
    if interpfn is None:
        # pick an interpolator that can cope with any remaining NaNs
        if self.handle_nans and np.isnan(ys).any():
            if self.kind == "linear":
                interpfn = interp1d_with_unknowns_numpy
            else:
                interpfn = interp1d_with_unknowns_scipy
        else:
            interpfn = interp1d_wo_unknowns_scipy
    return interpfn(x, ys, self.points, kind=self.kind)
def __call__(self, data, ret=Value):
    """
    Predict for *data* and return value, probabilities, or both,
    depending on *ret* (Model.Value / Model.Probs / Model.ValueProbs).

    Raises ValueError for an invalid *ret* or when probabilities are
    requested for continuous class variables, and TypeError for an
    unsupported *data* type.
    """
    if not 0 <= ret <= 2:
        raise ValueError("invalid value of argument 'ret'")
    if (ret > 0
            and any(isinstance(v, Orange.data.ContinuousVariable)
                    for v in self.domain.class_vars)):
        raise ValueError("cannot predict continuous distributions")

    # Call the predictor
    if isinstance(data, np.ndarray):
        prediction = self.predict(np.atleast_2d(data))
    elif isinstance(data, scipy.sparse.csr.csr_matrix):
        prediction = self.predict(data)
    elif isinstance(data, Orange.data.Instance):
        if data.domain != self.domain:
            data = Orange.data.Instance(self.domain, data)
        prediction = self.predict_storage(data)
    elif isinstance(data, Orange.data.Table):
        if data.domain != self.domain:
            data = data.from_table(self.domain, data)
        prediction = self.predict_storage(data)
    else:
        raise TypeError("Unrecognized argument (instance of '{}')".format(
            type(data).__name__))

    # Parse the result into value and probs
    multitarget = len(self.domain.class_vars) > 1
    if isinstance(prediction, tuple):
        value, probs = prediction
    elif prediction.ndim == 1 + multitarget:
        value, probs = prediction, None
    elif prediction.ndim == 2 + multitarget:
        value, probs = None, prediction
    else:
        # BUG FIX: the message was passed as a second argument instead of
        # being %-formatted, so the error text was never interpolated.
        raise TypeError("model returned a %i-dimensional array"
                        % prediction.ndim)

    # Ensure that we have what we need to return
    if ret != Model.Probs and value is None:
        value = np.argmax(probs, axis=-1)
    if ret != Model.Value and probs is None:
        if multitarget:
            max_card = max(len(c.values) for c in self.domain.class_vars)
            probs = np.zeros(value.shape + (max_card,), float)
            for i, cvar in enumerate(self.domain.class_vars):
                probs[:, i, :], _ = bn.bincount(
                    np.atleast_2d(value[:, i]), max_card - 1)
        else:
            probs, _ = bn.bincount(np.atleast_2d(value),
                                   len(self.domain.class_var.values) - 1)
        if ret == Model.ValueProbs:
            return value, probs
        else:
            return probs

    # Return what we need to
    if ret == Model.Probs:
        return probs
    if isinstance(data, Orange.data.Instance) and not multitarget:
        value = Orange.data.Value(self.domain.class_var, value[0])
    if ret == Model.Value:
        return value
    else:  # ret == Model.ValueProbs
        return value, probs
def __call__(self, data):
    """Convert *data* into this preprocessor's stored domain."""
    target_domain = self.domain
    return data.from_table(target_domain, data)
def __call__(self, data):
    """Apply self.method to every attribute and rebuild the table."""
    transformed = [self.method(data, var)
                   for var in data.domain.attributes]
    new_domain = Orange.data.Domain(
        transformed, data.domain.class_vars, data.domain.metas)
    return data.from_table(new_domain, data)
def __call__(self, data):
    """Return the single projected feature column for *data*."""
    if data.domain != self.projection.pre_domain:
        data = data.from_table(self.projection.pre_domain, data)
    projected = self.projection.transform(data.X)
    return projected[:, self.feature]
def __call__(self, data):
    """Project *data* into the fitted PCA space."""
    if data.domain != self.pca.pre_domain:
        data = data.from_table(self.pca.pre_domain, data)
    matrix = data.X
    return self.pca.transform(matrix)
def append_columns(data, attributes=(), class_vars=(), metas=()):
    # type: (Orange.data.Table, ColSpec, ColSpec, ColSpec) -> Orange.data.Table
    """
    Append a set of columns to a data table.

    Parameters
    ----------
    data : Orange.data.Table
        Primary table.
    attributes : Sequence[Tuple[Orange.data.Variable], Sequence[float]]
        A Sequence of variable and column data tuples to append to the
        `data`.
    class_vars : Sequence[Tuple[Orange.data.Variable], Sequence[float]]
        A Sequence of variable and column data tuples to append to the
        `data`.
    metas : Sequence[Tuple[Orange.data.Variable], Sequence[float]]
        A Sequence of variable and column data tuples to append to the
        `data`.

    Returns
    -------
    data : Orange.data.Table
        A copy of the original `data` input extended with all columns
        from `attributes`, `class_vars`, `metas` parameters.

    Note
    ----
    All variables in the original and new columns should be distinct.
    """
    domain = data.domain
    new_attributes = tuple(map(itemgetter(0), attributes))
    new_class_vars = tuple(map(itemgetter(0), class_vars))
    new_metas = tuple(map(itemgetter(0), metas))
    # new variables are appended after the existing ones in each role
    new_domain = Orange.data.Domain(
        domain.attributes + new_attributes,
        domain.class_vars + new_class_vars,
        domain.metas + new_metas
    )

    def ascolumn(array, n):
        # type: (Sequence[float], int) -> numpy.ndarray
        # coerce a 1-D sequence into an (n, 1) column
        array = numpy.asarray(array)
        if array.ndim < 2:
            array = array.reshape((n, 1))
        return array

    N = len(data)
    attr_cols = [ascolumn(col, N) for _, col in attributes]
    class_cols = [ascolumn(col, N) for _, col in class_vars]
    meta_cols = [ascolumn(col, N) for _, col in metas]
    # conversion copies the existing columns; the appended variables'
    # columns are filled in below by position
    new_data = data.from_table(new_domain, data)
    for i, (var, col) in enumerate(zip(new_attributes, attr_cols),
                                   start=len(domain.attributes)):
        assert new_data.domain.attributes[i] is var
        new_data.X[:, i] = col.ravel()
    for i, (var, col) in enumerate(zip(new_class_vars, class_cols),
                                   start=len(domain.class_vars)):
        assert new_data.domain.class_vars[i] is var
        new_data._Y[:, i] = col.ravel()
    for i, (var, col) in enumerate(zip(new_metas, meta_cols),
                                   start=len(domain.metas)):
        assert new_data.domain.metas[i] is var
        new_data.metas[:, i] = col.ravel()
    return new_data