def handle_languages(self):
    """Add or remove the language meta column, then send the corpus.

    When ``self.detect_languages`` is set, languages are detected (unless
    ``self.corpus.languages`` is already populated) and appended as a new
    string meta variable flagged with the ``'language-feature'`` attribute.
    Otherwise the first meta variable carrying a truthy
    ``'language-feature'`` attribute, if any, is removed together with its
    data column. Whenever the metas change, ``self.corpus`` is replaced by
    a new ``Corpus`` built from copies of the remaining data.
    """
    corpus = self.corpus
    if corpus is not None:
        domain = corpus.domain
        meta_vars = None  # stays None when the corpus needs no rebuilding
        meta_data = None

        if self.detect_languages:
            if corpus.languages is None:
                corpus.detect_languages()

            # The new column's name must not clash with any existing variable.
            taken_names = [
                var.name
                for var in (list(domain.attributes)
                            + list(domain.class_vars)
                            + list(domain.metas))
            ]
            language_var = StringVariable(
                get_unique_names(taken_names, "Language"))
            # Flag the variable so that it can be found and removed later.
            language_var.attributes.update({'language-feature': True})

            meta_vars = list(domain.metas) + [language_var]
            meta_data = np.hstack([
                corpus.metas,
                np.array(corpus.languages).reshape(-1, 1)
            ])
        else:
            lang_feat_idx = next(
                (i for i, var in enumerate(domain.metas)
                 if var.attributes.get('language-feature')),
                None
            )
            if lang_feat_idx is not None:
                # Filter the variables directly; there is no need to
                # round-trip Variable objects through a numpy array.
                meta_vars = [var for i, var in enumerate(domain.metas)
                             if i != lang_feat_idx]
                meta_data = np.delete(corpus.metas, lang_feat_idx, axis=1)

        if meta_vars is not None:
            # ``class_vars`` (plural) keeps every class variable;
            # ``class_var`` is None when there is more than one, which
            # would drop them from the domain while Y keeps its columns.
            new_domain = Domain(attributes=domain.attributes,
                                class_vars=domain.class_vars,
                                metas=meta_vars)
            self.corpus = Corpus(new_domain,
                                 corpus.X.copy(),
                                 corpus.Y.copy(),
                                 meta_data,
                                 corpus.W.copy(),
                                 copy(corpus.text_features))

    # NOTE(review): the corpus is sent unconditionally, so a missing corpus
    # propagates None downstream -- confirm this matches the intended flow.
    self.Outputs.corpus.send(self.corpus)
def _rename_features(additional_names: List) -> Tuple[List, List, List]:
    """Return the current domain's variables, renamed if required.

    ``self`` and ``rename_existing`` are not parameters; they are read
    from the enclosing scope.

    Args:
        additional_names: Names of the features about to be added. When
            ``rename_existing`` is true, existing variables are renamed so
            that their names do not clash with these.

    Returns:
        A tuple ``(attributes, class_var, metas)`` where ``attributes`` and
        ``metas`` are lists of variables and ``class_var`` is a single
        variable or ``None``.
    """
    cur_attr = list(self.domain.attributes)
    cur_class = self.domain.class_var
    cur_meta = list(self.domain.metas)
    if rename_existing:
        class_part = [cur_class] if cur_class is not None else []
        current_vars = cur_attr + class_part + cur_meta
        current_names = [var.name for var in current_vars]
        new_names = get_unique_names(
            additional_names, current_names, equal_numbers=False
        )
        renamed_vars = [
            var.renamed(name) for var, name in zip(current_vars, new_names)
        ]
        # Split with explicit offsets. Slicing the tail with
        # ``[-len(cur_meta):]`` becomes ``[0:]`` when there are no metas
        # and would return every variable as a meta.
        n_attr = len(cur_attr)
        meta_start = n_attr + len(class_part)
        cur_attr = renamed_vars[:n_attr]
        cur_class = renamed_vars[n_attr] if class_part else None
        cur_meta = renamed_vars[meta_start:]
    return cur_attr, cur_class, cur_meta
def extend_attributes(
        self, X, feature_names, feature_values=None, compute_values=None,
        var_attrs=None, sparse=False, rename_existing=False
):
    """
    Append features to corpus. If `feature_values` argument is present,
    features will be Discrete else Continuous.

    Args:
        X (numpy.ndarray or scipy.sparse.csr_matrix): Features values to
            append
        feature_names (list): List of string containing feature names
        feature_values (list): A list of possible values for Discrete
            features.
        compute_values (list): Compute values for corresponding features.
        var_attrs (dict): Additional attributes appended to
            variable.attributes.
        sparse (bool): Whether the features should be marked as sparse.
        rename_existing (bool): When true and names are not unique rename
            existing features; if false rename new features

    Returns:
        Corpus: A new corpus whose X is this corpus' X with the new columns
        appended; Y, metas and W are copied from this corpus.
    """
    def _rename_features(additional_names: List) -> Tuple[List, List, List]:
        """Return (attributes, class_vars, metas), renamed if requested."""
        attrs = list(self.domain.attributes)
        # Use ``class_vars`` so that domains with several class variables
        # keep all of them (``class_var`` is None in that case).
        class_vars = list(self.domain.class_vars)
        metas = list(self.domain.metas)
        if rename_existing:
            existing = attrs + class_vars + metas
            unique_names = get_unique_names(
                additional_names,
                [var.name for var in existing],
                equal_numbers=False
            )
            renamed = [
                var.renamed(name)
                for var, name in zip(existing, unique_names)
            ]
            # Split with explicit offsets. A ``[-len(metas):]`` tail slice
            # becomes ``[0:]`` for a corpus without metas and would return
            # every variable as a meta.
            n_attr = len(attrs)
            meta_start = n_attr + len(class_vars)
            attrs = renamed[:n_attr]
            class_vars = renamed[n_attr:meta_start]
            metas = renamed[meta_start:]
        return attrs, class_vars, metas

    # One default entry per *new* feature; sized before X is stacked.
    n_new = len(feature_names)
    if compute_values is None:
        compute_values = [None] * n_new
    if feature_values is None:
        feature_values = [None] * n_new

    if sp.issparse(self.X) or sp.issparse(X):
        X = sp.hstack((self.X, X)).tocsr()
    else:
        X = np.hstack((self.X, X))

    # rename existing variables if required
    curr_attributes, curr_class_vars, curr_metas = _rename_features(
        feature_names
    )
    if not rename_existing:
        # rename new feature names if required
        feature_names = get_unique_names(
            self.domain, feature_names, equal_numbers=False
        )

    additional_attributes = []
    for name, values, cv in zip(
            feature_names, feature_values, compute_values):
        if values is not None:
            var = DiscreteVariable(name, values=values, compute_value=cv)
        else:
            var = ContinuousVariable(name, compute_value=cv)
        # don't pass this to constructor so this works with Orange < 3.8.0
        var.sparse = sparse
        if cv is not None:  # set original variable for cv
            cv.variable = var
        if isinstance(var_attrs, dict):
            var.attributes.update(var_attrs)
        additional_attributes.append(var)

    new_domain = Domain(
        attributes=curr_attributes + additional_attributes,
        class_vars=curr_class_vars,
        metas=curr_metas
    )
    return Corpus(
        new_domain,
        X,
        self.Y.copy(),
        self.metas.copy(),
        self.W.copy(),
        copy(self.text_features)
    )