コード例 #1
0
    def handle_languages(self):
        """
        Add or remove the language meta column, then send the corpus.

        When ``self.detect_languages`` is set, a new string meta variable is
        appended to the corpus. It is named "Language" (made unique against
        the names already present in the domain), flagged with the
        ``'language-feature'`` attribute and filled with
        ``self.corpus.languages``; languages are detected first if they have
        not been computed yet.

        Otherwise, the first meta variable flagged as ``'language-feature'``
        is removed from the corpus, if there is one.

        ``self.corpus`` (possibly ``None``) is always sent to the output.
        """
        corpus = self.corpus
        if corpus is not None:
            domain = corpus.domain
            if self.detect_languages:
                if corpus.languages is None:
                    corpus.detect_languages()

                # Use class_vars rather than class_var: class_var is None for
                # domains with several class variables, which would silently
                # drop them from the new domain while Y keeps its columns.
                taken_names = [
                    var.name for var in
                    (*domain.attributes, *domain.class_vars, *domain.metas)
                ]
                language_var = StringVariable(
                    get_unique_names(taken_names, "Language"))
                language_var.attributes.update({'language-feature': True})

                new_domain = Domain(attributes=domain.attributes,
                                    class_vars=domain.class_vars,
                                    metas=list(domain.metas) + [language_var])
                # one extra meta column holding the language of each document
                metas = np.hstack([
                    corpus.metas,
                    np.array(corpus.languages).reshape(-1, 1)
                ])
                self.corpus = Corpus(new_domain, corpus.X.copy(),
                                     corpus.Y.copy(), metas,
                                     corpus.W.copy(),
                                     copy(corpus.text_features))
            else:
                # index of the first meta flagged as the language feature
                lang_feat_idx = next(
                    (i for i, var in enumerate(domain.metas)
                     if var.attributes.get('language-feature')),
                    None)
                if lang_feat_idx is not None:
                    kept_metas = [var for i, var in enumerate(domain.metas)
                                  if i != lang_feat_idx]
                    new_domain = Domain(attributes=domain.attributes,
                                        class_vars=domain.class_vars,
                                        metas=kept_metas)
                    self.corpus = Corpus(
                        new_domain, corpus.X.copy(), corpus.Y.copy(),
                        np.delete(corpus.metas, lang_feat_idx, axis=1),
                        corpus.W.copy(), copy(corpus.text_features))
        self.Outputs.corpus.send(self.corpus)
コード例 #2
0
 def _rename_features(
         additional_names: List) -> Tuple[List, List, List]:
     """
     Return the current domain's variables, renamed if required.

     Reads ``self`` and ``rename_existing`` from the enclosing scope. When
     ``rename_existing`` is true, the existing variables are renamed so that
     their names are unique with respect to ``additional_names``; otherwise
     they are returned unchanged.

     Args:
         additional_names (list): Names of the features about to be added.

     Returns:
         tuple: ``(attributes, class_var, metas)`` where ``attributes`` and
         ``metas`` are lists and ``class_var`` is a single variable or None.
     """
     cur_attr = list(self.domain.attributes)
     cur_class = self.domain.class_var
     cur_meta = list(self.domain.metas)
     if rename_existing:
         current_vars = (cur_attr + ([cur_class] if cur_class else []) +
                         cur_meta)
         current_names = [a.name for a in current_vars]
         new_names = get_unique_names(additional_names,
                                      current_names,
                                      equal_numbers=False)
         renamed_vars = [
             var.renamed(n) for var, n in zip(current_vars, new_names)
         ]
         n_attr = len(cur_attr)
         n_meta = len(cur_meta)
         cur_attr = renamed_vars[:n_attr]
         cur_class = renamed_vars[n_attr] if cur_class else None
         # Slice from an explicit start index: `renamed_vars[-n_meta:]` would
         # return *every* variable when there are no metas (-0 == 0).
         cur_meta = renamed_vars[len(renamed_vars) - n_meta:]
     return cur_attr, cur_class, cur_meta
コード例 #3
0
ファイル: corpus.py プロジェクト: scoobiii/orange3-text
    def extend_attributes(
            self, X, feature_names, feature_values=None, compute_values=None,
            var_attrs=None, sparse=False, rename_existing=False
        ):
        """
        Append features to corpus. If `feature_values` argument is present,
        features will be Discrete else Continuous.

        Args:
            X (numpy.ndarray or scipy.sparse.csr_matrix): Features values to append
            feature_names (list): List of string containing feature names
            feature_values (list): A list of possible values for Discrete features.
            compute_values (list): Compute values for corresponding features.
            var_attrs (dict): Additional attributes appended to variable.attributes.
            sparse (bool): Whether the features should be marked as sparse.
            rename_existing (bool): When true and names are not unique rename
                existing features; if false rename new features

        Returns:
            Corpus: A new corpus whose X is the current X with the new columns
            appended; Y, metas, W and text_features are copied from this one.
        """
        def _rename_features(additional_names: List) -> Tuple[List, List, List]:
            """Return (attributes, class_vars, metas) of the current domain,
            renamed to avoid `additional_names` when `rename_existing` is set."""
            cur_attr = list(self.domain.attributes)
            # class_vars (not class_var) so that domains with several class
            # variables keep all of them in step with the columns of Y
            cur_class = list(self.domain.class_vars)
            cur_meta = list(self.domain.metas)
            if rename_existing:
                current_vars = cur_attr + cur_class + cur_meta
                current_names = [a.name for a in current_vars]
                new_names = get_unique_names(
                    additional_names, current_names, equal_numbers=False
                )
                renamed_vars = [
                    var.renamed(n) for var, n in zip(current_vars, new_names)
                ]
                # Split by explicit offsets; a negative-length slice such as
                # `renamed_vars[-len(cur_meta):]` returns the whole list when
                # there are no metas.
                class_start = len(cur_attr)
                meta_start = class_start + len(cur_class)
                cur_attr = renamed_vars[:class_start]
                cur_class = renamed_vars[class_start:meta_start]
                cur_meta = renamed_vars[meta_start:]
            return cur_attr, cur_class, cur_meta

        if sp.issparse(self.X) or sp.issparse(X):
            X = sp.hstack((self.X, X)).tocsr()
        else:
            X = np.hstack((self.X, X))

        # one (possibly empty) entry per new feature
        if compute_values is None:
            compute_values = [None] * len(feature_names)
        if feature_values is None:
            feature_values = [None] * len(feature_names)

        # rename existing variables if required
        curr_attributes, curr_class_vars, curr_metas = _rename_features(
            feature_names
        )
        if not rename_existing:
            # rename new feature names if required
            feature_names = get_unique_names(
                self.domain, feature_names, equal_numbers=False
            )

        additional_attributes = []
        for f, values, cv in zip(feature_names, feature_values, compute_values):
            if values is not None:
                var = DiscreteVariable(f, values=values, compute_value=cv)
            else:
                var = ContinuousVariable(f, compute_value=cv)
            var.sparse = sparse     # don't pass this to constructor so this works with Orange < 3.8.0
            if cv is not None:      # set original variable for cv
                cv.variable = var
            if isinstance(var_attrs, dict):
                var.attributes.update(var_attrs)
            additional_attributes.append(var)

        new_domain = Domain(
            attributes=curr_attributes + additional_attributes,
            class_vars=curr_class_vars,
            metas=curr_metas
        )
        return Corpus(
            new_domain,
            X,
            self.Y.copy(),
            self.metas.copy(),
            self.W.copy(),
            copy(self.text_features)
        )