def __init__(self, X=None, Y=None, metas=None, domain=None, text_features=None):
    """Set up the corpus arrays, domain and text features.

    Every array that is not supplied is replaced by an empty
    ``(n_doc, 0)`` array, ``n_doc`` being the value returned by
    ``_check_arrays(X, Y, metas)``.

    Args:
        X (numpy.ndarray): attributes
        Y (numpy.ndarray): class variables
        metas (numpy.ndarray): meta attributes; e.g. text
        domain (Orange.data.domain): the domain for this Corpus
        text_features (list): meta attributes that are used for
            text mining. Infer them if None.
    """
    n_doc = _check_arrays(X, Y, metas)
    empty_shape = (n_doc, 0)

    if X is None:
        X = np.zeros(empty_shape)
    self.X = X

    if Y is None:
        Y = np.zeros(empty_shape)
    self.Y = Y

    if metas is None:
        metas = np.zeros(empty_shape)
    self.metas = metas

    self.W = np.zeros(empty_shape)
    self.domain = domain
    self.text_features = None  # list of text features for mining
    self._tokens = None
    self.attributes = {}

    # Text features can only be resolved when a domain is available.
    if domain is not None:
        if text_features is None:
            self._infer_text_features()
        else:
            self.set_text_features(text_features)

    Table._init_ids(self)
def extend_corpus(self, metadata, Y):
    """
    Append documents to corpus.

    Args:
        metadata (numpy.ndarray): Meta data
        Y (numpy.ndarray): Class variables

    Raises:
        ValueError: if the product of the dimensions of ``X`` is non-zero,
            i.e. ``X`` already holds data.
    """
    if np.prod(self.X.shape) != 0:
        # The two literals are concatenated implicitly; the trailing space
        # keeps "empty" and "while" from running together in the message.
        raise ValueError("Extending corpus only works when X is empty "
                         "while the shape of X is {}".format(self.X.shape))

    self.metas = np.vstack((self.metas, metadata))

    cv = self.domain.class_var
    # Register class values not yet known to the class variable; falsy
    # entries (e.g. None) are skipped by filter(None, ...).
    for val in set(filter(None, Y)):
        if val not in cv.values:
            cv.add_value(val)
    new_Y = np.array([cv.to_val(i) for i in Y])[:, None]
    self._Y = np.vstack((self._Y, new_Y))

    # X and W stay zero-column arrays, resized to the extended row count.
    self.X = self.W = np.zeros((self.metas.shape[0], 0))
    Table._init_ids(self)

    self._tokens = None  # invalidate tokens
def __init__(self, X=None, Y=None, metas=None, domain=None, text_features=None):
    """Set up the corpus arrays, domain and text features.

    Args:
        X (numpy.ndarray): attributes
        Y (numpy.ndarray): class variables
        metas (numpy.ndarray): meta attributes; e.g. text
        domain (Orange.data.domain): the domain for this Corpus
        text_features (list): meta attributes that are used for
            text mining. Infer them if None.
    """
    n_doc = _check_arrays(X, Y, metas)

    def _or_empty(array):
        # Substitute a fresh zero-column array with n_doc rows for None.
        return np.zeros((n_doc, 0)) if array is None else array

    self.X = _or_empty(X)
    self.Y = _or_empty(Y)
    self.metas = _or_empty(metas)
    self.W = np.zeros((n_doc, 0))
    self.domain = domain
    self.text_features = None  # list of text features for mining

    # Without a domain the text features are left unset.
    if domain is not None:
        if text_features is None:
            self._infer_text_features()
        else:
            self.set_text_features(text_features)

    Table._init_ids(self)
def extend_corpus(self, metadata, Y):
    """
    Append documents to corpus.

    Args:
        metadata (numpy.ndarray): Meta data
        Y (numpy.ndarray): Class variables

    Raises:
        ValueError: if the product of the dimensions of ``X`` is non-zero,
            i.e. ``X`` already holds data.
    """
    if np.prod(self.X.shape) != 0:
        # The two literals are concatenated implicitly; the trailing space
        # keeps "empty" and "while" from running together in the message.
        raise ValueError("Extending corpus only works when X is empty "
                         "while the shape of X is {}".format(self.X.shape))

    self.metas = np.vstack((self.metas, metadata))

    cv = self.domain.class_var
    # Register class values not yet known to the class variable; falsy
    # entries (e.g. None) are skipped by filter(None, ...).
    for val in set(filter(None, Y)):
        if val not in cv.values:
            cv.add_value(val)
    new_Y = np.array([cv.to_val(i) for i in Y])[:, None]
    self._Y = np.vstack((self._Y, new_Y))

    # X and W stay zero-column arrays, resized to the extended row count.
    self.X = self.W = np.zeros((self.metas.shape[0], 0))
    Table._init_ids(self)

    self._tokens = None  # invalidate tokens
    self._set_unique_titles()
def __init__(self, domain=None, X=None, Y=None, metas=None, W=None, text_features=None):
    """Set up the corpus arrays, domain and text-mining state.

    Args:
        domain (Orange.data.Domain): the domain for this Corpus
        X (numpy.ndarray): attributes
        Y (numpy.ndarray): class variables
        metas (numpy.ndarray): meta attributes; e.g. text
        W (numpy.ndarray): instance weights
        text_features (list): meta attributes that are used for
            text mining. Infer them if None.
    """
    n_doc = _check_arrays(X, Y, metas)
    empty_shape = (n_doc, 0)

    # prefer sparse (BoW compute values): a missing X, or one whose
    # ``size`` is falsy, is replaced by an empty CSR matrix.
    if X is None or not X.size:
        X = sp.csr_matrix(empty_shape)
    self.X = X

    if Y is None:
        Y = np.zeros(empty_shape)
    self.Y = Y

    if metas is None:
        metas = np.zeros(empty_shape)
    self.metas = metas

    if W is None:
        W = np.zeros(empty_shape)
    self.W = W

    self.domain = domain
    self.text_features = None  # list of text features for mining
    self._tokens = None
    self._dictionary = None
    self._ngrams_corpus = None
    self.ngram_range = (1, 1)
    self.attributes = {}
    self.pos_tags = None
    self.used_preprocessor = None  # required for compute values

    # Text features can only be resolved when a domain is available.
    if domain is not None:
        if text_features is None:
            self._infer_text_features()
        else:
            self.set_text_features(text_features)

    Table._init_ids(self)
def extend_corpus(self, metadata, Y):
    """
    Append documents to corpus.

    Args:
        metadata (numpy.ndarray): Meta data
        Y (numpy.ndarray): Class variables
    """
    self.metas = np.vstack((self.metas, metadata))

    cv = self.domain.class_var
    # Register class values not yet known to the class variable.
    for val in set(Y):
        if val not in cv.values:
            cv.add_value(val)
    new_Y = np.array([cv.to_val(i) for i in Y])[:, None]
    self._Y = np.vstack((self._Y, new_Y))

    # Size X and W from metas, which has just been extended. len(self)
    # is evaluated before X is replaced, so it may still report the
    # pre-extension row count.
    self.X = self.W = np.zeros((self.metas.shape[0], 0))
    Table._init_ids(self)
def extend_corpus(self, documents, metadata, class_values):
    """Append documents, their meta data and class values to the corpus.

    Args:
        documents: documents appended in place to ``self.documents``
        metadata (numpy.ndarray): meta rows stacked below ``self.metas``
        class_values: class value of each appended document
    """
    # TODO check if Domains match!
    self.metas = np.vstack((self.metas, metadata))
    self.documents += documents

    class_var = self.domain.class_var
    for label in set(class_values):
        if label in class_var.values:
            continue
        class_var.add_value(label)

    encoded = [class_var.to_val(label) for label in class_values]
    extra_Y = np.array(encoded)[:, np.newaxis]
    self._Y = np.vstack((self._Y, extra_Y))

    # X and W are zero-column arrays with one row per stored document.
    no_columns = np.zeros((len(self.documents), 0))
    self.X = no_columns
    self.W = no_columns
    Table._init_ids(self)
def __init__(self, domain=None, X=None, Y=None, metas=None, W=None, text_features=None, ids=None):
    """Set up the corpus arrays, domain and text-mining state.

    Arrays that are not supplied are replaced by empty ``(n_doc, 0)``
    arrays, ``n_doc`` being the value returned by
    ``_check_arrays(X, Y, metas)``.

    Args:
        domain (Orange.data.Domain): the domain for this Corpus
        X (numpy.ndarray): attributes
        Y (numpy.ndarray): class variables
        metas (numpy.ndarray): meta attributes; e.g. text
        W (numpy.ndarray): instance weights
        text_features (list): meta attributes that are used for
            text mining. Infer them if None.
        ids (numpy.ndarray): Indices
    """
    super().__init__()
    n_doc = _check_arrays(X, Y, metas)

    # NOTE(review): the collapsed source does not show where this block
    # ends; it is assumed to cover only the four array assignments --
    # confirm against the original layout.
    with self.unlocked_reference():
        self.X = X if X is not None else np.zeros((n_doc, 0))
        self.Y = Y if Y is not None else np.zeros((n_doc, 0))
        self.metas = metas if metas is not None else np.zeros((n_doc, 0))
        self.W = W if W is not None else np.zeros((n_doc, 0))
    self.domain = domain
    self.text_features = []  # list of text features for mining

    self._tokens = None
    self._dictionary = None
    self._ngrams_corpus = None
    self.ngram_range = (1, 1)
    self.attributes = {}
    self._pos_tags = None
    # Imported inside the method rather than at module level
    # (presumably to avoid a circular import -- TODO confirm).
    from orangecontrib.text.preprocess import PreprocessorList
    self.__used_preprocessor = PreprocessorList([])  # required for compute values
    self._titles: Optional[np.ndarray] = None
    self._pp_documents = None  # preprocessed documents

    # Text features are only resolved when a domain is given: inferred
    # when the caller passed none, otherwise set from the argument.
    if domain is not None and text_features is None:
        self._infer_text_features()
    elif domain is not None:
        self.set_text_features(text_features)

    # Use caller-supplied ids when present, else let Table initialise them.
    if ids is not None:
        self.ids = ids
    else:
        Table._init_ids(self)
    self._set_unique_titles()
def __init__(self, documents, X, Y, metas, domain):
    """Store the documents together with their data arrays and domain.

    Args:
        documents: the documents; their count sets the row count of the
            empty arrays created below
        X: attributes, or None for an empty ``(len(documents), 0)`` array
        Y: class variables, or None for an empty
            ``(len(documents), 0)`` array
        metas: meta attributes, stored as given
        domain: the domain, stored as given
    """
    self.documents = documents
    empty_shape = (len(documents), 0)

    self.X = np.zeros(empty_shape) if X is None else X
    self.Y = np.zeros(empty_shape) if Y is None else Y
    self.metas = metas
    self.W = np.zeros(empty_shape)
    self.domain = domain
    Table._init_ids(self)
def extend_corpus(self, documents, metadata, class_values, meta_vars):
    """Append documents, their meta data and class values to the corpus.

    Args:
        documents: documents appended in place to ``self.documents``
        metadata (numpy.ndarray): meta rows stacked below ``self.metas``
        class_values: class value of each appended document
        meta_vars: not used by this method
    """
    # TODO check if Domains match!
    self.metas = np.vstack((self.metas, metadata))
    self.documents += documents

    class_var = self.domain.class_var
    for label in set(class_values):
        if label in class_var.values:
            continue
        class_var.add_value(label)

    encoded = [class_var.to_val(label) for label in class_values]
    extra_Y = np.array(encoded)[:, np.newaxis]
    # NaN entries in the encoded column are replaced by 0.
    nan_mask = np.isnan(extra_Y)
    extra_Y[nan_mask] = 0
    self._Y = np.vstack((self._Y, extra_Y))

    # X and W are zero-column arrays with one row per stored document.
    no_columns = np.zeros((len(self.documents), 0))
    self.X = no_columns
    self.W = no_columns
    Table._init_ids(self)
def __init__(self, domain=None, X=None, Y=None, metas=None, W=None, text_features=None, ids=None):
    """Set up the corpus arrays, domain, ids and text-mining state.

    Args:
        domain (Orange.data.Domain): the domain for this Corpus
        X (numpy.ndarray): attributes
        Y (numpy.ndarray): class variables
        metas (numpy.ndarray): meta attributes; e.g. text
        W (numpy.ndarray): instance weights
        text_features (list): meta attributes that are used for
            text mining. Infer them if None.
        ids (numpy.ndarray): Indices
    """
    n_doc = _check_arrays(X, Y, metas)
    empty_shape = (n_doc, 0)

    # prefer sparse (BoW compute values)
    self.X = sp.csr_matrix(empty_shape) if X is None else X
    self.Y = np.zeros(empty_shape) if Y is None else Y
    self.metas = np.zeros(empty_shape) if metas is None else metas
    self.W = np.zeros(empty_shape) if W is None else W

    self.domain = domain
    self.text_features = None  # list of text features for mining
    self._tokens = None
    self._dictionary = None
    self._ngrams_corpus = None
    self.ngram_range = (1, 1)
    self.attributes = {}
    self.pos_tags = None
    self.used_preprocessor = None  # required for compute values

    # Text features can only be resolved when a domain is available.
    if domain is not None:
        if text_features is None:
            self._infer_text_features()
        else:
            self.set_text_features(text_features)

    # Caller-supplied ids win; otherwise Table initialises them.
    if ids is None:
        Table._init_ids(self)
    else:
        self.ids = ids
def __init__(self, relation):
    """Create a wrapper for fusion.Relation.

    Parameters:
    -----------
    relation:
        An instance of `skfusion.fusion.Relation`
    """
    self.relation = relation

    meta_vars, metas = self._create_metas(relation)
    self.metas = metas

    # _Y and W share one zero-column array with a row per relation row.
    no_columns = np.zeros((len(relation.data), 0))
    self._Y = no_columns
    self.W = no_columns

    # Use the relation's column names when present, else column indices.
    if relation.col_names is None:
        column_labels = range(relation.data.shape[1])
    else:
        column_labels = relation.col_names

    attributes = [ContinuousVariable(str(label)) for label in column_labels]
    self.domain = Domain(attributes, metas=meta_vars)
    Table._init_ids(self)
def extend_corpus(self, metadata, Y):
    """
    Append documents to corpus.

    Args:
        metadata (numpy.ndarray): Meta data
        Y (numpy.ndarray): Class variables
    """
    self.metas = np.vstack((self.metas, metadata))

    cv = self.domain.class_var
    # Register class values not yet known to the class variable.
    for val in set(Y):
        if val not in cv.values:
            cv.add_value(val)
    new_Y = np.array([cv.to_val(i) for i in Y])[:, None]
    self._Y = np.vstack((self._Y, new_Y))

    # Size X and W from metas, which has just been extended. len(self)
    # is evaluated before X is replaced, so it may still report the
    # pre-extension row count.
    self.X = self.W = np.zeros((self.metas.shape[0], 0))
    Table._init_ids(self)

    self._tokens = None  # invalidate tokens