def _setup_corpus(self, text_features: List[Variable] = None) -> None:
    """
    Parameters
    ----------
    text_features
        meta attributes that are used for text mining. Infer them if None.
    """
    self.text_features = []  # list of text features for mining
    self._tokens = None
    self._dictionary = None
    self.ngram_range = (1, 1)
    self.attributes = {}
    self._pos_tags = None
    from orangecontrib.text.preprocess import PreprocessorList
    self.__used_preprocessor = PreprocessorList([])  # required for compute values
    self._titles: Optional[np.ndarray] = None
    self._pp_documents = None  # preprocessed documents

    if text_features is None:
        self._infer_text_features()
    else:
        self.set_text_features(text_features)

    self._set_unique_titles()
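# A minimal usage sketch of the two paths above, assuming orange3-text is
# installed; "book-excerpts" ships with the add-on and stores its text in a
# string meta named "Text" (an assumption about the column name):
from orangecontrib.text import Corpus

corpus = Corpus.from_file("book-excerpts")
# Loading takes the text_features=None path, so the text meta is inferred;
# an explicit choice goes through set_text_features():
corpus.set_text_features([corpus.domain["Text"]])
print(corpus.text_features)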
def used_preprocessor(self, pp):
    from orangecontrib.text.preprocess import PreprocessorList, Preprocessor
    if isinstance(pp, PreprocessorList):
        self.__used_preprocessor = PreprocessorList(list(pp.preprocessors))
    elif isinstance(pp, Preprocessor):
        self.__used_preprocessor.preprocessors.append(pp)
    else:
        raise NotImplementedError
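# Hedged sketch of the two setter branches above, assuming this is the
# Corpus.used_preprocessor property setter from orangecontrib.text: a
# PreprocessorList replaces the stored copy, a single Preprocessor appends.
from orangecontrib.text import Corpus
from orangecontrib.text.preprocess import LowercaseTransformer, PreprocessorList

corpus = Corpus.from_file("book-excerpts")
corpus.used_preprocessor = PreprocessorList([LowercaseTransformer()])  # replace
corpus.used_preprocessor = LowercaseTransformer()                      # append
print(len(corpus.used_preprocessor.preprocessors))  # -> 2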
def _base_tokens(self):
    from orangecontrib.text.preprocess import BASE_TRANSFORMER, \
        BASE_TOKENIZER, PreprocessorList

    # don't use anything that requires NLTK data, so the async download
    # does not have to be awaited
    base_preprocessors = PreprocessorList([BASE_TRANSFORMER, BASE_TOKENIZER])
    corpus = base_preprocessors(self)
    return corpus.tokens, corpus.dictionary
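# What _base_tokens computes, sketched from the outside (assumes
# BASE_TRANSFORMER and BASE_TOKENIZER are the NLTK-free defaults exported
# by orangecontrib.text.preprocess, as imported above):
from orangecontrib.text import Corpus
from orangecontrib.text.preprocess import (
    BASE_TOKENIZER, BASE_TRANSFORMER, PreprocessorList
)

corpus = Corpus.from_file("book-excerpts")
base = PreprocessorList([BASE_TRANSFORMER, BASE_TOKENIZER])(corpus)
print(base.tokens[0][:5])    # tokens of the first document
print(len(base.dictionary))  # dictionary built over all tokens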
def set_corpus(self, data=None):
    self.corpus = data
    # create preprocessed corpus upon setting data to avoid preprocessing
    # at each method run
    pp_list = [
        preprocess.LowercaseTransformer(),
        preprocess.WordPunctTokenizer()
    ]
    self.pp_corpus = PreprocessorList(pp_list)(self.corpus)
    self.commit()
def test_used_preprocessors(self):
    corpus1 = self.corpus.copy()
    for pp in self.pp_list:
        corpus1 = pp(corpus1)

    self.assertEqual(len(self.corpus.used_preprocessor.preprocessors), 0)
    self.assertEqual(len(corpus1.used_preprocessor.preprocessors), 5)
    self.assertEqual(
        [8, 10, 6, 8, 9, 7, 7, 10, 4], list(map(len, corpus1._tokens))
    )

    corpus2 = PreprocessorList(self.pp_list)(self.corpus)
    self.assertEqual(corpus1, corpus2)
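# The assertions above rely on a five-element self.pp_list fixture that is
# not shown here; a purely hypothetical composition consistent with that
# count (names from orangecontrib.text.preprocess) could be:
#     self.pp_list = [
#         preprocess.LowercaseTransformer(),
#         preprocess.WordPunctTokenizer(),
#         preprocess.SnowballStemmer(),
#         preprocess.StopwordsFilter(),
#         preprocess.FrequencyFilter(),
#     ]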
def __init__(self, domain=None, X=None, Y=None, metas=None, W=None,
             text_features=None, ids=None):
    """
    Args:
        domain (Orange.data.Domain): the domain for this Corpus
        X (numpy.ndarray): attributes
        Y (numpy.ndarray): class variables
        metas (numpy.ndarray): meta attributes; e.g. text
        W (numpy.ndarray): instance weights
        text_features (list): meta attributes that are used for
            text mining. Infer them if None.
        ids (numpy.ndarray): Indices
    """
    super().__init__()
    n_doc = _check_arrays(X, Y, metas)

    with self.unlocked_reference():
        self.X = X if X is not None else np.zeros((n_doc, 0))
        self.Y = Y if Y is not None else np.zeros((n_doc, 0))
        self.metas = metas if metas is not None else np.zeros((n_doc, 0))
        self.W = W if W is not None else np.zeros((n_doc, 0))
        self.domain = domain

    self.text_features = []  # list of text features for mining
    self._tokens = None
    self._dictionary = None
    self._ngrams_corpus = None
    self.ngram_range = (1, 1)
    self.attributes = {}
    self._pos_tags = None
    from orangecontrib.text.preprocess import PreprocessorList
    self.__used_preprocessor = PreprocessorList([])  # required for compute values
    self._titles: Optional[np.ndarray] = None
    self._pp_documents = None  # preprocessed documents

    if domain is not None and text_features is None:
        self._infer_text_features()
    elif domain is not None:
        self.set_text_features(text_features)

    if ids is not None:
        self.ids = ids
    else:
        Table._init_ids(self)

    self._set_unique_titles()
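# Hedged construction sketch for the __init__ above, assuming direct
# instantiation with the signature shown is supported (Corpus subclasses
# Orange.data.Table; the raw text lives in a StringVariable meta):
import numpy as np
from Orange.data import Domain, StringVariable
from orangecontrib.text import Corpus

text_var = StringVariable("text")
domain = Domain([], metas=[text_var])
metas = np.array([["first document"], ["second document"]], dtype=object)
corpus = Corpus(domain, metas=metas, text_features=[text_var])
print(len(corpus), corpus.text_features)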
def set_corpus(self, data=None):
    self.corpus = data
    self.pp_corpus = None
    if self.corpus is not None:
        if not self.corpus.has_tokens():
            # create preprocessed corpus upon setting data to avoid
            # preprocessing at each method run
            pp_list = [
                preprocess.LowercaseTransformer(),
                preprocess.WordPunctTokenizer()
            ]
            self.pp_corpus = PreprocessorList(pp_list)(self.corpus)
        else:
            self.pp_corpus = self.corpus
    self.commit.now()
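# Quick check of the has_tokens() guard used above (assumes orange3-text):
from orangecontrib.text import Corpus
from orangecontrib.text.preprocess import WordPunctTokenizer

corpus = Corpus.from_file("book-excerpts")
print(corpus.has_tokens())     # False: nothing has been tokenized yet
tokenized = WordPunctTokenizer()(corpus)
print(tokenized.has_tokens())  # True: preprocessing can be skipped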
def preprocess_only_words(corpus: Corpus) -> Corpus:
    """
    Apply the preprocessor that splits words, transforms them to lower case
    (and removes punctuation).

    Parameters
    ----------
    corpus
        Corpus on which the preprocessor will be applied.

    Returns
    -------
    Preprocessed corpus. The result of preprocessing is saved in
    tokens/ngrams.
    """
    p = PreprocessorList(
        [LowercaseTransformer(),
         # by default, the regexp keeps only words (no punctuation, no spaces)
         RegexpTokenizer()]
    )
    return p(corpus)
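# Example call of preprocess_only_words as defined above ("book-excerpts"
# ships with orange3-text; any corpus with a text meta would do):
from orangecontrib.text import Corpus

corpus = Corpus.from_file("book-excerpts")
processed = preprocess_only_words(corpus)
print(processed.tokens[0][:10])  # lower-cased, punctuation-free tokens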
def test_result(self):
    pp = PreprocessorList([BASE_TRANSFORMER, RegexpTokenizer()])
    corpus = pp(Corpus.from_file("book-excerpts")[::3])
    vect = BowVectorizer()
    corpus_vect = vect.transform(corpus)
    words = ["beheld", "events", "dragged", "basin", "visit", "have"]
    d = Domain([corpus_vect.domain[w] for w in words])
    corpus_vect = corpus_vect.transform(d)

    self.send_signal(self.widget.Inputs.data, corpus_vect)
    self.send_signal(self.widget.Inputs.selected_data, corpus_vect[:1])
    self.wait_until_finished(timeout=100000)

    np.testing.assert_array_almost_equal(
        self.widget.results.p_values,
        [0.02128, 1, 0.04255, 0.06383, 0.08511, 0.97872],
        decimal=5,
    )
    np.testing.assert_array_almost_equal(
        self.widget.results.fdr_values,
        [0.12766, 1, 0.12766, 0.12766, 0.12766, 1],
        decimal=5,
    )
def test_apply_preprocessors(self):
    corpus = PreprocessorList(self.pp_list)(self.corpus)

    self.assertEqual(
        [8, 10, 6, 8, 9, 7, 7, 10, 4], list(map(len, corpus._tokens))
    )
    self.assertEqual(len(corpus.used_preprocessor.preprocessors), 5)