def set_tokens(self):
    """Tokenize the current corpus and refresh the token statistics.

    With no corpus loaded, ``self.tokens`` is set to ``None`` and the
    counters ``self.n_tokens`` / ``self.n_types`` are left untouched.
    Otherwise the corpus is run through a ``WordPunctTokenizer`` and:

    * ``self.tokens``   -- the ``tokens`` attribute of the tokenized corpus,
    * ``self.n_tokens`` -- total number of tokens over all documents,
    * ``self.n_types``  -- number of distinct tokens over all documents.
    """
    corpus = self.corpus
    if corpus is not None:
        tokenized = WordPunctTokenizer()(corpus)
        self.tokens = tokenized.tokens
        # presumably one token sequence per document -- confirm against Corpus
        self.n_tokens = sum(len(doc) for doc in self.tokens)
        self.n_types = len({token for doc in self.tokens for token in doc})
    else:
        self.tokens = None
def set_tokens(self):
    """Tokenize the corpus documents and refresh the token statistics.

    With no corpus loaded, ``self.tokens`` is set to ``None`` and the
    counters are left as they were. Otherwise ``self.corpus.documents`` is
    passed to a ``WordPunctTokenizer`` and the result is stored in
    ``self.tokens``; ``self.n_tokens`` becomes the summed length of all
    entries and ``self.n_types`` the number of distinct tokens.
    """
    if self.corpus is not None:
        tokenize = WordPunctTokenizer()
        per_document = tokenize(self.corpus.documents)
        self.tokens = per_document

        # assumes the tokenizer yields one token sequence per document
        # -- TODO confirm
        self.n_tokens = sum(len(doc) for doc in per_document)

        vocabulary = set()
        for doc in per_document:
            vocabulary.update(doc)
        self.n_types = len(vocabulary)
    else:
        self.tokens = None
def _compute_indices(self): # type: () -> Optional[None, list] if self.corpus is None: self.indices = None return if self.corpus and not self.corpus.has_tokens(): preprocessor = Preprocessor(tokenizer=WordPunctTokenizer()) preprocessor(self.corpus) self.indices = [ConcordanceIndex(doc, key=lambda x: x.lower()) for doc in self.corpus.tokens]
def transform(self, corpus):
    """Score every document and append the scores to the corpus.

    The corpus is tokenized with a ``WordPunctTokenizer``. For each document
    the score is ``100 * (positive - negative) / max(len(doc), 1)``, where
    ``positive`` / ``negative`` count the tokens found in ``self.positive``
    and ``self.negative``; the ``max`` guards against empty documents.

    Args:
        corpus: the corpus to score; it is passed to the tokenizer as-is.

    Returns:
        The result of ``extend_attributes`` on the tokenized corpus, called
        with the score matrix, ``self.sentiments`` and one compute value per
        sentiment.
    """
    corpus = WordPunctTokenizer()(corpus)

    def score(tokens):
        # Net share of positive words, as a percentage of document length.
        n_positive = sum(1 for token in tokens if token in self.positive)
        n_negative = sum(1 for token in tokens if token in self.negative)
        length = max(len(tokens), 1)
        return 100 * (n_positive - n_negative) / length

    rows = [[score(tokens)] for tokens in corpus.tokens]
    # NOTE(review): each document yields a single score, so this reshape
    # appears to assume ``self.sentiments`` has one entry -- confirm.
    X = np.array(rows).reshape((-1, len(self.sentiments)))

    # set compute values
    shared_cv = SharedTransform(self)
    cv = []
    for column in self.sentiments:
        cv.append(VectorizationComputeValue(shared_cv, column))

    return corpus.extend_attributes(X, self.sentiments, compute_values=cv)
from orangecontrib.text.preprocess import (FrequencyFilter, LowercaseTransformer, WordPunctTokenizer) __all__ = ['Preprocessor', 'base_preprocessor'] # don't use anything that requires NLTK data to assure async download BASE_TOKENIZER = WordPunctTokenizer() BASE_TRANSFORMERS = [LowercaseTransformer()] class Preprocessor: """Holds document processing objects. Attributes: transformers (List([BaseTransformer]): transforms strings tokenizer (BaseTokenizer): tokenizes string normalizer (BaseNormalizer): normalizes tokens filters (List[BaseTokenFilter]): filters unneeded tokens """ def __init__(self, transformers=None, tokenizer=None, normalizer=None, filters=None, ngrams_range=None, pos_tagger=None): if callable(transformers): transformers = [transformers]