Example #1
 def set_tokens(self):
     if self.corpus is None:
         self.tokens = None
         return
     self.tokens = WordPunctTokenizer()(self.corpus).tokens
     self.n_tokens = sum(map(len, self.tokens))
     self.n_types = len(set(chain.from_iterable(self.tokens)))
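For context (not part of the example above): the Orange WordPunctTokenizer is a thin wrapper around NLTK's WordPunctTokenizer, which splits text into alternating runs of word characters and punctuation. A minimal sketch of that underlying behaviour:

from nltk.tokenize import WordPunctTokenizer

# NLTK's tokenizer, which the Orange wrapper delegates to
tokens = WordPunctTokenizer().tokenize("Good muffins cost $3.88 in New York.")
# ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.']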
Example #2
 def set_tokens(self):
     if self.corpus is None:
         self.tokens = None
         return
     tokenizer = WordPunctTokenizer()
     self.tokens = tokenizer(self.corpus.documents)
     self.n_tokens = sum(map(len, self.tokens))
     self.n_types = len(set(chain.from_iterable(self.tokens)))
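As an aside, the two counts at the end of Examples #1 and #2 can be illustrated on a toy token list (sketch only, not from the source):

from itertools import chain

tokens = [["the", "cat", "sat"], ["the", "dog"]]
n_tokens = sum(map(len, tokens))                  # 5 tokens in total
n_types = len(set(chain.from_iterable(tokens)))   # 4 distinct word types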
Example #3
 def _compute_indices(self):  # type: () -> Optional[list]
     if self.corpus is None:
         self.indices = None
         return
     if self.corpus and not self.corpus.has_tokens():
         preprocessor = Preprocessor(tokenizer=WordPunctTokenizer())
         preprocessor(self.corpus)
     self.indices = [ConcordanceIndex(doc, key=lambda x: x.lower())
                     for doc in self.corpus.tokens]
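For reference, ConcordanceIndex here is NLTK's: it records the positions at which each (case-folded) token occurs, so concordance lines can be looked up later. A minimal sketch:

from nltk.text import ConcordanceIndex

index = ConcordanceIndex(["The", "cat", "sat", "on", "the", "mat"],
                         key=lambda x: x.lower())
index.offsets("the")   # [0, 4]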
Example #4
    def transform(self, corpus):
        scores = []
        corpus = WordPunctTokenizer()(corpus)

        for doc in corpus.tokens:
            pos_words = sum(word in self.positive for word in doc)
            neg_words = sum(word in self.negative for word in doc)
            scores.append([100 * (pos_words - neg_words) / max(len(doc), 1)])
        X = np.array(scores).reshape((-1, len(self.sentiments)))

        # set compute values so the transformation can be re-applied to new data
        shared_cv = SharedTransform(self)
        cv = [
            VectorizationComputeValue(shared_cv, col)
            for col in self.sentiments
        ]

        corpus = corpus.extend_attributes(X,
                                          self.sentiments,
                                          compute_values=cv)
        return corpus
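The per-document score computed above is the difference between positive and negative token counts, expressed as a percentage of document length. For illustration only (toy lexicons, not from the source):

doc = ["good", "movie", "but", "bad", "ending"]
positive, negative = {"good"}, {"bad"}
pos_words = sum(word in positive for word in doc)          # 1
neg_words = sum(word in negative for word in doc)          # 1
score = 100 * (pos_words - neg_words) / max(len(doc), 1)   # 0.0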
Example #5
from orangecontrib.text.preprocess import (FrequencyFilter,
                                           LowercaseTransformer,
                                           WordPunctTokenizer)

__all__ = ['Preprocessor', 'base_preprocessor']

# don't use anything that requires NLTK data, to ensure async download
BASE_TOKENIZER = WordPunctTokenizer()
BASE_TRANSFORMERS = [LowercaseTransformer()]


class Preprocessor:
    """Holds document processing objects.

    Attributes:
        transformers (List[BaseTransformer]): transforms strings
        tokenizer (BaseTokenizer): tokenizes string
        normalizer (BaseNormalizer): normalizes tokens
        filters (List[BaseTokenFilter]): filters unneeded tokens
    """
    def __init__(self,
                 transformers=None,
                 tokenizer=None,
                 normalizer=None,
                 filters=None,
                 ngrams_range=None,
                 pos_tagger=None):

        if callable(transformers):
            transformers = [transformers]