Example #1
 def __init__(self,
              dataset: Optional[List[str]] = None,
              min_term_frequency: float = 1,
              max_document_frequency: float = 0,
              lowercase: bool = False,
              tokenizer_strategy: TokenizerStrategy = TokenizerStrategy.NLTK_BASE,
              tokenizer_language: str = 'english'):
     """
     :param dataset: corpus used to calculate vocabulary
     :param min_term_frequency: min number of occurrences of word across whole corpus, if a value in the interval
     of ]0,1[ is used, this is considered as a fraction of the total word count, otherwise absolute count
     :param max_document_frequency: maximum number of documents the word is allowed to appear on average
     (multiple occurrences within a single document count too);
     if a value in the interval of ]0,1[ is used, this is considered as a fraction of the total document count
     :param lowercase: convert all texts to lowercase
     :param tokenizer_strategy: strategy used to obtain individual words
     :param tokenizer_language: text-language used at tokenization
     """
     # normalize None to an empty list to avoid a mutable default argument
     dataset = dataset if dataset is not None else []
     self._dataset = dataset
     self._min_tf = min_term_frequency
     self._max_df = max_document_frequency
     self.min_tf = 1
     self.max_df = None
     self.vocabulary = {}
     self._lowercase = lowercase
     self._tokenizer_language = tokenizer_language
     self._tokenizer_strategy = tokenizer_strategy
     self._cva = CountVectorizerAnalyzer(dataset,
                                         lowercase=lowercase,
                                         language=tokenizer_language,
                                         strategy=tokenizer_strategy)
     super().__init__()
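
A minimal usage sketch of the two thresholds (the corpus and values below are illustrative, not from the source): fractional values in ]0,1[ are scaled by corpus totals, while anything else is taken as an absolute count.

corpus = ['a small illustrative corpus', 'with two documents']
# keep words that make up at least 1% of all tokens, and drop words that
# appear (on average) in more than half of the documents
simplifier = VocabularyWordSimplifier(dataset=corpus,
                                      min_term_frequency=0.01,
                                      max_document_frequency=0.5)
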
Example #2
 def load_parameters(self, dataset: List[str], **kwargs):
     self._dataset = dataset
     self._cva = CountVectorizerAnalyzer(dataset,
                                         lowercase=self._lowercase,
                                         language=self._tokenizer_language,
                                         strategy=self._tokenizer_strategy)
     super().load_parameters(**kwargs)
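
A hedged sketch of re-fitting, assuming load_parameters is the hook a pipeline calls to swap in a new corpus (both corpora below are illustrative): it rebuilds the internal CountVectorizerAnalyzer with the stored tokenizer settings, so later statistics reflect the new dataset.

simplifier = VocabularyWordSimplifier(dataset=['old corpus text'])
# replace the corpus; the analyzer is rebuilt with the same tokenizer settings
simplifier.load_parameters(dataset=['fresh corpus text', 'another document'])
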
Example #3
 def test_nltk_base_vectorizer_batched(self):
     cva = CountVectorizerAnalyzer(texts,
                                   strategy=TokenizerStrategy.NLTK_BASE)
     bm = cva.extract_batch_metrics()
     # two question marks in the third sentence
     feature_names = cva.count_vectorizer.get_feature_names_out().tolist()
     self.assertEqual(2, bm[2, feature_names.index('?')])
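
A short inspection sketch, reusing the module-level texts corpus the tests rely on and assuming extract_batch_metrics() returns a documents-by-features matrix, as the indexing in the test above implies:

cva = CountVectorizerAnalyzer(texts, strategy=TokenizerStrategy.NLTK_BASE)
bm = cva.extract_batch_metrics()
features = cva.count_vectorizer.get_feature_names_out().tolist()
# per-document count of the '?' token
for doc_idx in range(len(texts)):
    print(doc_idx, bm[doc_idx, features.index('?')])
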
Example #4
def build_word_vocab(
        texts: List[str] = None,
        max_n=None,
        count_vectorizer_analyzer: CountVectorizerAnalyzer = None
) -> List[str]:
    count_vectorizer_analyzer = (count_vectorizer_analyzer
                                 or CountVectorizerAnalyzer(texts))
    top_words = count_vectorizer_analyzer.extract_dataset_metric().index
    if max_n is None:
        return top_words.to_list()
    return top_words[:max_n].to_list()
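
A brief usage sketch for build_word_vocab (the corpus is illustrative; extract_dataset_metric() is assumed to return tokens sorted by descending count, which is what the top_words slicing above relies on):

vocab_top10 = build_word_vocab(texts=['how do you do', 'how are you'], max_n=10)
vocab_full = build_word_vocab(texts=['how do you do', 'how are you'])
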
Example #5
 def test_wordpunkt_vectorizer(self):
     df_res = CountVectorizerAnalyzer(
         texts,
         strategy=TokenizerStrategy.WORD_PUNKT).extract_dataset_metric()
     self.assertEqual(
         {
             '?': 4,
             ',': 3,
             'you': 3,
             'in': 2,
             'words': 2,
             'how': 2,
             'handle': 2,
             'do': 2,
             'sentence': 1,
             'no': 1,
             'of': 1,
             'otheruser12': 1,
             'right': 1,
             's': 1,
             'spaces': 1,
             'mighta': 1,
             'the': 1,
             'this': 1,
             'twitter': 1,
             'u': 1,
             'uppercase': 1,
             'username213': 1,
             'vocabulary': 1,
             'xd': 1,
             'missing': 1,
             '.?': 1,
             'many': 1,
             '3': 1,
             ':': 1,
             '<': 1,
             '.': 1,
             '?@': 1,
             '@': 1,
             'also': 1,
             'and': 1,
             'are': 1,
             'bedifferent': 1,
             'can': 1,
             'compound': 1,
             'distinguish': 1,
             'ehm': 1,
             'hu': 1,
             '-': 1,
             'lowercase': 1
         },
         df_res.to_dict()['count'])
Example #6
 def test_twitter_vectorizer(self):
     df_res = CountVectorizerAnalyzer(
         texts,
         strategy=TokenizerStrategy.NLTK_TWEET).extract_dataset_metric()
     # Twitter handles are preserved, and so are emoticons such as '<3'
     self.assertEqual(
         {
             '?': 6,
             ',': 3,
             'you': 3,
             'in': 2,
             '.': 2,
             'how': 2,
             'handle': 2,
             'do': 2,
             'missing': 1,
             'of': 1,
             'right': 1,
             's': 1,
             'sentence': 1,
             'spaces': 1,
             'the': 1,
             'this': 1,
             'twitter': 1,
             'u': 1,
             'uppercase': 1,
             'vocabulary': 1,
             'words': 1,
             'xd': 1,
             'no': 1,
             '<3': 1,
             'mighta': 1,
             ':': 1,
             'lowercase': 1,
             'hu': 1,
             'ehm': 1,
             'distinguish': 1,
             'compound-words': 1,
             'can': 1,
             'bedifferent': 1,
             'are': 1,
             'and': 1,
             'also': 1,
             '@username213': 1,
             '@otheruser12': 1,
             'many': 1
         },
         df_res.to_dict()['count'])
Example #7
 def test_nltk_base_vectorizer_de(self):
     df_res = CountVectorizerAnalyzer(texts, strategy=TokenizerStrategy.NLTK_BASE, language='german') \
         .extract_dataset_metric()
     # note: 'u.s.' is now split into 'u.s' and '.' (compare with the English NLTK base tokenizer)
     self.assertEqual(
         {
             '?': 6,
             ',': 3,
             'you': 3,
             'do': 2,
             '@': 2,
             'in': 2,
             'how': 2,
             'handle': 2,
             ':': 1,
             'otheruser12': 1,
             'right': 1,
             'sentence': 1,
             'spaces': 1,
             'the': 1,
             'this': 1,
             'no': 1,
             'twitter': 1,
             'u.s': 1,
             'uppercase': 1,
             'username213': 1,
             'vocabulary': 1,
             'words': 1,
             'xd': 1,
             'of': 1,
             'many': 1,
             'missing': 1,
             'mighta': 1,
             '<': 1,
             '.': 1,
             'hu': 1,
             'ehm': 1,
             'distinguish': 1,
             'compound-words': 1,
             'can': 1,
             'bedifferent': 1,
             'are': 1,
             'and': 1,
             'also': 1,
             '3': 1,
             'lowercase': 1
         },
         df_res.to_dict()['count'])
Example #8
 def test_spacy_vectorizer_de(self):
     df_res = CountVectorizerAnalyzer(texts, strategy=TokenizerStrategy.SPACY, language='german') \
         .extract_dataset_metric()
     # interestingly, 'compound-words' is treated differently than in the English spaCy model
     self.assertEqual(
         {
             '?': 5,
             'you': 3,
             ',': 3,
             'do': 2,
             'how': 2,
             'handle': 2,
             'in': 2,
             'also': 1,
             'missing': 1,
             'xd': 1,
             'words': 1,
             'vocabulary': 1,
             'uppercase': 1,
             'u.s': 1,
             'twitter': 1,
             'this': 1,
             'the': 1,
             'spaces': 1,
             'sentence': 1,
             'right?@username213': 1,
             'of': 1,
             'no': 1,
             'mighta': 1,
             'and': 1,
             'many': 1,
             'lowercase': 1,
             '.': 1,
             'hu': 1,
             ':': 1,
             '<3': 1,
             'ehm': 1,
             '@otheruser12': 1,
             'distinguish': 1,
             'compound-words': 1,
             'can': 1,
             'bedifferent': 1,
             'are': 1,
             ' ': 1
         },
         df_res.to_dict()['count'])
Example #9
 def test_regex_vectorizer_with_casing(self):
     df_res = CountVectorizerAnalyzer(texts, strategy=TokenizerStrategy.REGEX, lowercase=False) \
         .extract_dataset_metric()
     self.assertEqual(
         {
             'in': 2,
             'do': 2,
             'words': 2,
             'handle': 2,
             'you': 2,
             'How': 2,
             'Twitter': 1,
             'miGhta': 1,
             'xD': 1,
             'vocabulary': 1,
             'username213': 1,
             'uppercase': 1,
             'this': 1,
             'the': 1,
             'spaces': 1,
             'sentence': 1,
             'right': 1,
             'otheruser12': 1,
             'of': 1,
             'missing': 1,
             'many': 1,
             'U': 1,
             'lowercase': 1,
             'Ehm': 1,
             'hu': 1,
             'No': 1,
             'S': 1,
             'distinguish': 1,
             'compound': 1,
             'can': 1,
             'bedifferent': 1,
             'are': 1,
             'and': 1,
             'also': 1,
             'You': 1,
             '3': 1
         },
         df_res.to_dict()['count'])
Example #10
 def test_python_vectorizer(self):
     df_res = CountVectorizerAnalyzer(
         texts, strategy=TokenizerStrategy.PYTHON).extract_dataset_metric()
     self.assertEqual(
         {
             'you': 3,
             'in': 2,
             'how': 2,
             'handle': 2,
             'the': 1,
             'right?@username213:': 1,
             'sentence?': 1,
             'spaces?no?': 1,
             'this': 1,
             'missing': 1,
             'twitter': 1,
             'u.s.?': 1,
             'uppercase': 1,
             'vocabulary': 1,
             'words': 1,
             'xd': 1,
             'of': 1,
             '<3': 1,
             'mighta': 1,
             '@otheruser12': 1,
             'lowercase,': 1,
             'ehm,do': 1,
             'do': 1,
             'distinguish': 1,
             'compound-words': 1,
             'can': 1,
             'bedifferent,hu?': 1,
             'are': 1,
             'and': 1,
             'also': 1,
             'many': 1
         },
         df_res.to_dict()['count'])
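
The tests above run the same corpus through each tokenizer strategy. A hedged comparison sketch over the strategies exercised in this section (the printed output format is illustrative):

for strategy in (TokenizerStrategy.NLTK_BASE, TokenizerStrategy.WORD_PUNKT,
                 TokenizerStrategy.NLTK_TWEET, TokenizerStrategy.SPACY,
                 TokenizerStrategy.REGEX, TokenizerStrategy.PYTHON):
    counts = CountVectorizerAnalyzer(texts, strategy=strategy) \
        .extract_dataset_metric()
    # the three most frequent tokens under this strategy
    print(strategy, counts.head(3).to_dict()['count'])
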
Example #11
class VocabularyWordSimplifier(BaseSimplifier):
    def __init__(self,
                 dataset: Optional[List[str]] = None,
                 min_term_frequency: float = 1,
                 max_document_frequency: float = 0,
                 lowercase: bool = False,
                 tokenizer_strategy: TokenizerStrategy = TokenizerStrategy.NLTK_BASE,
                 tokenizer_language: str = 'english'):
        """
        :param dataset: corpus used to calculate vocabulary
        :param min_term_frequency: min number of occurrences of word across whole corpus, if a value in the interval
        of ]0,1[ is used, this is considered as a fraction of the total word count, otherwise absolute count
        :param max_document_frequency: maximum number of documents the word is allowed to appear on average
        (multiple occurrences within a single document count too);
        if a value in the interval of ]0,1[ is used, this is considered as a fraction of the total document count
        :param lowercase: convert all texts to lowercase
        :param tokenizer_strategy: strategy used to obtain individual words
        :param tokenizer_language: text-language used at tokenization
        """
        # normalize None to an empty list to avoid a mutable default argument
        dataset = dataset if dataset is not None else []
        self._dataset = dataset
        self._min_tf = min_term_frequency
        self._max_df = max_document_frequency
        self.min_tf = 1
        self.max_df = None
        self.vocabulary = {}
        self._lowercase = lowercase
        self._tokenizer_language = tokenizer_language
        self._tokenizer_strategy = tokenizer_strategy
        self._cva = CountVectorizerAnalyzer(dataset,
                                            lowercase=lowercase,
                                            language=tokenizer_language,
                                            strategy=tokenizer_strategy)
        super().__init__()

    def _init_statistics(self):
        self.min_tf = 1.0
        df_vocab = self._cva.extract_dataset_metric()
        if self._min_tf < 1:
            # fractional threshold: scale by the total token count
            self.min_tf = float(self._min_tf * df_vocab['count'].sum())
        elif self._min_tf > 1:
            self.min_tf = self._min_tf

        if not self._max_df:
            self.max_df = None
        elif self._max_df < 1:
            # fractional threshold: scale by the number of documents
            self.max_df = float(self._max_df * len(self._dataset))
        else:
            self.max_df = self._max_df

        df_vocab['count'] = df_vocab['count'].astype(float)
        selection_clause = df_vocab['count'] >= self.min_tf
        if self.max_df:
            selection_clause &= df_vocab['count'] <= self.max_df

        self.vocabulary = set(df_vocab[selection_clause].index)

    def can_init_statistics(self) -> bool:
        return bool(self._dataset)

    def simplify_text(self, text: str) -> str:
        if self._lowercase:
            text = text.lower()

        # keep only the tokens that passed the frequency thresholds
        tokens = self._cva.tokenizer.tokenize_text(text)
        text = ' '.join(
            [token for token in tokens if token in self.vocabulary])

        return text

    def load_parameters(self, dataset: List[str], **kwargs):
        self._dataset = dataset
        self._cva = CountVectorizerAnalyzer(dataset,
                                            lowercase=self._lowercase,
                                            language=self._tokenizer_language,
                                            strategy=self._tokenizer_strategy)
        super().load_parameters(**kwargs)
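
A worked end-to-end sketch, assuming the BaseSimplifier base class normally drives _init_statistics() once can_init_statistics() is true (the direct call below stands in for that, and the corpus is illustrative). The corpus holds 10 tokens, so min_term_frequency=0.2 yields min_tf = 2.0, and only 'how' (2), 'do' (3) and 'you' (3) survive:

corpus = ['how do you do', 'how are you', 'do you mind']
simplifier = VocabularyWordSimplifier(dataset=corpus, min_term_frequency=0.2)
simplifier._init_statistics()  # normally invoked via the base class
print(simplifier.vocabulary)   # {'how', 'do', 'you'}
# tokens outside the vocabulary ('are', ',', 'friend', '?') are dropped
print(simplifier.simplify_text('how are you, friend?'))  # -> 'how you'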