Example 1
    def _prepare_data(self, data, save_preprocessed=False):
        words = []
        phrases = []
        for phrase in data:
            prep_phrase = self.preprocess(phrase)
            if save_preprocessed:
                self._data.append(prep_phrase)
            else:
                self._data.append(phrase)

            phrases.append(prep_phrase)

            words += prep_phrase.split() if isinstance(prep_phrase, str) else [prep_phrase]

        self._words_freq = Freq(words)
        self.__recovery_word_freq = self._words_freq.copy()

        self._phrases_freq = Freq(phrases)
        self.__recovery_phrase_freq = self._phrases_freq.copy()
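
For context, the method above reduces to two frequency tables built in a single pass: one over the tokens of each preprocessed phrase and one over the phrases themselves, with recovery copies kept so the tables can later be restored. A minimal, self-contained sketch of the same idea, using collections.Counter as a stand-in for Freq and str.lower as a placeholder preprocessor (both are assumptions for illustration, not the library's actual classes):

from collections import Counter

def prepare_data(data, preprocess=str.lower):
    # Illustrative stand-in: build word- and phrase-frequency tables in one pass.
    words, phrases = [], []
    for phrase in data:
        prep = preprocess(phrase)   # placeholder for the real preprocessing step
        phrases.append(prep)
        words += prep.split()       # one entry per token
    return Counter(words), Counter(phrases)

words_freq, phrases_freq = prepare_data(["How are you?", "how are you?"])
print(words_freq["how"])             # 2 -- both phrases normalize to the same tokens
print(phrases_freq["how are you?"])  # 2 -- identical after preprocessing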
Example 2
class FreqTest(unittest.TestCase):
    def setUp(self) -> None:
        self.freq = Freq([1, 2, 3, 4, 4])

    def test_probability(self):
        self.assertEqual(self.freq.probability, {
            1: 0.2,
            2: 0.2,
            3: 0.2,
            4: 0.4
        })

    def test_item_prob(self):
        self.assertEqual(self.freq.item_prob(4), 0.4)

    def test_sanity(self):
        self.assertEqual(preprocess.separate('how are you?', sep='?'),
                         'how are you ?')
        self.assertEqual(
            preprocess.separate('how are you,man?',
                                sep=('?', ','),
                                between_char=True), 'how are you , man ?')
        self.assertEqual(preprocess.separate('how are! you?'),
                         'how are ! you ?')

        freq = Freq([1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 8, 'hi', 'o', 'a'])
        self.assertEqual(freq.sample(max_freq=1), {
            4: 1,
            5: 1,
            6: 1,
            7: 1,
            8: 1,
            'hi': 1,
            'o': 1,
            'a': 1
        })
        self.assertEqual(freq.sample(freq=2), {1: 2, 3: 2, 2: 2})

        self.assertRaises(AssertionError, freq.sample, freq=1, max_freq=2)
        self.assertRaises(AssertionError, freq.sample, freq=1, min_freq=2)

        freq = Freq([1, 2, 3, 3, 4, 5, 6, 7, 6, 7, 12, 31, 123, 5, 3])
        self.assertEqual(freq.least_freq(), {
            123: 1,
            31: 1,
            12: 1,
            4: 1,
            2: 1,
            1: 1,
            7: 2,
            6: 2,
            5: 2,
            3: 3
        })
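
Taken together, these assertions pin down the Freq semantics used throughout: probability normalizes counts to relative frequencies, item_prob looks one of them up, sample filters items by how often they occur, and least_freq orders items from rarest to most common. A rough equivalent of the counting and filtering behaviour with plain collections.Counter, offered only as a sketch of what the tests expect rather than the library's implementation:

from collections import Counter

items = [1, 2, 3, 4, 4]
counts = Counter(items)
total = sum(counts.values())

# probability: counts normalized to relative frequencies
probability = {item: n / total for item, n in counts.items()}
assert probability[4] == 0.4          # same value test_item_prob checks

# sample(max_freq=1): keep only items that occur at most once
only_once = {item: n for item, n in counts.items() if n <= 1}
assert only_once == {1: 1, 2: 1, 3: 1}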
Example 3
class LanguageData(BaseData):

    def __init__(self, data, **kwargs):

        self._preprocessor = Preprocessor(hook=self.re_build, **kwargs)

        self._phrases_freq: Freq = ...
        self._words_freq: Freq = ...
        self.__recovery_phrase_freq: Freq = ...
        self.__recovery_word_freq: Freq = ...
        super().__init__(data=data)
        self._build = True

    @property
    def config(self):
        return self._preprocessor.config

    def re_build(self):
        if hasattr(self, '_build'):
            data = self._data.copy()
            self._data.clear()
            self._phrases_freq = None
            self._words_freq = None
            self.__recovery_phrase_freq = None
            self.__recovery_word_freq = None
            self._prepare_data(data)

    def reset_freq(self):
        self._phrases_freq = self.__recovery_phrase_freq.copy()
        self._words_freq = self.__recovery_word_freq.copy()

    @property
    def vocab_size(self):
        return len(self._words_freq)

    def __contains__(self, item):
        return item in self.words_freq or item in self.phrases_freq

    def __repr__(self):
        return f'LanguageData(examples: {len(self)} - vocab_size: {self.vocab_size})'

    def __len__(self):
        return len(self._data)

    def __iter__(self):
        return iter(self._data)

    @property
    def punctuation(self):
        return self.config.punctuation

    @property
    def words_freq(self) -> Freq:
        return self._words_freq

    @property
    def phrases_freq(self) -> Freq:
        return self._phrases_freq

    @property
    def data_prep(self):
        for phrase in self.data:
            yield self.preprocess(phrase)

    def word_freq(self, word: str):
        return self._words_freq.get(self.preprocess(word))

    @property
    def words(self):
        return set(self._words_freq)

    def preprocess(self, value: Union[str, List[str]]):
        return self._preprocessor.preprocess(value)

    def synergy(self, value: Union[Iterable[str], str]) -> Number:
        """
        Returns how important the value sent is in relation to the data set
        """
        if value is None:
            return 0.0
        if isinstance(value, str):
            value = [value]
        try:
            value = set(itertools.chain(*map(lambda val: self.preprocess(val).split(), value)))
        except AttributeError:
            raise ValueError("Inconsistent data format.")
        result = list(map(self._words_freq.item_prob, value))
        zeros = result.count(0)
        if zeros:
            return sum(result) - (zeros / len(value))
        return sum(result)

    def save_freq(self, save_on: str, prefix='freq', ext: str = 'json', probability=False):
        ext = ext.strip('.')  # normalize
        save_on = Path(save_on)

        path_words = save_on.join(f'{prefix}_words.{ext}')
        self.words_freq.to_json(path_words, probability=probability, exist_ok=True)

        path_phrases = save_on.join(f'{prefix}_phrases.{ext}')
        self.phrases_freq.to_json(path_phrases, probability=probability, exist_ok=True)

    def _prepare_data(self, data, save_preprocessed=False):
        words = []
        phrases = []
        for phrase in data:
            prep_phrase = self.preprocess(phrase)
            if save_preprocessed:
                self._data.append(prep_phrase)
            else:
                self._data.append(phrase)

            phrases.append(prep_phrase)

            words += prep_phrase.split() if isinstance(prep_phrase, str) else [prep_phrase]

        self._words_freq = Freq(words)
        self.__recovery_word_freq = self._words_freq.copy()

        self._phrases_freq = Freq(phrases)
        self.__recovery_phrase_freq = self._phrases_freq.copy()

    def sample_words_freq(self, freq: int = None, max_items: int = -1, order='most_common'):
        return self._words_freq.sample(max_items=max_items, max_freq=freq, order=order)

    def sample_phrases_freq(self, freq: int = None, max_items: int = -1, order='most_common'):
        return self._phrases_freq.sample(max_items=max_items, max_freq=freq, order=order)
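
Assuming Freq, Preprocessor and BaseData come from the same package and that a corpus is simply an iterable of strings, a typical interaction with the class above might look like the following; the corpus and the commented values are purely illustrative:

corpus = ["How are you?", "I am fine, thanks.", "How are you doing today?"]

ld = LanguageData(corpus)

print(ld)                        # LanguageData(examples: 3 - vocab_size: ...)
print(ld.vocab_size)             # number of distinct preprocessed tokens
print(ld.word_freq("How"))       # frequency of the token after preprocessing
print(ld.synergy("how are you")) # higher when the tokens are common in the corpus

ld.reset_freq()                  # restore the frequency tables captured at build time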