import unittest

# NOTE: `Freq` and the `preprocess` module are part of the package under test;
# their exact import paths are project-specific.


class FreqTest(unittest.TestCase):
    def setUp(self) -> None:
        self.freq = Freq([1, 2, 3, 4, 4])

    def test_probability(self):
        self.assertEqual(self.freq.probability, {1: 0.2, 2: 0.2, 3: 0.2, 4: 0.4})

    def test_item_prob(self):
        self.assertEqual(self.freq.item_prob(4), 0.4)

    def test_sanity(self):
        self.assertEqual(preprocess.separate('how are you?', sep='?'), 'how are you ?')
        self.assertEqual(
            preprocess.separate('how are you,man?', sep=('?', ','), between_char=True),
            'how are you , man ?')
        self.assertEqual(preprocess.separate('how are! you?'), 'how are ! you ?')

        freq = Freq([1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 8, 'hi', 'o', 'a'])
        self.assertEqual(freq.sample(max_freq=1),
                         {4: 1, 5: 1, 6: 1, 7: 1, 8: 1, 'hi': 1, 'o': 1, 'a': 1})
        self.assertEqual(freq.sample(freq=2), {1: 2, 3: 2, 2: 2})
        # conflicting filter combinations are rejected
        self.assertRaises(AssertionError, freq.sample, freq=1, max_freq=2)
        self.assertRaises(AssertionError, freq.sample, freq=1, min_freq=2)

        freq = Freq([1, 2, 3, 3, 4, 5, 6, 7, 6, 7, 12, 31, 123, 5, 3])
        self.assertEqual(freq.least_freq(),
                         {123: 1, 31: 1, 12: 1, 4: 1, 2: 1, 1: 1, 7: 2, 6: 2, 5: 2, 3: 3})
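# Illustrative usage sketch (not part of the test suite): shows the Freq
# behaviour the assertions above rely on. It assumes Freq is importable from
# the package under test; the expected values follow from count / total.
if __name__ == '__main__':
    freq = Freq(['a', 'a', 'b', 'c'])
    print(freq.probability)          # expected: {'a': 0.5, 'b': 0.25, 'c': 0.25}
    print(freq.item_prob('a'))       # expected: 0.5
    print(freq.sample(max_freq=1))   # expected: {'b': 1, 'c': 1}, items seen at most once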
import itertools
from typing import Iterable, List, Union

# NOTE: `BaseData`, `Preprocessor`, `Freq`, `Path` and `Number` come from the
# package's own modules; their exact import paths are project-specific.


class LanguageData(BaseData):
    def __init__(self, data, **kwargs):
        self._preprocessor = Preprocessor(hook=self.re_build, **kwargs)
        # Ellipsis placeholders; populated by _prepare_data.
        self._phrases_freq: Freq = ...
        self._words_freq: Freq = ...
        self.__recovery_phrase_freq: Freq = ...
        self.__recovery_word_freq: Freq = ...
        super().__init__(data=data)
        self._build = True

    @property
    def config(self):
        return self._preprocessor.config

    def re_build(self):
        # Preprocessor hook: rebuilds the frequency tables from the stored
        # data, but only after the initial build has completed.
        if hasattr(self, '_build'):
            data = self._data.copy()
            self._data.clear()
            self._phrases_freq = None
            self._words_freq = None
            self.__recovery_phrase_freq = None
            self.__recovery_word_freq = None
            self._prepare_data(data)

    def reset_freq(self):
        # Restores the frequency tables saved at build time.
        self._phrases_freq = self.__recovery_phrase_freq.copy()
        self._words_freq = self.__recovery_word_freq.copy()

    @property
    def vocab_size(self):
        return len(self._words_freq)

    def __contains__(self, item):
        if item in self.words_freq:
            return True
        if item in self.phrases_freq:
            return True
        return False

    def __repr__(self):
        return f'LanguageData(examples: {len(self)} - vocab_size: {self.vocab_size})'

    def __len__(self):
        return len(self._data)

    def __iter__(self):
        return iter(self._data)

    @property
    def punctuation(self):
        return self.config.punctuation

    @property
    def words_freq(self) -> Freq:
        return self._words_freq

    @property
    def phrases_freq(self) -> Freq:
        return self._phrases_freq

    @property
    def data_prep(self):
        for phrase in self.data:
            yield self.preprocess(phrase)

    def word_freq(self, word: str):
        return self._words_freq.get(self.preprocess(word))

    @property
    def words(self):
        return set(self._words_freq)

    def preprocess(self, value: Union[str, List[str]]):
        return self._preprocessor.preprocess(value)

    def synergy(self, value: Union[Iterable[str], str]) -> Number:
        """
        Returns how important the value sent is in relation to the data set.
        """
        if value is None:
            return 0.0
        if isinstance(value, str):
            value = [value]
        try:
            value = set(itertools.chain(*map(lambda val: self.preprocess(val).split(), value)))
        except AttributeError:
            raise ValueError("Inconsistent data format.")
        result = list(map(self._words_freq.item_prob, value))
        # Each out-of-vocabulary word subtracts 1/len(value) from the score.
        zeros = result.count(0)
        if zeros:
            return sum(result) - (zeros / len(value))
        return sum(result)

    def save_freq(self, save_on: str, prefix='freq', ext: str = 'json', probability=False):
        # Writes '<prefix>_words.<ext>' and '<prefix>_phrases.<ext>' under save_on.
        ext = ext.strip('.')  # normalize
        save_on = Path(save_on)
        path_words = save_on.join(f'{prefix}_words.{ext}')
        self.words_freq.to_json(path_words, probability=probability, exist_ok=True)
        path_phrases = save_on.join(f'{prefix}_phrases.{ext}')
        self.phrases_freq.to_json(path_phrases, probability=probability, exist_ok=True)

    def _prepare_data(self, data, save_preprocessed=False):
        # Builds the word and phrase frequency tables and keeps recovery
        # copies for reset_freq().
        words = []
        phrases = []
        for phrase in data:
            prep_phrase = self.preprocess(phrase)
            if save_preprocessed:
                self._data.append(prep_phrase)
            else:
                self._data.append(phrase)
            phrases.append(prep_phrase)
            words += prep_phrase.split() if isinstance(prep_phrase, str) else [prep_phrase]

        self._words_freq = Freq(words)
        self.__recovery_word_freq = self._words_freq.copy()

        self._phrases_freq = Freq(phrases)
        self.__recovery_phrase_freq = self._phrases_freq.copy()

    def sample_words_freq(self, freq: int = None, max_items: int = -1, order='most_common'):
        return self._words_freq.sample(max_items=max_items, max_freq=freq, order=order)

    def sample_phrases_freq(self, freq: int = None, max_items: int = -1, order='most_common'):
        return self._phrases_freq.sample(max_items=max_items, max_freq=freq, order=order)
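# Illustrative usage sketch, assuming LanguageData can be built directly from a
# list of phrases (extra keyword arguments are forwarded to the Preprocessor).
# The file names written by save_freq follow from its default prefix and ext.
if __name__ == '__main__':
    corpus = ['how are you?', 'how are you, man?', 'fine, thanks!']
    ld = LanguageData(corpus)
    print(ld)                         # LanguageData(examples: 3 - vocab_size: ...)
    print(ld.word_freq('how'))        # frequency of the preprocessed word 'how'
    print(ld.synergy('how are you'))  # sum of per-word probabilities; unknown words penalize the score
    ld.save_freq('.')                 # writes freq_words.json and freq_phrases.json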