def test_n_output_audio(self):
    """Each flow variant should yield several augmented audios, all differing from the input."""
    signal, sr = AudioLoader.load_audio(self.sample_wav_file)

    def make_augmenters():
        # Fresh augmenter instances for every pipeline.
        return [naa.CropAug(sampling_rate=sr), naa.LoudnessAug()]

    pipelines = [
        naf.Sequential(make_augmenters()),
        naf.Sometimes(make_augmenters(), pipeline_p=0.9),
        naf.Sequential([
            naf.Sequential(make_augmenters()),
            naf.Sometimes(make_augmenters(), pipeline_p=0.9),
        ]),
    ]

    for pipeline in pipelines:
        outputs = pipeline.augment(signal, n=3)
        self.assertGreater(len(outputs), 1)
        for output in outputs:
            self.assertFalse(np.array_equal(signal, output))

    self.assertLess(0, len(pipelines))
def test_n_output_spectrogram(self):
    """Spectrogram flows should produce multiple augmented outputs, none equal to the input."""
    mel_spectrogram = AudioLoader.load_mel_spectrogram(self.sample_wav_file, n_mels=128)

    def masking_pair():
        # Fresh augmenter instances for every pipeline.
        return [nas.FrequencyMaskingAug(), nas.TimeMaskingAug()]

    pipelines = [
        naf.Sequential(masking_pair()),
        naf.Sometimes(masking_pair(), pipeline_p=0.9),
        naf.Sequential([
            naf.Sequential(masking_pair()),
            naf.Sometimes(masking_pair(), pipeline_p=0.9),
        ]),
    ]

    for pipeline in pipelines:
        outputs = pipeline.augment(mel_spectrogram, n=3)
        self.assertGreater(len(outputs), 1)
        for output in outputs:
            self.assertFalse(np.array_equal(mel_spectrogram, output))

    self.assertLess(0, len(pipelines))
def test_multiple_actions(self):
    """Composite Sequential/Sometimes flows should alter at least one input text."""
    texts = [
        'The quick brown fox jumps over the lazy dog',
        'Zology raku123456 fasdasd asd4123414 1234584'
    ]

    def insert_delete():
        # Fresh character insert/delete augmenters per pipeline.
        return [nac.RandomCharAug(action=Action.INSERT),
                nac.RandomCharAug(action=Action.DELETE)]

    def substitute_chain():
        # OCR noise + keyboard typos + random substitutions.
        return [nac.OcrAug(),
                nac.QwertyAug(aug_min=1),
                nac.RandomCharAug(action=Action.SUBSTITUTE, aug_min=1,
                                  aug_char_p=0.6, aug_word_p=0.6)]

    flows = [
        naf.Sequential([
            naf.Sometimes(insert_delete(), pipeline_p=0.5),
            naf.Sequential(substitute_chain(), name='Sub_Seq'),
        ]),
        naf.Sometimes([
            naf.Sometimes(insert_delete()),
            naf.Sequential(substitute_chain()),
        ], pipeline_p=0.5),
    ]

    # Augmentation is probabilistic; retry up to 5 times before concluding
    # the flow never changes anything.
    for flow in flows:
        saw_change = False
        for _ in range(5):
            for text in texts:
                self.assertLess(0, len(text))
                if text != flow.augment(text):
                    saw_change = True
                    self.assertLess(0, len(text))
            if saw_change:
                break
        self.assertTrue(saw_change)

    self.assertLess(0, len(flows))
    self.assertLess(0, len(texts))
def test_multiple_actions(self):
    """Composite flows built from string actions should alter every input text within 5 tries."""
    texts = [
        'The quick brown fox jumps over the lazy dog',
        'Zology raku123456 fasdasd asd4123414 1234584'
    ]

    def insert_delete():
        # Fresh character insert/delete augmenters per pipeline.
        return [nac.RandomCharAug(action="insert"),
                nac.RandomCharAug(action="delete")]

    flows = [
        naf.Sequential([
            naf.Sometimes(insert_delete(), pipeline_p=0.9),
            naf.Sequential([
                nac.QwertyAug(aug_min=1),
                nac.RandomCharAug(action="substitute", aug_char_min=1,
                                  aug_char_p=0.6, aug_word_p=0.6),
            ], name='Sub_Seq'),
        ]),
        naf.Sometimes([
            naf.Sometimes(insert_delete()),
            naf.Sequential([
                nac.OcrAug(),
                nac.KeyboardAug(aug_char_min=1),
                nac.RandomCharAug(action="substitute", aug_char_min=1,
                                  aug_char_p=0.6, aug_word_p=0.6),
            ]),
        ], pipeline_p=0.9),
    ]

    # Augmentation is probabilistic; retry up to 5 times per text.
    for flow in flows:
        for text in texts:
            mutated = False
            for _ in range(5):
                if text != flow.augment(text, n=1):
                    mutated = True
                    break
            self.assertTrue(mutated)

    self.assertLess(0, len(flows))
    self.assertLess(0, len(texts))
def augmentation(text, insert=False, substitute=False, swap=True, delete=True,
                 word2vec_model_path='/media/jionie/my_disk/Kaggle/Tweet/model/word2vec/GoogleNews-vectors-negative300.bin'):
    """Randomly apply a configurable set of word-level augmentations to *text*.

    Each enabled augmenter is added to a ``naf.Sometimes`` pipeline that
    applies augmenters with probability 0.5 (``aug_p``/``pipeline_p``).

    Args:
        text: Input text to augment.
        insert: Insert words via a word2vec embedding model.
        substitute: Substitute words with WordNet synonyms.
        swap: Randomly swap adjacent words.
        delete: Randomly delete words.
        word2vec_model_path: Path to the word2vec binary used when
            ``insert`` is True. Defaults to the previously hard-coded path,
            so existing callers are unaffected; new callers can point it at
            their own model.

    Returns:
        The augmented text as produced by ``aug.augment(text, n=1)``.
    """
    augs = []
    if insert:
        augs.append(naw.WordEmbsAug(
            model_type='word2vec',
            model_path=word2vec_model_path,
            action="insert"))
    if substitute:
        augs.append(naw.SynonymAug(aug_src='wordnet'))
    if swap:
        augs.append(naw.RandomWordAug(action="swap"))
    if delete:
        augs.append(naw.RandomWordAug())

    pipeline = naf.Sometimes(augs, aug_p=0.5, pipeline_p=0.5)
    return pipeline.augment(text, n=1)
def test_n_output_without_augmentation(self):
    """Flows that cannot change the input should leave every output equal to the input."""
    texts = [
        'AAAAAAAAAAA AAAAAAAAAAAAAA'
    ]
    flows = [
        naf.Sequential([nac.OcrAug(), nac.OcrAug()]),
        naf.Sometimes([nac.RandomCharAug(), nac.RandomCharAug()],
                      pipeline_p=0.00001),
    ]
    for flow in flows:
        for text in texts:
            # Retry a few times; if any output ever differs, stop and fail below.
            for _ in range(5):
                mutated = any(candidate != text
                              for candidate in flow.augment(text, n=3))
                if mutated:
                    break
            self.assertFalse(mutated)

    self.assertLess(0, len(flows))
    self.assertLess(0, len(texts))
def train_eval_dataset(dataset: pd.DataFrame, lang="ita", expansion=10):
    """Expand each question with text augmentation and split variants 80/20 into train/dev.

    The original question always lands in the training split; the first 80%
    of its generated variants join it and the remaining 20% go to dev.
    Both resulting frames are shuffled before being returned.
    """
    nltk.download('averaged_perceptron_tagger')
    nltk.download('wordnet')
    nltk.download('omw')

    flow = naf.Sometimes([
        naw.SynonymAug(lang=lang, aug_min=10),
        naw.RandomWordAug("swap"),
        naw.RandomWordAug("delete"),
        nac.KeyboardAug(),
    ])

    train_rows, dev_rows = [], []
    for idx, row in dataset.iterrows():
        logging.info("[{}/{}] {}".format(idx, len(dataset), row["question"]))
        variants = list(flow.augment(row["question"], n=expansion))
        train_rows.append({"label": row["question_id"], "text": row["question"]})
        split_at = int(len(variants) * 0.8)
        train_rows.extend({"label": row["question_id"], "text": v}
                          for v in variants[:split_at])
        dev_rows.extend({"label": row["question_id"], "text": v}
                        for v in variants[split_at:])

    train = pd.DataFrame(train_rows).sample(frac=1.0)
    dev = pd.DataFrame(dev_rows).sample(frac=1.0)
    return train, dev
def test_augment_detail(self):
    """include_detail=True should return (text, details) with well-formed detail records."""
    text = 'The quick brown fox jumps over the lazy dog'
    flows = [
        naf.Sequential([
            naf.Sometimes([
                nac.RandomCharAug(action="insert"),
                nac.RandomCharAug(action="delete"),
            ], pipeline_p=0.5),
            naf.Sequential([
                nac.RandomCharAug(action="substitute", aug_char_min=1,
                                  aug_char_p=0.6, aug_word_p=0.6),
            ], name='Sub_Seq'),
        ], include_detail=True),
        naf.Sometimes([
            naf.Sometimes([
                nac.RandomCharAug(action="insert"),
                nac.RandomCharAug(action="delete"),
            ]),
            naf.Sequential([
                nac.OcrAug(),
                nac.KeyboardAug(aug_char_min=1),
                nac.RandomCharAug(action="substitute", aug_char_min=1,
                                  aug_char_p=0.6, aug_word_p=0.6),
            ]),
        ], pipeline_p=1, include_detail=True),
    ]

    for flow in flows:
        augmented, details = flow.augment(text)
        self.assertNotEqual(text, augmented)
        self.assertGreater(len(details), 0)
        for detail in details:
            # Positions are zero-based, so anything >= 0 is valid.
            self.assertGreater(detail['orig_start_pos'], -1)
            self.assertGreater(detail['new_start_pos'], -1)
            self.assertGreater(detail['change_seq'], 0)
            self.assertIn(detail['action'], Action.getall())
def test_multi_thread(self):
    """augment(..., num_thread=k) should return exactly n results for every flow."""
    text = 'The quick brown fox jumps over the lazy dog'
    n = 3
    w2v_model_path = os.path.join(os.environ["MODEL_DIR"], 'word', 'word_embs',
                                  'GoogleNews-vectors-negative300.bin')
    flows = [
        naf.Sequential([
            naf.Sequential([
                nac.OcrAug(),
                naw.WordEmbsAug(model_type='word2vec',
                                model_path=w2v_model_path),
            ]),
            naf.Sequential([
                nac.RandomCharAug(),
            ]),
            naw.ContextualWordEmbsAug(model_path='xlnet-base-cased',
                                      action="substitute",
                                      temperature=0.7, device='cpu'),
        ]),
        naf.Sometimes([
            naf.Sequential([
                nac.OcrAug(),
                nac.RandomCharAug(),
            ]),
            naf.Sometimes([
                naw.WordEmbsAug(model_type='word2vec',
                                model_path=w2v_model_path),
            ], pipeline_p=0.999),
            naw.ContextualWordEmbsAug(model_path='xlnet-base-cased',
                                      action="substitute",
                                      temperature=0.7, device='cpu'),
        ], pipeline_p=0.9999),
    ]

    for num_thread in [1, 3]:
        for flow in flows:
            outputs = flow.augment(text, n=n, num_thread=num_thread)
            self.assertEqual(len(outputs), n)
def parse(config: dict) -> "naf.Sometimes":
    """Translate an {augmentation_name: kwargs} config into a Sometimes pipeline.

    Args:
        config: Mapping from augmentation name (a key of ``AUGMENTATIONS``)
            to a kwargs dict for its constructor, or ``None`` to use the
            constructor's defaults.

    Returns:
        A ``naf.Sometimes`` pipeline wrapping the configured augmenters.
        (The previous ``-> list`` annotation was wrong: the function has
        always returned the pipeline object, not a list.)

    Raises:
        KeyError: If a name in *config* is not in ``AUGMENTATIONS``.
    """
    augmentations = []
    for key, value in config.items():
        factory = AUGMENTATIONS.get(key, None)
        if factory is None:
            # list(...) so the message shows clean names instead of an
            # unreadable "dict_keys([...])" repr.
            raise KeyError(f"No augmentation named: {key}\n"
                           f"Available augmentations: {list(AUGMENTATIONS)}")
        augmentations.append(factory(**value) if value is not None else factory())
    return naf.Sometimes(augmentations)
def nlpaug(word):
    """Apply a randomly selected mix of character- and word-level corruptions to *word*."""
    pipeline = naf.Sometimes([
        nac.OcrAug(),
        nac.KeyboardAug(),
        nac.RandomCharAug(action="insert"),
        nac.RandomCharAug(action="substitute"),
        nac.RandomCharAug(action="swap"),
        nac.RandomCharAug(action="delete"),
        naw.SpellingAug(),
    ])
    return pipeline.augment(word)
def test_n_output(self):
    """Every flow should yield multiple augmented texts, each differing from the input."""
    texts = [
        'The quick brown fox jumps over the lazy dog',
        'Zology raku123456 fasdasd asd4123414 1234584',
        'AAAAAAAAAAA AAAAAAAAAAAAAA'
    ]

    def insert_delete():
        # Fresh character insert/delete augmenters per pipeline.
        return [nac.RandomCharAug(action=Action.INSERT),
                nac.RandomCharAug(action=Action.DELETE)]

    flows = [
        naf.Sequential([nac.RandomCharAug(action=Action.INSERT),
                        naw.RandomWordAug()]),
        naf.Sometimes(insert_delete(), pipeline_p=0.9),
        naf.Sequential([
            naf.Sequential([nac.RandomCharAug(action=Action.INSERT),
                            naw.RandomWordAug()]),
            naf.Sometimes(insert_delete(), pipeline_p=0.9),
        ]),
    ]

    for flow in flows:
        for text in texts:
            outputs = flow.augment(text, n=3)
            self.assertGreater(len(outputs), 1)
            for output in outputs:
                self.assertNotEqual(output, text)

    self.assertLess(0, len(flows))
    self.assertLess(0, len(texts))
def augmentation(self, text, insert=False, substitute=False, swap=True, delete=True,
                 word2vec_model_path='/C:/Users/admin/Documents/Nitin/mycodes/kaggle_google_quest_qna/data/helpers/word2vec/GoogleNews-vectors-negative300.bin'):
    """Randomly apply a configurable set of word-level augmentations to *text*.

    Each enabled augmenter is added to a ``naf.Sometimes`` pipeline that
    applies augmenters with probability 0.5 (``aug_p``/``pipeline_p``).

    Args:
        text: Input text to augment.
        insert: Insert words via a word2vec embedding model.
        substitute: Substitute words with WordNet synonyms.
        swap: Randomly swap adjacent words.
        delete: Randomly delete words.
        word2vec_model_path: Path to the word2vec binary used when
            ``insert`` is True. Defaults to the previously hard-coded path,
            so existing callers are unaffected.

    Returns:
        The augmented text as produced by ``aug.augment(text, n=1)``.
    """
    augs = []
    if insert:
        # NOTE(review): the default path's leading '/' before 'C:' looks
        # wrong for a Windows path — confirm before relying on insert=True.
        augs.append(naw.WordEmbsAug(
            model_type='word2vec',
            model_path=word2vec_model_path,
            action="insert"))
    if substitute:
        augs.append(naw.SynonymAug(aug_src='wordnet'))
    if swap:
        augs.append(naw.RandomWordAug(action="swap"))
    if delete:
        augs.append(naw.RandomWordAug())

    pipeline = naf.Sometimes(augs, aug_p=0.5, pipeline_p=0.5)
    return pipeline.augment(text, n=1)
def test_n_output_without_augmentation(self):
    """When no augmentation happens, augment(n=3) should collapse to a single unchanged output."""
    texts = ['AAAAAAAAAAA AAAAAAAAAAAAAA']
    flows = [
        naf.Sequential([nac.OcrAug(), nac.OcrAug()]),
        naf.Sometimes([nac.RandomCharAug(), nac.RandomCharAug()],
                      pipeline_p=0.00001),
    ]
    for flow in flows:
        for text in texts:
            passed_through = False
            # Retry a few times since augmentation is probabilistic.
            for _ in range(5):
                outputs = flow.augment(text, n=3)
                if len(outputs) == 1 and outputs[0] == text:
                    passed_through = True
                    break
            self.assertTrue(passed_through)

    self.assertLess(0, len(flows))
    self.assertLess(0, len(texts))
def test_single_action(self):
    """A single Sometimes-wrapped insert augmenter should change at least one text within 5 tries."""
    texts = [
        'The quick brown fox jumps over the lazy dog',
        'Zology raku123456 fasdasd asd4123414 1234584 s@#'
    ]

    # Augmentation is probabilistic (pipeline_p=0.6), so retry up to 5 times.
    saw_change = False
    for _ in range(5):
        flow = naf.Sometimes([nac.RandomCharAug(action=Action.INSERT)],
                             pipeline_p=0.6)
        for text in texts:
            if text != flow.augment(text):
                saw_change = True
                self.assertLess(0, len(text))
        if saw_change:
            break

    self.assertTrue(saw_change)
    self.assertLess(0, len(texts))
def test_dry_run(self):
    """An empty Sometimes pipeline applied to an empty input yields no results."""
    pipeline = naf.Sometimes()
    self.assertEqual(0, len(pipeline.augment([])))