def present_participle(self, word):
    """Return the present-participle form of ``word``.

    The work is delegated to ``self.inflect.present_participle``. When
    that call returns a falsy value, or raises, the original word is
    handed back unchanged; the exception text is logged at debug level.
    """
    try:
        return self.inflect.present_participle(word) or word
    except Exception as e:
        log.debug(getattr(e, 'message', str(e)))
        return word
def pluralize_adj(self, word):
    """Return the plural adjective form of ``word``.

    The work is delegated to ``self.inflect.plural_adj``. When that call
    returns a falsy value, or raises, the original word is handed back
    unchanged; the exception text is logged at debug level.
    """
    try:
        return self.inflect.plural_adj(word) or word
    except Exception as e:
        log.debug(getattr(e, 'message', str(e)))
        return word
def singularize_noun(self, word):
    """Return the singular form of the noun ``word``.

    The work is delegated to ``self.inflect.singular_noun``. When that
    call returns a falsy value, or raises, the original word is handed
    back unchanged; the exception text is logged at debug level.
    """
    try:
        return self.inflect.singular_noun(word) or word
    except Exception as e:
        log.debug(getattr(e, 'message', str(e)))
        return word
def repeat_char(self, word):
    """Randomly repeat a letter in a word.

    A position is obtained from ``utils.rand_index(word)`` and the
    character at that position is doubled in place.

    Args:
        word (str): the word to corrupt.

    Returns:
        str: the word with one character repeated. An empty word is
        returned unchanged.
    """
    if not word:
        # Nothing to repeat; indexing an empty string would raise.
        return word
    pos = utils.rand_index(word)
    uword = word[:pos] + word[pos] + word[pos:]
    # Lazy %-style arguments: the previous message mixed str.format with
    # a stray "%s" that was never filled in and leaked into the log.
    log.debug("[repeat char] %s -> %s", word, uword)
    return uword
def add_char(self, word):
    """Randomly add a vowel or a letter from the word itself.

    The inserted character is picked with ``random.choice`` from the
    characters of ``word + 'aeiou'`` and placed at the position returned
    by ``utils.rand_index(word)``.

    Args:
        word (str): the word to corrupt.

    Returns:
        str: the word with one extra character inserted.
    """
    pos = utils.rand_index(word)
    # random.choice indexes a str directly; no intermediate list needed.
    addition = random.choice(word + 'aeiou')
    uword = word[:pos] + addition + word[pos:]
    # Lazy %-style arguments: the previous message mixed str.format with
    # a stray "%s" that was never filled in and leaked into the log.
    log.debug("[add char] %s -> %s", word, uword)
    return uword
def repeat_char(self, word):
    """Randomly repeat a letter in a word.

    A position is obtained from ``utils.rand_index(word)`` and the
    character at that position is doubled in place.

    Args:
        word (str): the word to corrupt.

    Returns:
        str: the word with one character repeated. An empty word is
        returned unchanged.
    """
    if not word:
        # Nothing to repeat; indexing an empty string would raise.
        return word
    log.debug("repeat change---before: %s", word)
    pos = utils.rand_index(word)
    word = word[:pos] + word[pos] + word[pos:]
    log.debug("repeat change---after: %s", word)
    return word
def swap_chars(self, word):
    """Randomly swap the letters at two positions in a word.

    Both positions come from ``utils.rand_index(word)``. When the two
    draws coincide there is nothing to swap and the word is returned
    as is.

    Args:
        word (str): the word to corrupt.

    Returns:
        str: a word of the same length with two characters exchanged.
        Words shorter than two characters are returned unchanged.
    """
    if len(word) < 2:
        # A swap needs two distinct positions.
        return word
    i, j = utils.rand_index(word), utils.rand_index(word)
    if i > j:
        i, j = j, i  # keep i <= j so the slices below line up
    if i == j:
        # The slicing expression below would emit word[i] twice and make
        # the word one character longer, so treat this as a no-op.
        uword = word
    else:
        uword = word[:i] + word[j] + word[i + 1:j] + word[i] + word[j + 1:]
    # Lazy %-style arguments: the previous message mixed str.format with
    # a stray "%s" that was never filled in and leaked into the log.
    log.debug("[swap char] %s -> %s", word, uword)
    return uword
def swap_char(self, word):
    """Randomly swap two neighboring letters in a word.

    A position is obtained from ``utils.rand_index(word)``. The character
    there is exchanged with its left neighbor, or with its right neighbor
    when the position is 0.

    Args:
        word (str): the word to corrupt.

    Returns:
        str: the word with two adjacent characters exchanged. Words
        shorter than two characters are returned unchanged.
    """
    if len(word) < 2:
        # No neighbor to swap with; word[j] below would raise IndexError.
        return word
    log.debug("swap change---before: %s", word)
    pos = utils.rand_index(word)
    i, j = (pos - 1, pos) if pos > 0 else (pos, pos + 1)
    word = word[:i] + word[j] + word[i] + word[j + 1:]
    log.debug("swap change---after: %s", word)
    return word
def add_char(self, word):
    """Insert one extra character at a random position in ``word``.

    The character is chosen with ``random.choice`` from the letters of
    the word itself followed by the vowels ``aeiou``; the insertion point
    comes from ``utils.rand_index(word)``.
    """
    log.debug("addition change---before: %s" % word)
    index = utils.rand_index(word)
    candidates = list(word + 'aeiou')
    extra = random.choice(candidates)
    word = ''.join((word[:index], extra, word[index:]))
    log.debug("addition change---after: %s" % word)
    return word
def drop_char(self, word):
    """Drop one character from a word, never the first one.

    A position is obtained from ``utils.rand_index(word)``; position 0 is
    bumped to 1 so the leading letter always survives.

    TODO test [vowel or double] only.

    Args:
        word (str): the word to corrupt.

    Returns:
        str: the word with one character removed. Words shorter than two
        characters have nothing droppable and are returned unchanged.
    """
    if len(word) < 2:
        # Only the protected first letter (or nothing) is present.
        return word
    pos = utils.rand_index(word)
    if pos == 0:
        pos += 1  # don't drop first letter of a word
    uword = word[:pos] + word[pos + 1:]
    # Lazy %-style arguments: the previous message mixed str.format with
    # a stray "%s" that was never filled in and leaked into the log.
    log.debug("[drop char] %s -> %s", word, uword)
    return uword
def flip_char(self, word):
    """Randomly flip the letters at two positions in a word.

    Both positions come from ``utils.rand_index(word)``. When the two
    draws coincide there is nothing to flip and the word is returned
    as is.

    Args:
        word (str): the word to corrupt.

    Returns:
        str: a word of the same length with two characters exchanged.
        Words shorter than two characters are returned unchanged.
    """
    if len(word) < 2:
        # A flip needs two distinct positions.
        return word
    log.debug("flip change---before: %s", word)
    i, j = utils.rand_index(word), utils.rand_index(word)
    if i > j:
        i, j = j, i  # keep i <= j so the slices below line up
    if i != j:
        # With i == j the expression below would emit word[i] twice and
        # lengthen the word, so it is only applied to distinct positions.
        word = word[:i] + word[j] + word[i + 1:j] + word[i] + word[j + 1:]
    log.debug("flip change---after: %s", word)
    return word
def drop_char(self, word):
    """Drop one character from a word, never the first one.

    A position is obtained from ``utils.rand_index(word)``; position 0 is
    bumped to 1 so the leading letter always survives.

    TODO test [vowel or double] only.

    Args:
        word (str): the word to corrupt.

    Returns:
        str: the word with one character removed. Words shorter than two
        characters have nothing droppable and are returned unchanged.
    """
    if len(word) < 2:
        # Only the protected first letter (or nothing) is present.
        return word
    log.debug("drop change---before: %s", word)
    pos = utils.rand_index(word)
    if pos == 0:
        pos += 1  # don't drop first letter of a word
    uword = word[:pos] + word[pos + 1:]
    # Log the result, not the input: the "after" line used to print the
    # untouched word.
    log.debug("drop change---after: %s", uword)
    return uword
def flip_chars(self, word):
    """Flip characters in a word, trying three schemes in order.

    Schemes, each tried only if the previous one returned a falsy result:
        1. ie, ou, iou, ae, ea   (``self.flip_vowel_pairs``)
        2. gh, th, ng            (``self.flip_consonant_pairs``)
        3. two, chars, random    (``self.flip_rand_pairs``)

    Args:
        word (str): the word to corrupt.

    Returns:
        The result of the first scheme that yields a truthy value, or
        whatever ``self.flip_rand_pairs`` returns if none of the earlier
        ones did.
    """
    # ``or`` short-circuits, so later schemes only run as fallbacks.
    uword = (self.flip_vowel_pairs(word)
             or self.flip_consonant_pairs(word)
             or self.flip_rand_pairs(word))
    # Lazy %-style arguments: the previous message mixed str.format with
    # a stray "%s" that was never filled in and leaked into the log.
    log.debug("[flip char] %s -> %s", word, uword)
    return uword
def _inject_noise(self, parsed_sent):
    """
    Inject errors according to an overall rate.

    Two random numbers are drawn per token. ``rand1`` gates whether the
    token may be noised at all (it must be >= 1 - error_rate_overall);
    ``rand2`` selects the concrete corruption inside whichever branch
    matches. Branches are tried in order and are mutually exclusive:
    typo, word swap, determiner, preposition, noun, verb, adverb,
    adjective, punctuation, and a second, more permissive typo check.
    A branch that ends in ``pass`` appends nothing, i.e. the token is
    dropped from the output.

    Args:
        parsed_sent: iterable of tokens exposing ``text``, ``tag_`` and
            ``lemma_`` (presumably spaCy tokens -- TODO confirm).

    Returns:
        str: noised sentence (the result of ``self.detok`` on the
        non-empty output tokens)
    """
    noised_sent = []
    # Tokens are only eligible for noising when rand1 >= prob.
    prob = 1. - self.config.error_rate_overall
    for tok in parsed_sent:
        # Both numbers are drawn before the protected-token check, so
        # protected tokens consume random numbers too.
        rand1, rand2 = random.random(), random.random()
        # Protected tokens (matched on lower-cased text) pass through as is.
        if tok.text.lower() in self.protected_tokens:
            noised_sent.append(tok.text)
            continue

        # Orthographic errors
        if rand1 >= prob and rand2 <= self.error_typo and len(
                tok.text) > 4:
            typo = self.word_noiser.noise_word(tok.text)
            noised_sent.append(typo)
        # Swap current and previous words
        # (only once at least two tokens have already been emitted)
        elif rand1 >= prob and rand2 <= self.error_swap and len(
                noised_sent) > 1:
            prev_tok = noised_sent.pop()
            noised_sent.append(tok.text)
            noised_sent.append(prev_tok)
        # Determiners/Articles
        elif rand1 >= prob and tok.tag_ == self.pos_det:
            if tok.text.lower() in self.determiner_list:
                # Replace with a/an/the, keep, or drop; the token is never
                # dropped when it would be the first one in the output.
                if rand2 <= 0.15:
                    noised_sent.append('a')
                elif rand2 <= 0.30:
                    noised_sent.append('an')
                elif rand2 <= 0.45:
                    noised_sent.append('the')
                elif rand2 <= 0.80 or len(noised_sent) == 0:
                    noised_sent.append(tok.text)
                elif len(noised_sent) > 0:
                    pass
            else:
                # Determiner-tagged token outside determiner_list:
                # pluralize, keep, or drop (never drop as first output).
                if rand2 <= 0.35:
                    noised_sent.append(self.pluralize(tok.text))
                elif rand2 <= 0.85 or len(noised_sent) == 0:
                    noised_sent.append(tok.text)
                elif len(noised_sent) > 0:
                    pass
        # Prepositions
        elif rand1 >= prob and tok.tag_ == self.pos_prep and tok.text.lower(
        ) in self.prep_list:
            # Replace with a fixed or random preposition, or drop it.
            if rand2 <= 0.10:
                noised_sent.append('in')
            elif rand2 <= 0.20:
                noised_sent.append('on')
            elif rand2 <= 0.30:
                noised_sent.append('to')
            elif rand2 <= 0.40:
                noised_sent.append('for')
            elif rand2 <= 0.80:
                noised_sent.append(random.sample(self.prep_list, 1)[0])
            else:
                pass
        # Nouns
        elif rand1 >= prob and tok.tag_ in self.pos_noun:
            if rand2 <= 0.45:
                noised_sent.append(self.singularize_noun(tok.text))
            elif rand2 <= 0.80:
                noised_sent.append(self.pluralize(tok.text))
            elif rand2 <= 0.90:
                synonyms = self.synonyms_noun(tok.text)
                # Fall back to the token itself when no synonym is found.
                if not len(synonyms):
                    synonyms = [tok.text]
                noised_sent.append(random.sample(synonyms, 1)[0])
            else:
                noised_sent.append(tok.text)
        # Verbs
        elif rand1 >= prob and tok.tag_ in self.pos_verb:
            if rand2 <= 0.20:
                noised_sent.append(tok.lemma_)
            elif rand2 <= 0.45:
                noised_sent.append(self.pluralize_verb(tok.text))
            elif rand2 <= 0.75:
                noised_sent.append(self.present_participle(tok.text))
            elif rand2 <= 0.90:
                synonyms = self.synonyms_verb(tok.text)
                # Fall back to the token itself when no synonym is found.
                if not len(synonyms):
                    synonyms = [tok.text]
                noised_sent.append(random.sample(synonyms, 1)[0])
            else:
                noised_sent.append(tok.text)
        # Adverbs
        elif rand1 >= prob and tok.tag_ in self.pos_adv:
            if rand2 <= 0.35:
                synonyms = self.synonyms_adv(tok.text)
                # Fall back to the token itself when no synonym is found.
                if not len(synonyms):
                    synonyms = [tok.text]
                noised_sent.append(random.sample(synonyms, 1)[0])
            else:
                noised_sent.append(tok.text)
        # Adjectives
        elif rand1 >= prob and tok.tag_ in self.pos_adj:
            if rand2 <= 0.40:
                noised_sent.append(self.pluralize_adj(tok.text))
            elif rand2 <= 0.60:
                synonyms = self.synonyms_adj(tok.text)
                # Fall back to the token itself when no synonym is found.
                if not len(synonyms):
                    synonyms = [tok.text]
                noised_sent.append(random.sample(synonyms, 1)[0])
            elif rand2 <= 0.95:
                noised_sent.append(tok.text)
            else:
                pass
        # Punctuation
        elif rand1 >= prob and tok.tag_ in self.punc_list:
            # Keep, replace with a random punctuation mark, or drop.
            if rand2 <= 0.60:
                noised_sent.append(tok.text)
            elif rand2 <= 0.80:
                noised_sent.append(random.sample(self.punc_list, 1)[0])
            else:
                pass
        # After exhausting other schemes double-up for Orthographic errors
        elif rand1 >= prob and (rand2 / 2) <= self.error_typo and len(
                tok.text) > 3:
            typo = self.word_noiser.noise_word(tok.text)
            noised_sent.append(typo)
        else:
            # NOTE(review): this branch is also reached whenever
            # rand1 < prob, so the 'UNK POS' message is logged for every
            # token left untouched, not only for unrecognized tags.
            noised_sent.append(tok.text)
            log.debug('UNK POS: ' + tok.tag_)

        # Add redundant punctuation
        # (reuses the same rand1/rand2 that were drawn for this token)
        if rand1 >= prob and tok.tag_ not in self.punc_list:
            if rand2 <= 0.01:
                noised_sent.append(random.sample(self.punc_list, 1)[0])

    # Falsy entries (e.g. empty strings) are filtered out before detok.
    return self.detok([t for t in noised_sent if t])