Example #1
0
 def present_participle(self, word):
     """
     Return the present-participle form of ``word``.

     The conversion is delegated to ``self.inflect.present_participle``.
     ``word`` itself is returned when that call yields a falsy value or
     raises; in the latter case the error text is logged at debug level.
     """
     result = word
     try:
         candidate = self.inflect.present_participle(word)
         if candidate:
             result = candidate
     except Exception as err:
         detail = getattr(err, 'message', str(err))
         log.debug(detail)
     return result
Example #2
0
 def pluralize_adj(self, word):
     """
     Return the plural adjective form of ``word``.

     The conversion is delegated to ``self.inflect.plural_adj``. ``word``
     itself is returned when that call yields a falsy value or raises; in
     the latter case the error text is logged at debug level.
     """
     result = word
     try:
         candidate = self.inflect.plural_adj(word)
         if candidate:
             result = candidate
     except Exception as err:
         detail = getattr(err, 'message', str(err))
         log.debug(detail)
     return result
Example #3
0
 def singularize_noun(self, word):
     """
     Return the singular form of the noun ``word``.

     The conversion is delegated to ``self.inflect.singular_noun``. ``word``
     itself is returned when that call yields a falsy value or raises; in
     the latter case the error text is logged at debug level.
     """
     result = word
     try:
         candidate = self.inflect.singular_noun(word)
         if candidate:
             result = candidate
     except Exception as err:
         detail = getattr(err, 'message', str(err))
         log.debug(detail)
     return result
Example #4
0
    def repeat_char(self, word):
        """
        Randomly repeat a letter in a word.

        Args:
            word: the string to modify.

        Returns:
            ``word`` with the character at the position returned by
            ``utils.rand_index(word)`` duplicated in place. An empty
            ``word`` is returned unchanged.
        """

        if not word:
            return word  # nothing to repeat; avoids indexing an empty string

        pos = utils.rand_index(word)
        uword = word[:pos] + word[pos] + word[pos:]
        # Lazy logging arguments: the previous message combined str.format
        # with a stray "%s" placeholder that was emitted literally.
        log.debug("[repeat char] %s -> %s", word, uword)
        return uword
Example #5
0
    def add_char(self, word):
        """
        Randomly add a vowel or a letter from a word.

        Args:
            word: the string to modify.

        Returns:
            ``word`` with one extra character inserted at the position
            returned by ``utils.rand_index(word)``. The inserted character
            is drawn from the letters of ``word`` plus ``'aeiou'``.
        """

        pos = utils.rand_index(word)
        # random.choice indexes any sequence, so the string is used directly.
        addition = random.choice(word + 'aeiou')
        uword = word[:pos] + addition + word[pos:]
        # Lazy logging arguments: the previous message combined str.format
        # with a stray "%s" placeholder that was emitted literally.
        log.debug("[add char] %s -> %s", word, uword)
        return uword
Example #6
0
    def repeat_char(self, word):
        """
        Randomly repeat a letter in a word.

        Args:
            word: the string to modify.

        Returns:
            ``word`` with the character at the position returned by
            ``utils.rand_index(word)`` duplicated in place. An empty
            ``word`` is returned unchanged.
        """

        if not word:
            return word  # nothing to repeat; avoids indexing an empty string

        log.debug("repeat change---before: %s", word)
        pos = utils.rand_index(word)
        uword = word[:pos] + word[pos] + word[pos:]
        log.debug("repeat change---after: %s", uword)
        return uword
Example #7
0
    def swap_chars(self, word):
        """
        Randomly swap letters between two positions in a word.

        Two positions are drawn independently with ``utils.rand_index``.
        When both draws land on the same position the word comes back
        unchanged.

        Args:
            word: the string to modify.

        Returns:
            ``word`` with the characters at the two drawn positions
            exchanged. Words shorter than two characters are returned
            unchanged.
        """

        if len(word) < 2:
            return word  # nothing to swap with

        i, j = utils.rand_index(word), utils.rand_index(word)
        chars = list(word)
        # Tuple assignment is a no-op when i == j. The previous slice
        # arithmetic duplicated the character in that case
        # (e.g. "abc" with i == j == 1 became "abbc").
        chars[i], chars[j] = chars[j], chars[i]
        uword = "".join(chars)
        # Lazy logging arguments: the previous message combined str.format
        # with a stray "%s" placeholder that was emitted literally.
        log.debug("[swap char] %s -> %s", word, uword)
        return uword
Example #8
0
    def swap_char(self, word):
        """
        Randomly swap two neighboring letters in a word.

        Args:
            word: the string to modify.

        Returns:
            ``word`` with the character at the position returned by
            ``utils.rand_index(word)`` exchanged with its left neighbour,
            or with its right neighbour when that position is 0. Words
            shorter than two characters are returned unchanged.
        """

        if len(word) < 2:
            # No neighbour exists; the pos == 0 case below would index
            # word[1] and raise IndexError.
            return word

        log.debug("swap change---before: %s", word)
        pos = utils.rand_index(word)
        i, j = (pos - 1, pos) if pos > 0 else (pos, pos + 1)
        uword = word[:i] + word[j] + word[i] + word[j + 1:]
        log.debug("swap change---after: %s", uword)
        return uword
Example #9
0
    def add_char(self, word):
        """
        Insert one random character into a word.

        The character is drawn from the letters of ``word`` plus the vowels
        ``'aeiou'`` and is inserted at the position returned by
        ``utils.rand_index(word)``. The word is logged at debug level before
        and after the change.
        """

        log.debug("addition change---before: %s" % word)
        insert_at = utils.rand_index(word)
        candidates = word + 'aeiou'
        extra = random.choice(candidates)
        noised = "".join((word[:insert_at], extra, word[insert_at:]))
        log.debug("addition change---after: %s" % noised)
        return noised
Example #10
0
    def drop_char(self, word):
        """
        Drop char in a word.
        TODO test [vowel or double] only.

        Args:
            word: the string to modify.

        Returns:
            ``word`` without the character at the position returned by
            ``utils.rand_index(word)``; position 0 is bumped to 1 so the
            first letter is never dropped. Words shorter than two
            characters are returned unchanged.
        """

        if len(word) < 2:
            return word  # only the protected first letter (or nothing) exists

        pos = utils.rand_index(word)
        if pos == 0:
            pos += 1  # don't drop first letter of a word
        uword = word[:pos] + word[pos + 1:]
        # Lazy logging arguments: the previous message combined str.format
        # with a stray "%s" placeholder that was emitted literally.
        log.debug("[drop char] %s -> %s", word, uword)
        return uword
Example #11
0
    def flip_char(self, word):
        """
        Randomly flip letters between two positions in a word.

        Two positions are drawn independently with ``utils.rand_index``.
        When both draws land on the same position the word comes back
        unchanged.

        Args:
            word: the string to modify.

        Returns:
            ``word`` with the characters at the two drawn positions
            exchanged. Words shorter than two characters are returned
            unchanged.
        """

        if len(word) < 2:
            return word  # nothing to flip with

        log.debug("flip change---before: %s", word)
        i, j = utils.rand_index(word), utils.rand_index(word)
        chars = list(word)
        # Tuple assignment is a no-op when i == j. The previous slice
        # arithmetic duplicated the character in that case
        # (e.g. "abc" with i == j == 1 became "abbc").
        chars[i], chars[j] = chars[j], chars[i]
        uword = "".join(chars)
        log.debug("flip change---after: %s", uword)
        return uword
Example #12
0
    def drop_char(self, word):
        """
        Drop char in a word.
        TODO test [vowel or double] only.

        Args:
            word: the string to modify.

        Returns:
            ``word`` without the character at the position returned by
            ``utils.rand_index(word)``; position 0 is bumped to 1 so the
            first letter is never dropped. Words shorter than two
            characters are returned unchanged.
        """

        if len(word) < 2:
            return word  # only the protected first letter (or nothing) exists

        log.debug("drop change---before: %s", word)
        pos = utils.rand_index(word)
        if pos == 0:
            pos += 1  # don't drop first letter of a word
        uword = word[:pos] + word[pos + 1:]
        # Log the result; the previous code logged the untouched input here.
        log.debug("drop change---after: %s", uword)
        return uword
Example #13
0
    def flip_chars(self, word):
        """
        Flip chars based on the following schemes:

            1. ie, ou, iou, ae, ea
            2. gh, th, ng
            3. two, chars, random

        The schemes are tried in that order through
        ``self.flip_vowel_pairs``, ``self.flip_consonant_pairs`` and
        ``self.flip_rand_pairs``; the first truthy result wins.

        Args:
            word: the string to modify.

        Returns:
            The first truthy result among the three helpers, or whatever
            ``self.flip_rand_pairs`` returned when the first two were falsy.
        """

        # `or` short-circuits, so later helpers run only if earlier ones
        # produced a falsy result.
        uword = (self.flip_vowel_pairs(word)
                 or self.flip_consonant_pairs(word)
                 or self.flip_rand_pairs(word))

        # Lazy logging arguments: the previous message combined str.format
        # with a stray "%s" placeholder that was emitted literally.
        log.debug("[flip char] %s -> %s", word, uword)
        return uword
Example #14
0
    def _inject_noise(self, parsed_sent):
        """
        Inject errors according to an overall rate.

        Every token draws two random numbers. ``rand1`` decides whether the
        token may be noised at all (it must reach
        ``1 - self.config.error_rate_overall``); ``rand2`` picks the concrete
        outcome inside whichever branch matches first. All noise branches
        form one ``if``/``elif`` chain, so their order is significant: a
        token claimed by an earlier branch never reaches a later one.

        Args:
            parsed_sent: iterable of tokens exposing ``text``, ``tag_`` and
                ``lemma_`` (presumably a spaCy ``Doc`` -- confirm against
                the caller).

        Returns:
            str: noised sentence (the result of ``self.detok`` applied to
            the collected, non-empty output tokens)
        """
        noised_sent = []
        # Threshold rand1 has to reach before any noise branch can fire.
        prob = 1. - self.config.error_rate_overall

        for tok in parsed_sent:
            # rand1 gates whether the token is noised, rand2 selects how.
            rand1, rand2 = random.random(), random.random()

            # Protected tokens are always copied through unchanged.
            if tok.text.lower() in self.protected_tokens:
                noised_sent.append(tok.text)
                continue

            # Orthographic errors (only for tokens longer than 4 characters)
            if rand1 >= prob and rand2 <= self.error_typo and len(
                    tok.text) > 4:
                typo = self.word_noiser.noise_word(tok.text)
                noised_sent.append(typo)

            # Swap current and previous words
            # (requires at least two tokens already in the output)
            elif rand1 >= prob and rand2 <= self.error_swap and len(
                    noised_sent) > 1:
                prev_tok = noised_sent.pop()
                noised_sent.append(tok.text)
                noised_sent.append(prev_tok)

            # Determiners/Articles
            elif rand1 >= prob and tok.tag_ == self.pos_det:
                if tok.text.lower() in self.determiner_list:
                    # rand2 bands: <=0.15 'a', <=0.30 'an', <=0.45 'the',
                    # <=0.80 keep the original; above that the determiner is
                    # dropped unless it would be the first output token.
                    if rand2 <= 0.15:
                        noised_sent.append('a')
                    elif rand2 <= 0.30:
                        noised_sent.append('an')
                    elif rand2 <= 0.45:
                        noised_sent.append('the')
                    elif rand2 <= 0.80 or len(noised_sent) == 0:
                        noised_sent.append(tok.text)
                    elif len(noised_sent) > 0:
                        # Nothing is appended: the determiner is dropped.
                        pass
                else:
                    # Determiner-tagged token outside determiner_list:
                    # <=0.35 pluralize, <=0.85 keep, otherwise drop unless it
                    # would be the first output token.
                    if rand2 <= 0.35:
                        noised_sent.append(self.pluralize(tok.text))
                    elif rand2 <= 0.85 or len(noised_sent) == 0:
                        noised_sent.append(tok.text)
                    elif len(noised_sent) > 0:
                        # Nothing is appended: the token is dropped.
                        pass

            # Prepositions
            # No band re-appends tok.text explicitly: the token is replaced
            # by a fixed preposition, by a random entry of prep_list, or
            # dropped.
            elif rand1 >= prob and tok.tag_ == self.pos_prep and tok.text.lower(
            ) in self.prep_list:
                if rand2 <= 0.10:
                    noised_sent.append('in')
                elif rand2 <= 0.20:
                    noised_sent.append('on')
                elif rand2 <= 0.30:
                    noised_sent.append('to')
                elif rand2 <= 0.40:
                    noised_sent.append('for')
                elif rand2 <= 0.80:
                    noised_sent.append(random.sample(self.prep_list, 1)[0])
                else:
                    # Nothing is appended: the preposition is dropped.
                    pass

            # Nouns
            # rand2 bands: <=0.45 singularize, <=0.80 pluralize,
            # <=0.90 synonym, otherwise keep.
            elif rand1 >= prob and tok.tag_ in self.pos_noun:
                if rand2 <= 0.45:
                    noised_sent.append(self.singularize_noun(tok.text))
                elif rand2 <= 0.80:
                    noised_sent.append(self.pluralize(tok.text))
                elif rand2 <= 0.90:
                    synonyms = self.synonyms_noun(tok.text)
                    # Fall back to the token itself when no synonym exists.
                    if not len(synonyms):
                        synonyms = [tok.text]
                    noised_sent.append(random.sample(synonyms, 1)[0])
                else:
                    noised_sent.append(tok.text)

            # Verbs
            # rand2 bands: <=0.20 lemma, <=0.45 pluralize_verb,
            # <=0.75 present participle, <=0.90 synonym, otherwise keep.
            elif rand1 >= prob and tok.tag_ in self.pos_verb:
                if rand2 <= 0.20:
                    noised_sent.append(tok.lemma_)
                elif rand2 <= 0.45:
                    noised_sent.append(self.pluralize_verb(tok.text))
                elif rand2 <= 0.75:
                    noised_sent.append(self.present_participle(tok.text))
                elif rand2 <= 0.90:
                    synonyms = self.synonyms_verb(tok.text)
                    # Fall back to the token itself when no synonym exists.
                    if not len(synonyms):
                        synonyms = [tok.text]
                    noised_sent.append(random.sample(synonyms, 1)[0])
                else:
                    noised_sent.append(tok.text)

            # Adverbs
            # rand2 bands: <=0.35 synonym, otherwise keep.
            elif rand1 >= prob and tok.tag_ in self.pos_adv:
                if rand2 <= 0.35:
                    synonyms = self.synonyms_adv(tok.text)
                    # Fall back to the token itself when no synonym exists.
                    if not len(synonyms):
                        synonyms = [tok.text]
                    noised_sent.append(random.sample(synonyms, 1)[0])
                else:
                    noised_sent.append(tok.text)

            # Adjectives
            # rand2 bands: <=0.40 pluralize_adj, <=0.60 synonym,
            # <=0.95 keep, otherwise drop.
            elif rand1 >= prob and tok.tag_ in self.pos_adj:
                if rand2 <= 0.40:
                    noised_sent.append(self.pluralize_adj(tok.text))
                elif rand2 <= 0.60:
                    synonyms = self.synonyms_adj(tok.text)
                    # Fall back to the token itself when no synonym exists.
                    if not len(synonyms):
                        synonyms = [tok.text]
                    noised_sent.append(random.sample(synonyms, 1)[0])
                elif rand2 <= 0.95:
                    noised_sent.append(tok.text)
                else:
                    # Nothing is appended: the adjective is dropped.
                    pass

            # Punctuation
            # Matched by comparing the token's tag (not its text) against
            # punc_list. rand2 bands: <=0.60 keep, <=0.80 replace with a
            # random punc_list entry, otherwise drop.
            elif rand1 >= prob and tok.tag_ in self.punc_list:
                if rand2 <= 0.60:
                    noised_sent.append(tok.text)
                elif rand2 <= 0.80:
                    noised_sent.append(random.sample(self.punc_list, 1)[0])
                else:
                    # Nothing is appended: the punctuation is dropped.
                    pass

            # After exhausting other schemes double-up for Orthographic errors
            # (rand2 is halved and the length floor lowered to > 3)
            elif rand1 >= prob and (rand2 / 2) <= self.error_typo and len(
                    tok.text) > 3:
                typo = self.word_noiser.noise_word(tok.text)
                noised_sent.append(typo)

            else:
                # Reached by every token no branch above claimed, including
                # all tokens with rand1 < prob, so the 'UNK POS' message is
                # also logged for tokens that were simply not selected.
                noised_sent.append(tok.text)
                log.debug('UNK POS: ' + tok.tag_)

            # Add redundant punctuation
            # Reuses the same rand1/rand2 draws; runs after whichever branch
            # handled the token above.
            if rand1 >= prob and tok.tag_ not in self.punc_list:
                if rand2 <= 0.01:
                    noised_sent.append(random.sample(self.punc_list, 1)[0])

        # Falsy entries (e.g. empty strings) are filtered out before
        # detokenizing.
        return self.detok([t for t in noised_sent if t])