Beispiel #1
0
    def _get_aug_idxes(self, tokens, aug_min, aug_max, aug_p, mode):
        """Select the positions in ``tokens`` that will be augmented.

        Args:
            tokens: Indexable sequence supporting ``len``; each element is
                length-checked and looked up in ``self.stopwords`` in word
                mode.
            aug_min: Forwarded to ``self._generate_aug_cnt``.
            aug_max: Forwarded to ``self._generate_aug_cnt``.
            aug_p: Forwarded to ``self._generate_aug_cnt``.
            mode: Compared against ``Method.WORD`` and ``Method.CHAR``; any
                other value leaves every position eligible.

        Returns:
            Whatever ``self.sample`` returns for the eligible positions, or
            ``None`` when nothing is eligible (or, in character mode, when
            ``tokens`` is shorter than ``self.min_char``).
        """
        # Character mode only: too-short input is never augmented.
        if mode == Method.CHAR and len(tokens) < self.min_char:
            return None

        target_cnt = self._generate_aug_cnt(
            len(tokens), aug_min, aug_max, aug_p)

        candidates = list(range(len(tokens)))
        if mode == Method.WORD:
            # Drop stopwords (when a stopword collection is configured) and
            # tokens shorter than the minimum length.
            candidates = [
                pos for pos in candidates
                if (self.stopwords is None
                    or tokens[pos] not in self.stopwords)
                and len(tokens[pos]) >= self.min_char
            ]
        elif mode == Method.CHAR:
            candidates = self.skip_aug(candidates, tokens)

        if len(candidates) == 0:
            if self.verbose > 0:
                WarningException(
                    name=WarningName.OUT_OF_VOCABULARY,
                    code=WarningCode.WARNING_CODE_002,
                    msg=WarningMessage.NO_WORD).output()
            return None

        # Never ask for more positions than are eligible.
        return self.sample(candidates, min(target_cnt, len(candidates)))
Beispiel #2
0
 def _get_aug_idxes(self, tokens):
     """Select the positions in ``tokens`` that will be augmented.

     Candidates come from ``self.pre_skip_aug`` and are then narrowed by
     ``self.skip_aug``.

     Returns:
         Whatever ``self.sample`` returns for the eligible positions, or
         ``None`` when no position is eligible.
     """
     target_cnt = self.generate_aug_cnt(len(tokens))
     eligible = self.skip_aug(
         self.pre_skip_aug(tokens, tuple_idx=0), tokens)

     if len(eligible) == 0:
         if self.verbose > 0:
             WarningException(
                 name=WarningName.OUT_OF_VOCABULARY,
                 code=WarningCode.WARNING_CODE_002,
                 msg=WarningMessage.NO_WORD).output()
         return None

     # Never ask for more positions than are eligible.
     return self.sample(eligible, min(target_cnt, len(eligible)))
Beispiel #3
0
    def _validate_augment(cls, data):
        """Check that ``data`` is non-empty.

        Args:
            data: Object supporting ``len``, or ``None``.

        Returns:
            An empty list when ``data`` has at least one element; otherwise
            a one-element list holding a ``WarningException`` built with
            ``WarningMessage.LENGTH_IS_ZERO``.
        """
        has_content = data is not None and len(data) > 0
        if has_content:
            return []

        empty_input_warning = WarningException(
            name=WarningName.INPUT_VALIDATION_WARNING,
            code=WarningCode.WARNING_CODE_001,
            msg=WarningMessage.LENGTH_IS_ZERO)
        return [empty_input_warning]
Beispiel #4
0
    def _get_aug_idxes(self, tokens):
        """Select the positions in ``tokens`` to augment, weighted by TF-IDF.

        Eligible positions are those whose token is not in
        ``self.stopwords`` (when set) and that survive ``self.skip_aug``.
        Each eligible position is accepted when ``self.prob()`` falls below
        the value ``self.model.cal_tfidf`` produced for it. Up to three
        passes are made; any shortfall is filled by ``self.sample`` from the
        positions not yet picked.

        Args:
            tokens: Sequence of tokens.

        Returns:
            Whatever ``self.sample`` returns for the picked positions
            (``aug_cnt`` of them), or ``None`` when no position is eligible.
        """
        aug_cnt = self.generate_aug_cnt(len(tokens))
        word_idxes = [
            i for i, t in enumerate(tokens)
            if self.stopwords is None or t not in self.stopwords
        ]
        word_idxes = self.skip_aug(word_idxes, tokens)

        if len(word_idxes) == 0:
            if self.verbose > 0:
                exception = WarningException(
                    name=WarningName.OUT_OF_VOCABULARY,
                    code=WarningCode.WARNING_CODE_002,
                    msg=WarningMessage.NO_WORD)
                exception.output()
            return None
        if len(word_idxes) < aug_cnt:
            aug_cnt = len(word_idxes)

        aug_probs = self.model.cal_tfidf(word_idxes, tokens)

        # Keep every index glued to its own probability. Pairing the two
        # sequences positionally after removing picked indexes would shift
        # the alignment and test an index against another token's score.
        # NOTE(review): assumes cal_tfidf returns one value per entry of
        # word_idxes, in the same order -- confirm against the model.
        remaining = list(zip(word_idxes, aug_probs))
        aug_idxes = []

        # It is possible that no token is picked, so re-try a few times.
        retry_cnt = 3
        for _ in range(retry_cnt):
            if len(aug_idxes) >= aug_cnt:
                break

            # Build the not-yet-picked list afresh instead of removing from
            # the list being iterated, which would skip elements.
            not_picked = []
            for idx, p in remaining:
                if len(aug_idxes) < aug_cnt and self.prob() < p:
                    aug_idxes.append(idx)
                else:
                    not_picked.append((idx, p))
            remaining = not_picked

        # If still short, pick the rest at random regardless of probability.
        if len(aug_idxes) < aug_cnt:
            leftover = [idx for idx, _ in remaining]
            aug_idxes.extend(
                self.sample(leftover, aug_cnt - len(aug_idxes)))

        aug_idxes = self.sample(aug_idxes, aug_cnt)

        return aug_idxes
Beispiel #5
0
    def _get_aug_idxes(self, tokens, aug_min, aug_max, aug_p, mode):
        """Select the positions in ``tokens`` that will be augmented.

        Args:
            tokens: Sequence of tokens supporting ``len``.
            aug_min: Forwarded to ``self._generate_aug_cnt``.
            aug_max: Forwarded to ``self._generate_aug_cnt``.
            aug_p: Forwarded to ``self._generate_aug_cnt``.
            mode: ``Method.WORD`` takes candidates from
                ``self.pre_skip_aug``; ``Method.CHAR`` starts from every
                position and filters through ``self.skip_aug``.

        Returns:
            Whatever ``self.sample`` returns for the eligible positions, or
            ``None`` when no position is eligible.
        """
        target_cnt = self._generate_aug_cnt(
            len(tokens), aug_min, aug_max, aug_p)

        # NOTE(review): there is no branch for any other mode, so
        # ``candidates`` stays unbound and the check below raises.
        if mode == Method.WORD:
            candidates = self.pre_skip_aug(tokens)
        elif mode == Method.CHAR:
            candidates = self.skip_aug(list(range(len(tokens))), tokens)

        if len(candidates) == 0:
            if self.verbose > 0:
                WarningException(
                    name=WarningName.OUT_OF_VOCABULARY,
                    code=WarningCode.WARNING_CODE_002,
                    msg=WarningMessage.NO_WORD).output()
            return None

        # Never ask for more positions than are eligible.
        return self.sample(candidates, min(target_cnt, len(candidates)))
Beispiel #6
0
    def _get_aug_idxes(self, tokens):
        """Pair augmentable positions with a replacement candidate.

        Each entry of ``tokens`` is indexed as ``[0]`` (the word passed to
        ``self.model.predict``) and ``[1]`` (the tag passed to
        ``PartOfSpeech.constituent2pos``). Positions for which the model
        offers no candidate other than the word itself are dropped.

        Returns:
            Whatever ``self.sample`` returns for the collected
            ``(position, candidate)`` tuples, or ``None`` when no position
            survives ``self.pre_skip_aug`` / ``self.skip_aug``.
        """
        target_cnt = self.generate_aug_cnt(len(tokens))
        eligible = self.skip_aug(
            self.pre_skip_aug(tokens, tuple_idx=0), tokens)

        if len(eligible) == 0:
            if self.verbose > 0:
                WarningException(
                    name=WarningName.OUT_OF_VOCABULARY,
                    code=WarningCode.WARNING_CODE_002,
                    msg=WarningMessage.NO_WORD).output()
            return None

        replacements = []
        for pos in eligible:
            pos_tags = PartOfSpeech.constituent2pos(tokens[pos][1])
            word = tokens[pos][0]

            if pos_tags is not None and len(pos_tags) > 0:
                predicted = [
                    cand
                    for tag in pos_tags
                    for cand in self.model.predict(word, pos=tag)
                ]
            else:
                # No usable part-of-speech mapping: query without a filter.
                predicted = list(self.model.predict(word))

            # A candidate equal to the word itself (ignoring case) is not a
            # real replacement.
            usable = [
                cand for cand in predicted
                if cand.lower() != word.lower()
            ]
            if len(usable) == 0:
                continue

            replacements.append((pos, self.sample(usable, 1)[0]))

        # Never ask for more pairs than were collected.
        return self.sample(replacements, min(target_cnt, len(replacements)))