def _get_aug_idxes(self, tokens, aug_min, aug_max, aug_p, mode):
    """Choose which positions of ``tokens`` will be augmented.

    Args:
        tokens: Sized, indexable sequence of items to choose from.
            NOTE(review): in ``Method.CHAR`` mode this looks like the
            characters of a single word -- confirm against the caller.
        aug_min: Forwarded unchanged to ``self._generate_aug_cnt``.
        aug_max: Forwarded unchanged to ``self._generate_aug_cnt``.
        aug_p: Forwarded unchanged to ``self._generate_aug_cnt``.
        mode: ``Method.WORD`` or ``Method.CHAR``. Any other value leaves
            the candidate list unfiltered.

    Returns:
        Whatever ``self.sample`` yields for the surviving positions, or
        ``None`` when the input is too short (CHAR mode) or when no
        position survives filtering.
    """
    char_mode = mode == Method.CHAR

    # In CHAR mode, an input shorter than ``min_char`` is left untouched.
    if char_mode and len(tokens) < self.min_char:
        return None

    target_cnt = self._generate_aug_cnt(len(tokens), aug_min, aug_max, aug_p)
    candidates = list(range(len(tokens)))

    if mode == Method.WORD:
        # Keep a position only if its token is not a stopword and is at
        # least ``min_char`` long.
        candidates = [
            pos
            for pos in candidates
            if (self.stopwords is None or tokens[pos] not in self.stopwords)
            and len(tokens[pos]) >= self.min_char
        ]
    elif char_mode:
        candidates = self.skip_aug(candidates, tokens)

    if len(candidates) > 0:
        # Never ask for more positions than are available.
        return self.sample(candidates, min(target_cnt, len(candidates)))

    # Nothing left to augment: optionally emit a warning, then give up.
    if self.verbose > 0:
        WarningException(
            name=WarningName.OUT_OF_VOCABULARY,
            code=WarningCode.WARNING_CODE_002,
            msg=WarningMessage.NO_WORD,
        ).output()
    return None
def _get_aug_idxes(self, tokens):
    """Choose which positions of ``tokens`` will be augmented.

    Candidate positions come from ``self.pre_skip_aug`` and are then
    narrowed by ``self.skip_aug``. The number requested from
    ``self.sample`` is the count from ``self.generate_aug_cnt``, capped at
    the number of surviving candidates.

    Args:
        tokens: Sized sequence of tokens.
            NOTE(review): ``tuple_idx=0`` suggests each token is a tuple
            whose first element is the word -- confirm against the caller.

    Returns:
        The result of ``self.sample`` over the surviving positions, or
        ``None`` when no position survives filtering.
    """
    target_cnt = self.generate_aug_cnt(len(tokens))
    candidates = self.skip_aug(self.pre_skip_aug(tokens, tuple_idx=0), tokens)

    if len(candidates) > 0:
        # Never ask for more positions than are available.
        return self.sample(candidates, min(target_cnt, len(candidates)))

    # Nothing left to augment: optionally emit a warning, then give up.
    if self.verbose > 0:
        WarningException(
            name=WarningName.OUT_OF_VOCABULARY,
            code=WarningCode.WARNING_CODE_002,
            msg=WarningMessage.NO_WORD,
        ).output()
    return None
def _validate_augment(cls, data):
    """Check that ``data`` holds something to augment.

    Args:
        data: Input to validate; may be ``None`` or any sized object.

    Returns:
        An empty list when ``data`` has at least one element. Otherwise a
        one-element list holding a ``WarningException`` that describes
        the zero-length input.
    """
    has_content = data is not None and len(data) > 0
    if has_content:
        return []

    return [
        WarningException(
            name=WarningName.INPUT_VALIDATION_WARNING,
            code=WarningCode.WARNING_CODE_001,
            msg=WarningMessage.LENGTH_IS_ZERO,
        )
    ]
def _get_aug_idxes(self, tokens):
    """Choose token positions to augment, weighted by TF-IDF score.

    Positions whose token is a stopword are dropped, then
    ``self.skip_aug`` narrows the list further. Each remaining position is
    kept when a draw from ``self.prob()`` falls below its score from
    ``self.model.cal_tfidf``. Up to three passes are made over the
    positions not yet picked; if too few are picked after that, the
    shortfall is filled by ``self.sample`` over the leftovers.

    Args:
        tokens: Sequence of tokens.

    Returns:
        The result of ``self.sample`` over the picked positions (at most
        the count from ``self.generate_aug_cnt``), or ``None`` when no
        position survives filtering.
    """
    aug_cnt = self.generate_aug_cnt(len(tokens))
    word_idxes = [
        i
        for i, t in enumerate(tokens)
        if self.stopwords is None or t not in self.stopwords
    ]
    word_idxes = self.skip_aug(word_idxes, tokens)

    if len(word_idxes) == 0:
        if self.verbose > 0:
            exception = WarningException(
                name=WarningName.OUT_OF_VOCABULARY,
                code=WarningCode.WARNING_CODE_002,
                msg=WarningMessage.NO_WORD)
            exception.output()
        return None

    if len(word_idxes) < aug_cnt:
        aug_cnt = len(word_idxes)

    # Assumed to hold one score per entry of ``word_idxes``, in the same
    # order (the two sequences are paired positionally) -- TODO confirm.
    aug_probs = self.model.cal_tfidf(word_idxes, tokens)

    # Keep every position glued to its own score. The earlier version
    # removed items from the list it was iterating with ``zip``, which
    # skipped candidates and compared positions against the wrong score.
    remaining = list(zip(word_idxes, aug_probs))
    aug_idxes = []

    # It is possible that no token is picked in a pass, so retry.
    retry_cnt = 3
    for _ in range(retry_cnt):
        # Stop retrying as soon as enough positions have been picked.
        if len(aug_idxes) >= aug_cnt:
            break
        not_picked = []
        for idx, p in remaining:
            if self.prob() < p:
                aug_idxes.append(idx)
            else:
                not_picked.append((idx, p))
        remaining = not_picked

    # Still short: fill the gap at random, regardless of probability.
    if len(aug_idxes) < aug_cnt:
        possible_idxes = [idx for idx, _ in remaining]
        aug_idxes.extend(
            self.sample(possible_idxes, aug_cnt - len(aug_idxes)))

    # A full pass may pick more than needed; trim down to ``aug_cnt``.
    aug_idxes = self.sample(aug_idxes, aug_cnt)
    return aug_idxes
def _get_aug_idxes(self, tokens, aug_min, aug_max, aug_p, mode):
    """Choose which positions of ``tokens`` will be augmented.

    Args:
        tokens: Sized sequence of items to choose from.
        aug_min: Forwarded unchanged to ``self._generate_aug_cnt``.
        aug_max: Forwarded unchanged to ``self._generate_aug_cnt``.
        aug_p: Forwarded unchanged to ``self._generate_aug_cnt``.
        mode: ``Method.WORD`` (candidates come from ``self.pre_skip_aug``)
            or ``Method.CHAR`` (every position, narrowed by
            ``self.skip_aug``).

    Returns:
        The result of ``self.sample`` over the candidate positions, or
        ``None`` when there are no candidates.
    """
    target_cnt = self._generate_aug_cnt(len(tokens), aug_min, aug_max, aug_p)

    # NOTE(review): any other ``mode`` leaves ``candidates`` unbound and
    # the length check below raises -- confirm callers only pass WORD/CHAR.
    if mode == Method.WORD:
        candidates = self.pre_skip_aug(tokens)
    elif mode == Method.CHAR:
        candidates = self.skip_aug(list(range(len(tokens))), tokens)

    if len(candidates) > 0:
        # Never ask for more positions than are available.
        return self.sample(candidates, min(target_cnt, len(candidates)))

    # Nothing left to augment: optionally emit a warning, then give up.
    if self.verbose > 0:
        WarningException(
            name=WarningName.OUT_OF_VOCABULARY,
            code=WarningCode.WARNING_CODE_002,
            msg=WarningMessage.NO_WORD,
        ).output()
    return None
def _get_aug_idxes(self, tokens):
    """Choose positions to augment and a replacement word for each.

    For every candidate position, ``self.model.predict`` is queried for
    alternatives -- once per part of speech returned by
    ``PartOfSpeech.constituent2pos``, or once without a ``pos`` argument
    when that lookup yields nothing. Alternatives that equal the original
    word (case-insensitively) are discarded, and one of the rest is drawn
    with ``self.sample``. Positions with no usable alternative are dropped.

    Args:
        tokens: Sequence of indexable items; element ``[0]`` is used as
            the word and element ``[1]`` as the tag handed to
            ``PartOfSpeech.constituent2pos``.

    Returns:
        The result of ``self.sample`` over ``(position, replacement)``
        pairs, capped at the count from ``self.generate_aug_cnt``, or
        ``None`` when no position survives the skip filters.
    """
    target_cnt = self.generate_aug_cnt(len(tokens))
    positions = self.pre_skip_aug(tokens, tuple_idx=0)
    positions = self.skip_aug(positions, tokens)

    if len(positions) == 0:
        # Nothing left to augment: optionally emit a warning, then give up.
        if self.verbose > 0:
            WarningException(
                name=WarningName.OUT_OF_VOCABULARY,
                code=WarningCode.WARNING_CODE_002,
                msg=WarningMessage.NO_WORD,
            ).output()
        return None

    replacements = []
    for position in positions:
        pos_tags = PartOfSpeech.constituent2pos(tokens[position][1])
        word = tokens[position][0]

        if pos_tags is None or len(pos_tags) == 0:
            # No usable POS mapping: accept every word the model offers.
            predicted = list(self.model.predict(word))
        else:
            predicted = [
                option
                for pos_tag in pos_tags
                for option in self.model.predict(word, pos=pos_tag)
            ]

        # A "replacement" identical to the original word is useless.
        alternatives = [
            option for option in predicted if option.lower() != word.lower()
        ]
        if len(alternatives) > 0:
            replacements.append((position, self.sample(alternatives, 1)[0]))

    # Never ask for more pairs than were found.
    return self.sample(replacements, min(target_cnt, len(replacements)))