def substitute(self, text):
    results = []
    tokens = self.tokenizer(text)
    pos = nltk.pos_tag(tokens)

    aug_idxes = self._get_aug_idxes(pos)
    if aug_idxes is None:
        return text

    for i, token in enumerate(tokens):
        # Skip any word that was not selected for augmentation
        if i not in aug_idxes:
            results.append(token)
            continue

        word_poses = PartOfSpeech.pos2wn(pos[i][1])
        synsets = []
        if word_poses is None or len(word_poses) == 0:
            # No WordNet POS mapping for this tag, so fall back to
            # synsets across every part of speech
            synsets.extend(self.model.synsets(pos[i][0], lang=self.lang))
        else:
            for word_pos in word_poses:
                synsets.extend(self.model.synsets(pos[i][0], pos=word_pos, lang=self.lang))

        augmented_data = []
        for synset in synsets:
            candidates = []
            for lemma in synset.lemmas():
                if self.synonyms:
                    candidates.append(lemma.name())
                elif lemma.antonyms():
                    candidates.append(lemma.antonyms()[0].name())

            for candidate in candidates:
                if candidate.lower() != token.lower():
                    augmented_data.append(candidate)

        if len(augmented_data) == 0:
            results.append(token)
        else:
            candidate = self.sample(augmented_data, 1)[0]
            candidate = candidate.replace("_", " ").replace("-", " ").lower()
            results.append(self.align_capitalization(token, candidate))

    return self.reverse_tokenizer(results)
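# The lookup above relies on NLTK's WordNet interface. A minimal
# standalone sketch of the same candidate collection, assuming the
# 'wordnet' corpus is available (nltk.download('wordnet')); the helper
# name wordnet_candidates is ours, not part of the original code:

from nltk.corpus import wordnet

def wordnet_candidates(word, use_synonyms=True):
    """Collect synonym (or antonym) lemma names for a word from WordNet."""
    candidates = []
    for synset in wordnet.synsets(word):
        for lemma in synset.lemmas():
            if use_synonyms:
                candidates.append(lemma.name())
            elif lemma.antonyms():
                candidates.append(lemma.antonyms()[0].name())
    # Drop candidates that differ from the input only by case
    return [c for c in candidates if c.lower() != word.lower()]

# e.g. wordnet_candidates("happy") may include 'felicitous' and 'glad';
# wordnet_candidates("happy", use_synonyms=False) may return ['unhappy']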
def substitute(self, text):
    results = []
    tokens = self.tokenizer(text)
    pos = nltk.pos_tag(tokens)

    aug_cnt = self.generate_aug_cnt(len(tokens))
    word_idxes = list(range(len(tokens)))
    word_idxes = self.skip_aug(word_idxes, pos)
    aug_idxes = self.sample(word_idxes, aug_cnt)

    for i, token in enumerate(tokens):
        # Skip any word that was not selected for augmentation
        if i not in aug_idxes:
            results.append(token)
            continue

        word_poses = PartOfSpeech.pos2wn(pos[i][1])
        synsets = []
        if word_poses is None or len(word_poses) == 0:
            # No WordNet POS mapping for this tag, so fall back to
            # synsets across every part of speech
            synsets.extend(self.model.synsets(pos[i][0]))
        else:
            for word_pos in word_poses:
                synsets.extend(self.model.synsets(pos[i][0], pos=word_pos))

        augmented_data = []
        for synset in synsets:
            for candidate in synset.lemma_names():
                if candidate.lower() != token.lower():
                    augmented_data.append(candidate)

        if len(augmented_data) == 0:
            results.append(token)
        else:
            candidate = self.sample(augmented_data, 1)[0]
            results.append(self.align_capitalization(token, candidate))

    return self.reverse_tokenizer(results)
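# This variant chooses its own augmentation targets: tag every token,
# filter the indexes with skip_aug, then sample aug_cnt of the rest. A
# sketch of that selection under simple assumptions (keep only content
# words; select_aug_idxes and the tag filter are hypothetical stand-ins
# for the class's skip_aug/sample helpers):

import random
import nltk  # needs the 'averaged_perceptron_tagger' resource

def select_aug_idxes(tokens, aug_cnt):
    pos = nltk.pos_tag(tokens)
    # Keep nouns, verbs, adjectives and adverbs as candidates
    content_tags = ('NN', 'VB', 'JJ', 'RB')
    word_idxes = [i for i, (_, tag) in enumerate(pos)
                  if tag.startswith(content_tags)]
    return random.sample(word_idxes, min(aug_cnt, len(word_idxes)))

# e.g. select_aug_idxes(['The', 'quick', 'brown', 'fox'], 2) might
# return [1, 3], picking 'quick' and 'fox'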
def substitute(self, data):
    results = []
    tokens = self.tokenizer(data)
    pos = self.model.pos_tag(tokens)

    aug_idxes = self._get_aug_idxes(pos)
    if aug_idxes is None:
        return data

    for i, token in enumerate(tokens):
        # Skip any word that was not selected for augmentation
        if i not in aug_idxes:
            results.append(token)
            continue

        word_poses = PartOfSpeech.pos2wn(pos[i][1])
        candidates = []
        if word_poses is None or len(word_poses) == 0:
            # No POS mapping for this tag: query the model without a
            # POS constraint
            candidates.extend(self.model.predict(pos[i][0]))
        else:
            for word_pos in word_poses:
                candidates.extend(self.model.predict(pos[i][0], pos=word_pos))

        candidates = [c for c in candidates if c.lower() != token.lower()]

        if len(candidates) == 0:
            results.append(token)
        else:
            candidate = self.sample(candidates, 1)[0]
            candidate = candidate.replace("_", " ").replace("-", " ").lower()
            results.append(self.align_capitalization(token, candidate))

    return self.reverse_tokenizer(results)
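# The model-backed variant only assumes the model exposes pos_tag() and
# predict(word, pos=None). A dictionary-backed stand-in satisfying that
# interface (entirely hypothetical; a real model, e.g. a PPDB-style
# paraphrase table, would be loaded from disk):

import nltk

class DictSynonymModel:
    def __init__(self, table):
        # table maps (word, pos) -> list of candidate substitutions
        self.table = table

    def pos_tag(self, tokens):
        return nltk.pos_tag(tokens)

    def predict(self, word, pos=None):
        if pos is None:
            # No POS constraint: merge candidates across all entries
            return [c for (w, _), cands in self.table.items()
                    if w == word.lower() for c in cands]
        return self.table.get((word.lower(), pos), [])

# e.g. model = DictSynonymModel({('quick', 'a'): ['fast', 'rapid']})
# model.predict('quick', pos='a') -> ['fast', 'rapid']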