Code Example #1
File: synonym.py Project: xiaosongyuan/nlpaug
    def skip_aug(self, token_idxes, tokens):
        results = []
        for token_idx in token_idxes:
            to_be_keep = True

            # Some words do not have a synonym/antonym; exclude them from the random draw.
            if tokens[token_idx][1] in ['DT']:
                continue

            # Some words do not exist for a specific POS in PPDB, so filter them out
            if self.aug_src == 'ppdb':
                word_poses = PartOfSpeech.constituent2pos(tokens[token_idx][1])
                if word_poses is None or len(word_poses) == 0:
                    continue

                have_candidate = False
                for word_pos in word_poses:
                    if len(
                            self.model.predict(tokens[token_idx][0],
                                               pos=word_pos)) > 0:
                        have_candidate = True
                        break

                if not have_candidate:
                    to_be_keep = False

            if to_be_keep:
                results.append(token_idx)

        return results
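The skip_aug() method above operates on (token, POS tag) tuples, where tokens[token_idx][1] is a Penn Treebank tag such as 'DT'. A minimal sketch of producing that input and applying the same determiner filter, assuming NLTK's tokenizer and tagger (the sentence is only illustrative):

    import nltk

    nltk.download('punkt', quiet=True)
    nltk.download('averaged_perceptron_tagger', quiet=True)

    tokens = nltk.word_tokenize("The quick brown fox jumps over the lazy dog")
    tagged = nltk.pos_tag(tokens)  # e.g. [('The', 'DT'), ('quick', 'JJ'), ...]

    # Mirror the 'DT' filter above: keep only indices of non-determiner tokens
    kept = [i for i, (_, tag) in enumerate(tagged) if tag not in ['DT']]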
Code Example #2
File: synonym.py Project: xiaojiu1414/nlpaug
    def substitute(self, data):
        results = []

        tokens = self.tokenizer(data)
        pos = self.model.pos_tag(tokens)

        aug_idxes = self._get_aug_idxes(pos)
        if aug_idxes is None:
            return data

        for i, token in enumerate(tokens):
            # Skip words that were not selected for augmentation
            if i not in aug_idxes:
                results.append(token)
                continue

            word_poses = PartOfSpeech.constituent2pos(pos[i][1])
            candidates = []
            if word_poses is None or len(word_poses) == 0:
                # Use every possible word, as the POS mapping is not defined correctly
                candidates.extend(self.model.predict(pos[i][0]))
            else:
                for word_pos in word_poses:
                    candidates.extend(self.model.predict(pos[i][0], pos=word_pos))

            candidates = [c for c in candidates if c.lower() != token.lower()]

            if len(candidates) == 0:
                results.append(token)
            else:
                candidate = self.sample(candidates, 1)[0]
                candidate = candidate.replace("_", " ").replace("-", " ").lower()
                results.append(self.align_capitalization(token, candidate))

        return self.reverse_tokenizer(results)
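For context, a minimal usage sketch of the public augmenter that wraps a substitute() flow like the one above, assuming a standard nlpaug installation with the required NLTK data (the sentence is illustrative):

    import nlpaug.augmenter.word as naw

    aug = naw.SynonymAug(aug_src='wordnet')
    # Newer nlpaug releases return a list of augmented texts; older ones return a string.
    augmented = aug.augment("The quick brown fox jumps over the lazy dog")
    print(augmented)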
Code Example #3
    def read(self, model_path):
        with open(model_path, 'rb') as f:
            for line in f:
                line = line.decode('utf-8')

                # Skip lines containing escaped/mis-encoded non-ASCII bytes
                if '\\x' in line or 'xc3' in line:
                    continue

                fields = line.split('|||')
                constituents = fields[0].strip()[1:-1].split('/')
                phrase = fields[1].strip()
                paraphrase = fields[2].strip()

                # Filter out pairs where phrase and paraphrase differ in word count
                if len(phrase.split()) != len(paraphrase.split()):
                    continue

                scores = []

                if len(fields) == 6:
                    # filter equivalence words (for PPDB v2.0 only)
                    # entailment = fields[5].strip()
                    # if entailment == 'Equivalence' and self.is_synonym:
                    #     continue

                    features = fields[3].strip().split()
                    features = [
                        feature for feature in features
                        for s in self.score_threshold if s in feature
                    ]  # filter by scheme

                    for feature in features:
                        scheme, score = feature.split('=')
                        if scheme in self.score_threshold and float(
                                score) > self.score_threshold[scheme]:
                            scores.append((scheme, score))

                    # # filter by feature/score
                    # if len(scores) == 0:
                    #     continue

                if phrase not in self.dict:
                    self.dict[phrase] = {}

                part_of_speeches = [
                    pos for con in constituents
                    for pos in PartOfSpeech.constituent2pos(con)
                ]

                for pos in part_of_speeches:
                    if pos not in self.dict[phrase]:
                        self.dict[phrase][pos] = []

                    self.dict[phrase][pos].append({
                        'phrase': phrase,
                        'part_of_speech': pos,
                        'synonym': paraphrase,
                        'scores': scores
                    })
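The read() method above parses PPDB's '|||'-delimited rows. A standalone sketch of the field layout it expects; the sample line is illustrative rather than copied from a real PPDB release:

    line = "[NN] ||| automobile ||| car ||| PPDB2.0Score=5.3 ||| 0-0 ||| Equivalence"

    fields = line.split('|||')
    constituents = fields[0].strip()[1:-1].split('/')  # ['NN']
    phrase = fields[1].strip()                         # 'automobile'
    paraphrase = fields[2].strip()                     # 'car'
    features = fields[3].strip().split()               # ['PPDB2.0Score=5.3']
    entailment = fields[5].strip()                     # 'Equivalence' (six-field rows only)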
Code Example #4
File: synonym.py Project: HSOFEUP/nlpaug
    def substitute(self, data):
        if not data or not data.strip():
            return data

        change_seq = 0
        doc = Doc(data, self.tokenizer(data))

        pos = self.model.pos_tag(doc.get_original_tokens())

        aug_idxes = self._get_aug_idxes(pos)
        if aug_idxes is None or len(aug_idxes) == 0:
            if self.include_detail:
                return data, []
            return data

        for aug_idx, original_token in enumerate(doc.get_original_tokens()):
            # Skip words that were not selected for augmentation
            if aug_idx not in aug_idxes:
                continue

            word_poses = PartOfSpeech.constituent2pos(pos[aug_idx][1])
            candidates = []
            if word_poses is None or len(word_poses) == 0:
                # Use every possible word, as the POS mapping is not defined correctly
                candidates.extend(self.model.predict(pos[aug_idx][0]))
            else:
                for word_pos in word_poses:
                    candidates.extend(
                        self.model.predict(pos[aug_idx][0], pos=word_pos))

            candidates = [
                c for c in candidates if c.lower() != original_token.lower()
            ]

            if len(candidates) > 0:
                candidate = self.sample(candidates, 1)[0]
                candidate = candidate.replace("_", " ").replace("-",
                                                                " ").lower()
                substitute_token = self.align_capitalization(
                    original_token, candidate)

                if aug_idx == 0:
                    substitute_token = self.align_capitalization(
                        original_token, substitute_token)

                change_seq += 1
                doc.add_change_log(aug_idx,
                                   new_token=substitute_token,
                                   action=Action.SUBSTITUTE,
                                   change_seq=self.parent_change_seq +
                                   change_seq)

        if self.include_detail:
            return self.reverse_tokenizer(
                doc.get_augmented_tokens()), doc.get_change_logs()
        else:
            return self.reverse_tokenizer(doc.get_augmented_tokens())
Code Example #5
File: wordnet.py Project: natuan/nlpaug
    def substitute(self, text):
        results = []

        tokens = self.tokenizer(text)

        pos = nltk.pos_tag(tokens)

        aug_idxes = self._get_aug_idxes(pos)
        if aug_idxes is None:
            return text

        for i, token in enumerate(tokens):
            # Skip words that were not selected for augmentation
            if i not in aug_idxes:
                results.append(token)
                continue

            word_poses = PartOfSpeech.pos2wn(pos[i][1])
            synets = []
            if word_poses is None or len(word_poses) == 0:
                # Use every possible word, as the POS mapping is not defined correctly
                synets.extend(self.model.synsets(pos[i][0], lang=self.lang))
            else:
                for word_pos in word_poses:
                    synets.extend(
                        self.model.synsets(pos[i][0],
                                           pos=word_pos,
                                           lang=self.lang))

            augmented_data = []
            for synet in synets:
                candidates = []
                for lema in synet.lemmas():
                    if self.synonyms:
                        candidates.append(lema.name())
                    else:
                        if lema.antonyms():
                            candidates.append(lema.antonyms()[0].name())

                for candidate in candidates:
                    if candidate.lower() != token.lower():
                        augmented_data.append(candidate)

            if len(augmented_data) == 0:
                results.append(token)
            else:
                candidate = self.sample(augmented_data, 1)[0]
                candidate = candidate.replace("_", " ").replace("-",
                                                                " ").lower()
                results.append(self.align_capitalization(token, candidate))

        return self.reverse_tokenizer(results)
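The synonym/antonym branch above relies on NLTK's WordNet interface. A minimal standalone sketch of the same lookups, assuming the wordnet corpus has been downloaded; 'good' is only an illustrative query:

    import nltk
    from nltk.corpus import wordnet as wn

    nltk.download('wordnet', quiet=True)

    synonyms, antonyms = [], []
    for synset in wn.synsets('good', pos=wn.ADJ):
        for lemma in synset.lemmas():
            synonyms.append(lemma.name())
            if lemma.antonyms():
                antonyms.append(lemma.antonyms()[0].name())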
Code Example #6
    def get_candidates(self, tokens, token_idx):
        original_token = tokens[token_idx][0]
        word_poses = PartOfSpeech.constituent2pos(tokens[token_idx][1])
        candidates = []
        if word_poses is None or len(word_poses) == 0:
            # Use every possible word, as the POS mapping is not defined correctly
            candidates.extend(self.model.predict(tokens[token_idx][0]))
        else:
            for word_pos in word_poses:
                candidates.extend(
                    self.model.predict(tokens[token_idx][0], pos=word_pos))

        candidates = [
            c for c in candidates if c.lower() != original_token.lower()
        ]
        return candidates
Code Example #7
    def substitute(self, text):
        results = []

        tokens = self.tokenizer(text)

        pos = nltk.pos_tag(tokens)

        aug_cnt = self.generate_aug_cnt(len(tokens))
        word_idxes = [i for i, t in enumerate(tokens)]
        word_idxes = self.skip_aug(word_idxes, pos)
        aug_idexes = self.sample(word_idxes, aug_cnt)

        for i, token in enumerate(tokens):
            # Skip words that were not selected for augmentation
            if i not in aug_idexes:
                results.append(token)
                continue

            word_poses = PartOfSpeech.pos2wn(pos[i][1])
            synets = []
            if word_poses is None or len(word_poses) == 0:
                # Use every possible word, as the POS mapping is not defined correctly
                synets.extend(self.model.synsets(pos[i][0]))
            else:
                for word_pos in word_poses:
                    synets.extend(self.model.synsets(pos[i][0], pos=word_pos))

            augmented_data = []
            for synet in synets:
                for candidate in synet.lemma_names():
                    if candidate.lower() != token.lower():
                        augmented_data.append(candidate)

            if len(augmented_data) == 0:
                results.append(token)
            else:
                candidate = self.sample(augmented_data, 1)[0]
                results.append(self.align_capitalization(token, candidate))

        return self.reverse_tokenizer(results)
Code Example #8
    def _get_aug_idxes(self, tokens):
        aug_cnt = self.generate_aug_cnt(len(tokens))
        word_idxes = self.pre_skip_aug(tokens, tuple_idx=0)
        word_idxes = self.skip_aug(word_idxes, tokens)
        if len(word_idxes) == 0:
            if self.verbose > 0:
                exception = WarningException(
                    name=WarningName.OUT_OF_VOCABULARY,
                    code=WarningCode.WARNING_CODE_002,
                    msg=WarningMessage.NO_WORD)
                exception.output()
            return None

        aug_idexes = []
        for aug_idx in word_idxes:
            word_poses = PartOfSpeech.constituent2pos(tokens[aug_idx][1])
            candidates = []
            if word_poses is None or len(word_poses) == 0:
                # Use every possible word, as the POS mapping is not defined correctly
                candidates.extend(self.model.predict(tokens[aug_idx][0]))
            else:
                for word_pos in word_poses:
                    candidates.extend(
                        self.model.predict(tokens[aug_idx][0], pos=word_pos))

            candidates = [
                c for c in candidates
                if c.lower() != tokens[aug_idx][0].lower()
            ]

            if len(candidates) > 0:
                candidate = self.sample(candidates, 1)[0]
                aug_idexes.append((aug_idx, candidate))

        if len(aug_idexes) < aug_cnt:
            aug_cnt = len(aug_idexes)

        aug_idexes = self.sample(aug_idexes, aug_cnt)
        return aug_idexes
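Unlike Code Example #2, this variant collects (index, candidate) pairs first and only then samples aug_cnt of them, so positions without any candidate never consume the augmentation budget. A minimal sketch of that selection step, assuming self.sample behaves like random.sample; the pairs are illustrative:

    import random

    candidate_pairs = [(2, 'fast'), (5, 'leaps'), (8, 'idle')]  # illustrative (index, synonym) pairs
    aug_cnt = min(2, len(candidate_pairs))
    picked = random.sample(candidate_pairs, aug_cnt)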