Code example #1
from nltk.corpus import stopwords
from nltk.stem import RSLPStemmer
from nltk.tokenize import word_tokenize


def _remove_stop_words(document):
    text_tokens = word_tokenize(document)

    # Build the stop-word set once instead of calling stopwords.words()
    # for every token; note that with no language argument NLTK returns
    # the stop words of every available language.
    stop_words = set(stopwords.words())
    tokens_without_sw = [word for word in text_tokens if word not in stop_words]

    # Reduce each remaining token to its RSLP (Portuguese) stem
    stemmer = RSLPStemmer()
    radicals = [stemmer.stem(token) for token in tokens_without_sw]

    return " ".join(radicals)
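
A minimal way to exercise this helper, assuming a fresh environment: the `nltk.download` calls fetch the tokenizer models, stop-word lists, and RSLP rules the function depends on, and the Portuguese sample sentence is made up for the example.

import nltk

nltk.download("punkt")      # tokenizer models for word_tokenize
nltk.download("stopwords")  # stop-word lists
nltk.download("rslp")       # RSLP stemmer rules

print(_remove_stop_words("Os meninos jogavam futebol no parque"))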
Code example #2
from nltk.stem import RSLPStemmer


def _create_frequency_table(text_string) -> dict:
    # _cts_tokenize is a project-specific tokenizer defined elsewhere
    # in the original codebase
    words = _cts_tokenize(text_string)
    stemmer = RSLPStemmer()

    # Count the occurrences of each stem
    freq_table = dict()
    for word in words:
        stem = stemmer.stem(word)
        freq_table[stem] = freq_table.get(stem, 0) + 1

    return freq_table
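
`_cts_tokenize` is not shown on this page, so a quick test has to stand in for it; the sketch below substitutes NLTK's `word_tokenize`, which is an assumption about what the project's tokenizer does.

from nltk.tokenize import word_tokenize

# Assumed stand-in for the project's missing _cts_tokenize helper
_cts_tokenize = word_tokenize

print(_create_frequency_table("gato gatos gata"))  # all three forms likely share the stem 'gat'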
Code example #3
from collections import Counter

from nltk.stem import RSLPStemmer


def stemmize_text(texts):
    """Stem each token in each tokenized text."""
    # Counts the most common surface form of each root (stem)
    root2frequent = {}
    stemmer = RSLPStemmer()
    texts_ = []
    for text in texts:
        text_ = []
        for w in text:
            stem = stemmer.stem(w)
            # Track how often each original word maps to this stem
            root2frequent.setdefault(stem, Counter()).update({w: 1})
            text_.append(stem)
        texts_.append(text_)
    return texts_, root2frequent
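
A small illustrative call, assuming the input is a list of already-tokenized texts (the tokens here are made up for the example):

texts = [["gato", "gatos"], ["gata"]]
stemmed, roots = stemmize_text(texts)
print(stemmed)  # stems per text, e.g. [['gat', 'gat'], ['gat']]
for root, forms in roots.items():
    print(root, forms.most_common(1))  # most frequent surface form per root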
Code example #4
        sentencas.append(wrds)
        saidas.append(tag)

# Filter out unwanted tokens (punctuation and Portuguese stop words)
stopwords = list(string.punctuation) + \
    nltk.corpus.stopwords.words('portuguese')
filteredWords = []

for palavra in palavras:
    if palavra not in stopwords:
        filteredWords.append(palavra)

# Stemming: build the sorted vocabulary of unique stems
# (from filteredWords, so the stop-word filtering above takes effect)
stemmer = RSLPStemmer()

stemmed_words = [stemmer.stem(w.lower()) for w in filteredWords]
stemmed_words = sorted(set(stemmed_words))

# Build the bag-of-words training data
training = []
output = []

outputEmpty = [0 for _ in range(len(intencoes))]

for x, frase in enumerate(sentencas):
    bag = []
    wds = [stemmer.stem(k.lower()) for k in frase]
    for w in stemmed_words:
        if w in wds:
            bag.append(1)
        else:
            bag.append(0)

    # One-hot output row marking this sentence's intent tag
    outputRow = outputEmpty[:]
    outputRow[intencoes.index(saidas[x])] = 1

    training.append(bag)
    output.append(outputRow)
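
To make the encoding concrete: each training row records, for every stem in the vocabulary, whether that stem occurs in the sentence. A tiny self-contained illustration with made-up stems:

vocabulary = ["bo", "dia", "tchau"]   # sorted unique stems
sentence_stems = ["bo", "dia"]        # stems of one training sentence

bag = [1 if stem in sentence_stems else 0 for stem in vocabulary]
print(bag)  # [1, 1, 0]: the sentence contains the first two vocabulary stems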
Code example #5
import pickle
import re
from string import punctuation

import nltk
import numpy as np
from nltk.stem import RSLPStemmer


class BagOfWordsCorpus:

    def __init__(self, save_path, commands, verbose, force_training=False):
        self.verbose = verbose
        self.save_path = save_path

        # English:
        # self.stemmer = LancasterStemmer()
        # Portuguese:
        self.stemmer = RSLPStemmer()

        self.stopwords = set(nltk.corpus.stopwords.words('portuguese') + list(punctuation))

        self.commands = commands

        if force_training:
            self.load_corpus()
        else:
            # Reuse a previously pickled corpus when possible; rebuild otherwise
            try:
                with open(save_path, "rb") as f:
                    self.words, self.labels, self.training, self.output = pickle.load(f)
            except (OSError, EOFError, pickle.UnpicklingError, ValueError):
                self.load_corpus()

    def load_corpus(self):

        words = []
        labels = []
        docs_x = []
        docs_y = []

        # Tokenize, filter, and stem every example pattern of every command
        for key, command in self.commands.items():
            for pattern in command.patterns:

                wrds = nltk.word_tokenize(pattern)
                wrds = [word for word in wrds if word not in self.stopwords]
                wrds = [self.stemmer.stem(w.lower()) for w in wrds]

                words.extend(wrds)
                docs_x.append(wrds)
                docs_y.append(command.tag)

            if command.tag not in labels:
                labels.append(command.tag)

        words = sorted(set(words))
        labels = sorted(labels)

        training = []
        output = []

        out_empty = [0 for _ in range(len(labels))]

        for x, wrds in enumerate(docs_x):
            bag = []

            for w in words:
                if w in wrds:
                    bag.append(1)
                else:
                    bag.append(0)

            output_row = out_empty[:]
            output_row[labels.index(docs_y[x])] = 1

            training.append(bag)
            output.append(output_row)

        training = np.array(training)
        output = np.array(output)

        self.words = words
        self.labels = labels
        self.training = training
        self.output = output

        with open("data/data.pickle", "wb") as f:
            pickle.dump((words, labels, training, output), f)

    def encode(self, sentence):
        bag = [0 for _ in range(len(self.words))]

        wrds = nltk.word_tokenize(sentence)
        wrds = [word for word in wrds if word not in self.stopwords]
        wrds = [self.stemmer.stem(w.lower()) for w in wrds]

        corrected_input = wrds

        # Correct the user's spelling when the input was typed
        # (disabled here):
        # corrected_input = []
        # for userinput_word in wrds:
        #     # userinput_word = reduce_lengthening(userinput_word)
        #     correct_word = spelling.correction(userinput_word)
        #     corrected_input.append(correct_word)

        if self.verbose:
            print("User message corrected to: {0}".format(corrected_input))

        # Mark the bag slot of every vocabulary word present in the input
        for se in wrds:
            for i, w in enumerate(self.words):
                if w == se:
                    bag[i] = 1

        return np.array(bag)

    def reduce_lengthening(self, word):
        # Collapse 3+ repeated characters to exactly two ("boooom" -> "boom")
        pattern = re.compile(r"(.)\1{2,}")
        return pattern.sub(r"\1\1", word)

    def add(self, sentence, tag):
        try:
            # Read the existing dataset
            with open(self.save_path, "rb") as f:
                self.words, self.labels, self.training, self.output = pickle.load(f)

            x = self.encode(sentence)

            # Skip the phrase if an identical encoding is already present
            if any((row == x).all() for row in self.training):
                return

            y = np.zeros(len(self.labels), dtype=int)
            y[self.labels.index(tag)] = 1

            self.training = np.vstack([self.training, x])
            self.output = np.vstack([self.output, y])

            # Write the extended dataset back
            with open(self.save_path, "wb") as f:
                pickle.dump((self.words, self.labels, self.training, self.output), f)
        except Exception as e:
            print(e)
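
The `commands` argument is only partly visible in this snippet: each value must expose a `patterns` list and a `tag` attribute. A minimal construction sketch under that assumption, with hypothetical commands:

from types import SimpleNamespace

# Hypothetical command objects; the project's real command class is not
# shown here, only the .patterns / .tag attributes the corpus reads
commands = {
    "greeting": SimpleNamespace(patterns=["bom dia", "olá"], tag="greeting"),
    "farewell": SimpleNamespace(patterns=["tchau", "até logo"], tag="farewell"),
}

# The pickle path is illustrative; its directory must already exist
corpus = BagOfWordsCorpus("data/data.pickle", commands, verbose=True)
print(corpus.encode("bom dia"))  # 0/1 vector over the learned stem vocabulary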
Code example #6
File: test_utils.py Project: jesobreira/casanova
def test_stemmer(self):
    from nltk.stem.rslp import RSLPStemmer
    # RSLPStemmer is a stemmer, not a lemmatizer
    stemmer = RSLPStemmer()
    text = 'policia'
    stem = stemmer.stem(text)
    assert stem == 'polic'
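
The test assumes the RSLP model data is already installed; without it, constructing `RSLPStemmer` raises a `LookupError`, which a one-time download resolves:

import nltk

nltk.download('rslp')  # fetch the RSLP stemmer rules used above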
Code example #7
File: utils.py Project: cyberelfo/casanova
from nltk.stem import RSLPStemmer

def lema(word):
    # Stems with RSLP; not true lemmatization, despite the name
    stemmer = RSLPStemmer()
    return "%s*" % stemmer.stem(word)