import nltk
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

def pre_processing(doc):
    datas = {}
    # Sastrawi stemmer
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    # load the stopword corpus once, from stopwords.txt
    with open('stopwords.txt', 'r') as f:
        stopwords = f.read().split()
    # stopword removal and tokenisation
    for index, kalimat in enumerate(doc):
        data = []
        # split the sentence into tokens using NLTK
        tokenisasi = nltk.word_tokenize(kalimat)
        # stopWords = nltk.corpus.stopwords.words('english') + ['yang','dengan']
        for word in tokenisasi:
            # keep and stem the word only if it is not in stopwords.txt;
            # stopwords are simply dropped
            if word not in stopwords:
                data.append(stemmer.stem(word))
        datas[index] = " ".join(data)
        # append the cleaned comment to komentar_bersih.txt
        with open("komentar_bersih.txt", "a") as file:
            file.write("%s\n" % datas[index])
    # file = open("komentar_bersih.json", "w")
    # file.write("%s\n" %datas)
    # file.close()
    return datas
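# A usage sketch for pre_processing (an assumption, not part of the original
# source): it presumes stopwords.txt exists in the working directory and that
# NLTK's 'punkt' tokenizer data has been downloaded.
if __name__ == "__main__":
    komentar = ["pelayanan aplikasinya sangat memuaskan",
                "pengirimannya lambat dan mengecewakan"]
    print(pre_processing(komentar))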
def test_fungsional(self):
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    sentence = 'malaikat-malaikat-Nya'
    expected = 'malaikat'
    output = stemmer.stem(sentence)
    if output != expected:
        raise AssertionError('output is {} instead of {}'.format(output, expected))
def post(self):
    data = json.loads(self.request.body)
    # Sastrawi's stem() expects a unicode string, so no utf-8 encode here
    text = data['text']
    # create stemmer
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    # stemming process
    output = stemmer.stem(text)
    self.response.out.write(json.dumps({'output': output}))
class Test_StemmerFactoryTest(unittest.TestCase):
    def setUp(self):
        self.factory = StemmerFactory()
        return super(Test_StemmerFactoryTest, self).setUp()

    def test_createStemmerReturnStemmer(self):
        stemmer = self.factory.create_stemmer()
        self.assertIsNotNone(stemmer)
        # self.assertIsInstance(stemmer, Stemmer)

    def test_fungsional(self):
        factory = StemmerFactory()
        stemmer = factory.create_stemmer()
        sentence = 'malaikat-malaikat-Nya'
        expected = 'malaikat'
        output = stemmer.stem(sentence)
        if output != expected:
            raise AssertionError('output is {} instead of {}'.format(output, expected))

    def test_getWordsFromFile(self):
        factory = StemmerFactory()
        factory.get_words_from_file()
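# Standard unittest entry point (an addition for completeness): running the
# module directly executes all three tests above.
if __name__ == '__main__':
    unittest.main()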
def load_stemmer():
    factory = StemmerFactory()
    return factory.create_stemmer()
import string

import nltk
import numpy as np
from nltk.tokenize import word_tokenize as token
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

rawdata = []
for j in range(0, 8):
    x = open(str(j + 1) + '.txt', 'r').read()
    rawdata.append(x.replace('\n', ' '))

ST = StemmerFactory()
stemmer = ST.create_stemmer()
SW = StopWordRemoverFactory()
stop_word = SW.get_stop_words()

print('rawdata')
print(rawdata)

doc = []
for i in rawdata:
    temp = []
    for j in token(i):
        word = stemmer.stem(str.lower(j))
        # if word not in stop_word and len(word) > 2 and not word.startswith(tuple(string.punctuation) + tuple([str(k) for k in range(10)]) + tuple('¿')):
        temp.append(word)
    doc.append(temp)

# the loop body below is truncated in the source; collecting the unique
# vocabulary is an assumption consistent with the variable name
dictionary = []
for i in doc:
    for word in i:
        if word not in dictionary:
            dictionary.append(word)
class SpellCorrector:
    NEWLINE = '\n'
    SKIP_FILES = {'cmds'}
    CORPUS_PATH = os.path.join(os.path.dirname(__file__), 'corpus/questions/')
    __control_dict = {}

    def __init__(self, train=False, save=False, corpus_path=CORPUS_PATH, threshold=2):
        self.slang_dict = pickle.load(
            open(os.path.join(os.path.dirname(__file__), "pickled/_slang_words.p"), "rb"))
        self.slang_dict['dr'] = 'dari'
        self.slang_dict['k'] = 'ke'
        self.slang_dict['sc'] = 'sesar'
        if train:
            create_dictionary.main()
            self.words = self.__words(corpus_path)
            self.counter = self.__counter(self.words)
            self.model = model.LanguageModel(corpus_path=corpus_path)
        else:
            self.words = pickle.load(
                open(os.path.join(os.path.dirname(__file__), "pickled/_spell_words.p"), "rb"))
            self.counter = pickle.load(
                open(os.path.join(os.path.dirname(__file__), "pickled/_spell_counter.p"), "rb"))
            self.model = model.LanguageModel(load=True)
        # drop rare words below the frequency threshold
        try:
            for key in self.counter:
                if self.counter[key] <= threshold:
                    self.words.remove(key)
        except:
            pass
        self.candidates_dict = {}
        # maximum edit distance per dictionary precalculation
        max_edit_distance_dictionary = 2
        prefix_length = 7
        # create object
        self.sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
        self.factory = StemmerFactory()
        self.stemmer = self.factory.create_stemmer()
        # load dictionary
        dictionary_path = os.path.join(os.path.dirname(__file__),
                                       "corpus/dictionary/dictionary.txt")
        # dictionary_path = os.path.join(os.path.dirname(__file__), "corpus/symspellpy/frequency_dictionary_en_82_765.txt")
        term_index = 0  # column of the term in the dictionary text file
        count_index = 1  # column of the term frequency in the dictionary text file
        if not self.sym_spell.load_dictionary(
                dictionary_path, term_index, count_index, encoding="utf-8"):
            print("Dictionary file not found")
            return
        if save:
            self.save()

    def __read_files(self, path):
        for root, dir_names, file_names in os.walk(path):
            for path in dir_names:
                self.__read_files(os.path.join(root, path))
            for file_name in file_names:
                if file_name not in SpellCorrector.SKIP_FILES:
                    file_path = os.path.join(root, file_name)
                    if os.path.isfile(file_path):
                        lines = []
                        f = open(file_path, encoding='latin-1')
                        for line in f:
                            lines.append(line)
                        f.close()
                        content = SpellCorrector.NEWLINE.join(lines)
                        yield file_path, content

    def __words(self, corpus_path):
        words = []
        for file_name, text in self.__read_files(corpus_path):
            print("process data => " + file_name)
            words += re.findall(r'\w+', text.lower())
        return words

    def __counter(self, words):
        return Counter(words)

    def __wordProb(self, word):
        "Probability of `word`."
        return self.counter[word] / sum(self.counter.values())

    def correction(self, word):
        "Most probable spelling correction for word."
        return max(self.candidates(word), key=self.__wordProb)

    def candidates(self, word, debug=False):
        "Generate possible spelling corrections for word."
        if self.candidates_dict.get(word):
            return self.candidates_dict[word]
        else:
            # max edit distance per lookup
            # (max_edit_distance_lookup <= max_edit_distance_dictionary)
            max_edit_distance_lookup = 2
            suggestion_verbosity = Verbosity.CLOSEST  # TOP, CLOSEST, ALL
            suggestions = self.sym_spell.lookup(word, suggestion_verbosity,
                                                max_edit_distance_lookup)
            # cache it
            if SpellCorrector.__control_dict.get(word) is not None:
                candidates_0 = (self.__known([word])
                                | self.__known(self.__edits1(word))
                                | self.__known(self.__edits2(word))
                                | self.__known(self.__edits3(word))
                                | {SpellCorrector.__control_dict.get(word)}
                                | {word})
            else:
                candidates_0 = (self.__known([word])
                                | self.__known(self.__edits1(word))
                                | self.__known(self.__edits2(word))
                                | self.__known(self.__edits3(word))
                                | {word})
            candidates_1 = set(suggestion.term for suggestion in suggestions)
            candidates = candidates_0.union(candidates_1)
            # print(candidates)
            self.candidates_dict[word] = candidates
            return candidates

    def __known(self, words):
        "The subset of `words` that appear in the dictionary of WORDS."
        return set(w for w in words if w in self.counter)

    def __edits1(self, word):
        "All edits that are one edit away from `word`."
        letters = 'aiueon'
        splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
        inserts = [L + c + R for L, R in splits for c in letters]
        return set(inserts)

    def __edits2(self, word):
        "All edits that are two edits away from `word`."
        return (e2 for e1 in self.__edits1(word) for e2 in self.__edits1(e1))

    def __edits3(self, word):
        return (e3 for e1 in self.__edits1(word)
                for e2 in self.__edits1(e1)
                for e3 in self.__edits1(e2))

    def save(self, python2=False):
        if python2 is False:
            pickle.dump(self.words,
                        open(os.path.join(os.path.dirname(__file__),
                                          "pickled/_spell_words.p"), "wb"))
            pickle.dump(self.counter,
                        open(os.path.join(os.path.dirname(__file__),
                                          "pickled/_spell_counter.p"), "wb"))
            self.model.save()
        else:
            pickle.dump(self.words,
                        open(os.path.join(os.path.dirname(__file__),
                                          "pickled/_spell_words.p"), "wb"),
                        protocol=2)
            pickle.dump(self.counter,
                        open(os.path.join(os.path.dirname(__file__),
                                          "pickled/_spell_counter.p"), "wb"),
                        protocol=2)
            self.model.save()

    # TODO: implement mechanism to calculate lambda for interpolation
    def __trigram_interpolation(self, w1, w2, w3):
        lambda1 = 0.75
        lambda2 = 0.20
        lambda3 = 0.05
        return (lambda1 * self.model.sentence_prob('{} {} {}'.format(w1, w2, w3))) + \
               (lambda2 * self.model.sentence_prob('{} {}'.format(w2, w3))) + \
               (lambda3 * self.model.unigram_prob(w3))

    # TODO: implement mechanism to calculate lambda for interpolation
    def __bigram_interpolation(self, w1, w2):
        lambda1 = 0.80
        lambda2 = 0.20
        return (lambda1 * self.model.sentence_prob('{} {}'.format(w1, w2))) + \
               (lambda2 * self.model.unigram_prob(w2))

    def __clean_text(self, words):
        cleaned_words = []
        for word in words:
            if word in self.slang_dict:
                word = self.clean_punc(self.slang_dict[word])
            word = re.sub('^days$', 'hari', word)
            word = re.sub('^day$', 'hari', word)
            word = re.sub('^weeks$', 'minggu', word)
            word = re.sub('^week$', 'minggu', word)
            word = re.sub('^months$', 'bulan', word)
            word = re.sub('^month$', 'bulan', word)
            word = re.sub('^years$', 'tahun', word)
            word = re.sub('^year$', 'tahun', word)
            word = re.sub(r'(?<=\d)tahun', ' tahun ', word).strip()
            word = re.sub(r'(?<=\d)bulan', ' bulan ', word).strip()
            word = re.sub(r'(?<=\d)minggu', ' minggu ', word).strip()
            word = re.sub(r'(?<=\d)hari', ' hari ', word).strip()
            word = re.sub(r'(?<=\d)jam', ' jam ', word).strip()
            word = re.sub(r'(?<=\d)detik', ' detik ', word).strip()
            word = re.sub(r'(?<=\d)th(?=($|\d+))', ' tahun ', word).strip()
            word = re.sub(r'(?<=\d)thn(?=($|\d+))', ' tahun ', word).strip()
            word = re.sub(r'(?<=\d)yrs(?=($|\d+))', ' tahun ', word).strip()
            word = re.sub(r'(?<=\d)bln(?=($|\d+))', ' bulan ', word).strip()
            word = re.sub(r'(?<=\d)mggu(?=($|\d+))', ' minggu ', word).strip()
            word = re.sub(r'(?<=\d)mg(?=($|\d+))', ' minggu ', word).strip()
            word = re.sub(r'(?<=\d)d(?=($|\d+))', ' hari ', word).strip()
            word = re.sub(r'(?<=\d)w(?=($|\d+))', ' minggu ', word).strip()
            word = re.sub(r'(?<=\d)wk(?=($|\d+))', ' minggu ', word).strip()
            word = re.sub(r'(?<=\d)m(?=($|\d+))', ' bulan ', word).strip()
            word = re.sub(r'(?<=\d)jm(?=($|\d+))', ' jam ', word).strip()
            word = re.sub(r'(?<=\d)h(?=($|\d+))', ' hari ', word).strip()
            # split a time expression glued to the next word, e.g. hariini -> hari ini
            if re.match(r"(tahun|bulan|minggu|hari|menit|detik)\w+", word) is not None:
                word = re.search(r"(tahun|bulan|minggu|hari|menit|detik)(?=\w+)", word).group(0) \
                    + ' ' \
                    + re.search(r"(?:(?<=tahun)|(?<=bulan)|(?<=minggu)|(?<=hari)|(?<=menit)|(?<=detik))\w+", word).group(0)
            # split ordinal prefixes: ke2 or ke(number) -> ke 2, k2 -> k 2
            if re.match(r"(ke)\d", word) is not None:
                word = re.search(r"(ke)(?=\d)", word).group(0) + ' ' + re.search(r"(?<=ke)\d", word).group(0)
            if re.match(r"(k)\d", word) is not None:
                word = re.search(r"(k)(?=\d)", word).group(0) + ' ' + re.search(r"(?<=k)\d", word).group(0)
            # expand reduplication, e.g. kata2 -> kata kata
            if re.match(r"[a-z]+2$", word) is not None:
                word = word[:-1] + ' ' + word[:-1]
            # split a leading number, e.g. 2kata -> 2 kata
            if re.match(r"^\d+[a-z]+$", word) is not None:
                word = re.search(r"\d+(?=\w+)", word).group(0) + ' ' + re.search(r"(?<=\d)\w+", word).group(0)
            # expand reduplication with a suffix, e.g. kata2nya -> kata katanya
            if re.match(r"^\w+2\w+$", word) is not None:
                word = re.search(r"^\w+(?=2)", word).group(0) + ' ' \
                    + re.search(r"^\w+(?=2)", word).group(0) \
                    + re.search(r"(?<=2)\w+", word).group(0)
            # split a trailing 'dok' except in halodok, alodok, sendok, gondok,
            # e.g. sayadok -> saya dok (re.search, since re.match only anchors at the start)
            if re.search(r"(?<!halo)(?<!alo)(?<!sen)(?<!gon)dok$", word) is not None:
                word = word[:-3] + ' ' + word[-3:]
            # split a trailing 'dokter' except in halodokter, alodokter, e.g. sayadokter -> saya dokter
            if re.search(r"(?<!halo)(?<!alo)dokter$", word) is not None:
                word = word[:-6] + ' ' + word[-6:]
            # split a leading 'dok', e.g. doksaya -> dok saya
            if re.match(r"^dok(?!ter)\w+", word) is not None:
                word = word[:3] + ' ' + word[3:]
            # split a leading 'dokter', e.g. doktersaya -> dokter saya
            if re.match(r"^dokter\w+", word) is not None:
                word = word[:6] + ' ' + word[6:]
            # expand 20x or (number)x -> 20 kali / (number) kali
            if re.match(r"\d+x$", word) is not None:
                word = word[:-1] + ' kali'
            if re.match(r"\w+x$", word) is not None:
                word = word[:-1] + 'nya'
            cleaned_words.append(word)
        cleaned = ' '.join(cleaned_words).split()
        return cleaned

    def normalize(self, sentence):
        cleaned = sentence
        if re.match(r"[a-zA-Z0-9 ]+\d \d bulan [a-zA-Z0-9 ]+", cleaned) is not None:
            cleaned = re.search(r"[a-zA-Z0-9 ]+\d (?=\d bulan [a-zA-Z0-9 ]+)", cleaned).group(0) + \
                re.search(r"(?<=[a-zA-Z0-9 ]\d \d )bulan [a-zA-Z0-9 ]+", cleaned).group(0)
        # re.search here, since a lookbehind can never succeed at position 0
        if re.search(r"(?<=\w\s)x(?=\s)", cleaned) is not None:
            cleaned = re.search(r"[a-zA-Z ]+(?=\sx\s)", cleaned).group(0) + 'nya ' + \
                re.search(r"(?<=\sx\s)[a-zA-Z ]+", cleaned).group(0)
        cleaned = self.stemmer.stem(cleaned)
        return cleaned

    def clean_punc(self, sentence):
        translator = str.maketrans({key: ' ' for key in string.punctuation})
        words = [
            token.translate(translator).strip()
            for token in sentence.lower().split()
        ]
        words = ' '.join(words)
        words = [x.strip().lower() for x in words.split() if x.strip()]
        return ' '.join(words)

    def generate_candidates(self, sentence):
        # translate() returns a copy of the string in which every character is
        # mapped through the table built with str.maketrans(); here every
        # punctuation character becomes a space
        translator = str.maketrans({key: ' ' for key in string.punctuation})
        words = [
            token.translate(translator).strip()
            for token in sentence.lower().split()
        ]
        words = ' '.join(words)
        words = [x.strip().lower() for x in words.split() if x.strip()]  # drop all empty strings from the list
        valid = {}
        for idx, word in enumerate(words):
            if word not in self.words:
                valid[word.lower()] = 'correction_here'
        return valid

    def validate(self, sentence, debug=False, return_candidates=False, return_full_words=False):
        # same punctuation-to-space translation as generate_candidates
        translator = str.maketrans({key: ' ' for key in string.punctuation})
        words = [
            token.translate(translator).strip()
            for token in sentence.lower().split()
        ]
        words = ' '.join(words)
        words = [x.strip().lower() for x in words.split() if x.strip()]  # drop all empty strings from the list
        full_words = {}
        prediction_candidates = {}
        valid = []
        for word in words:
            if word in self.words:
                valid.append(word.lower())
                full_words[word] = word
            else:
                list_words = self.__clean_text([word])
                valid_ = []
                for idx, word_ in enumerate(list_words):
                    candidates = self.candidates(word_.lower())
                    if idx == 0:
                        # first token: rank candidates by unigram probability
                        max_word = max([w for w in candidates],
                                       key=lambda word_: self.model.unigram_prob(word_))
                    elif idx == 1:
                        # second token: rank by interpolated bigram probability
                        max_word = max([w for w in candidates],
                                       key=lambda word_: self.__bigram_interpolation(valid_[0], word_))
                    else:
                        # later tokens: rank by interpolated trigram probability
                        max_word = max([w for w in candidates],
                                       key=lambda word_: self.__trigram_interpolation(
                                           valid_[idx - 2], valid_[idx - 1], word_))
                    valid_.append(max_word)
                    if debug:
                        print('candidates for ' + word_ + ': ' + str(candidates)
                              + ', max prob word is ' + max_word.lower())
                if ' '.join(valid_) == 'terimakasih':
                    valid.append('terima kasih')
                    prediction_candidates[word] = 'terima kasih'
                else:
                    valid.append(' '.join(valid_))
                    prediction_candidates[word] = ' '.join(valid_)
                    full_words[word] = ' '.join(valid_)
        if return_candidates:
            return prediction_candidates
        if return_full_words:
            return full_words
        else:
            return ' '.join(valid)
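# A hypothetical usage sketch, not part of the original source: it assumes the
# pickled word lists, the symspellpy dictionary file, and the saved language
# model referenced in __init__ are all present on disk.
corrector = SpellCorrector()
print(corrector.validate('sy skit kepala', debug=True))  # illustrative input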
def stem(self, string):
    # create stemmer
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    output = stemmer.stem(string)
    return output
import re

import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

indo = stopwords.words('indonesian')

# preprocessing
dataset = pd.read_csv('training_gojek_yy.csv')
corpus = []
# build the stemmer once instead of re-creating it on every row
psi = StemmerFactory()
ps = psi.create_stemmer()
for i in range(0, len(dataset)):
    # re.sub: keep letters only, everything else becomes a space
    review = re.sub('[^a-zA-Z]', ' ', dataset['komentar'][i])
    review = review.lower()
    review = review.split()
    # drop Indonesian stopwords, then stem the remaining words
    review = [ps.stem(word) for word in review if not word in indo]
    print(i)
    review = ' '.join(review)
    corpus.append(review)

class Analis:
    def __init__(self, training):
        self.training = training
        # tf-idf
        articles = np.array(corpus)
        labels = np.array(dataset['sentimen'])
        self.tf_vectorizer = TfidfVectorizer(min_df=4,
def setUp(self):
    stemmerFactory = StemmerFactory()
    self.stemmer = stemmerFactory.create_stemmer()
    return super(Test_StemmerTest, self).setUp()
def stemming(document):
    # stemming step
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    return stemmer.stem(document)
def stemming(text):
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    return [stemmer.stem(x) for x in text]
def stemming(tweet):
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    tweetClean = stemmer.stem(tweet)
    return tweetClean
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import string

factory_stopwords = StopWordRemoverFactory()
stopwords = factory_stopwords.get_stop_words()
factory_stemmer = StemmerFactory()
stemmer = factory_stemmer.create_stemmer()

def clean_text(text):
    # removing punctuation
    for c in string.punctuation:
        text = text.replace(c, "")
    # removing excessive whitespace
    text = " ".join(text.split())
    # text to array of words
    words = text.split()
    # removing stopwords
    words = [word for word in words if word not in stopwords]
    # stemming each word in the query
    words = [stemmer.stem(word) for word in words]
    return words
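# Example call for clean_text (illustrative, not from the original source):
# Sastrawi's default stopword list drops function words such as 'yang', and
# the stemmer reduces inflected forms to their roots.
print(clean_text("Perekonomian yang bertumbuh!"))  # e.g. ['ekonomi', 'tumbuh']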
def stem(data):
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    return data.apply(lambda x: [stemmer.stem(item) for item in x])
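# Illustrative only: stem() above expects a pandas Series whose rows are token
# lists. Note that the factory is rebuilt on every call; for large frames it
# is cheaper to hoist the stemmer out of the function.
import pandas as pd
tokens = pd.Series([["pelayanan", "memuaskan"], ["pengiriman", "terlambat"]])
print(stem(tokens))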
def chatbot():
    # create stemmer
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()

    model = load_model('chatbot_model.h5')
    intents = json.loads(open('intents.json').read())
    words = pickle.load(open('words.pkl', 'rb'))
    classes = pickle.load(open('classes.pkl', 'rb'))

    def clean_up_sentence(sentence):
        # tokenize the pattern - split words into array
        sentence_words = nltk.word_tokenize(sentence)
        # stem each word - create short form for word
        sentence_words = [stemmer.stem(word.lower()) for word in sentence_words]
        return sentence_words

    # return bag of words array: 0 or 1 for each word in the bag that exists in the sentence
    def bow(sentence, words, show_details=True):
        # tokenize the pattern
        sentence_words = clean_up_sentence(sentence)
        # bag of words - matrix of N words, vocabulary matrix
        bag = [0] * len(words)
        for s in sentence_words:
            for i, w in enumerate(words):
                if w == s:
                    # assign 1 if current word is in the vocabulary position
                    bag[i] = 1
                    if show_details:
                        print("found in bag: %s" % w)
        return np.array(bag)

    def predict_class(sentence, model):
        # filter out predictions below a threshold
        p = bow(sentence, words, show_details=False)
        res = model.predict(np.array([p]))[0]
        ERROR_THRESHOLD = 0.25
        results = [[i, r] for i, r in enumerate(res) if r > ERROR_THRESHOLD]
        # sort by strength of probability
        results.sort(key=lambda x: x[1], reverse=True)
        return_list = []
        for r in results:
            return_list.append({"intent": classes[r[0]], "probability": str(r[1])})
        return return_list

    def getResponse(ints, intents_json):
        tag = ints[0]['intent']
        list_of_intents = intents_json['intents']
        for i in list_of_intents:
            if i['tag'] == tag:
                result = random.choice(i['responses'])
                break
        return result

    def chatbot_response(msg):
        ints = predict_class(msg, model)
        res = getResponse(ints, intents)
        return res

    return chatbot_response(request.json['message'])
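# Hypothetical Flask wiring for the chatbot view above; the original snippet
# only shows the handler body, so the route and response shape are assumptions.
from flask import Flask, request, jsonify

app = Flask(__name__)

@app.route('/chat', methods=['POST'])
def chat():
    # expects a JSON body such as {"message": "halo"}
    return jsonify({'response': chatbot()})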
# get the Indonesian stopword list
list_stopwords = set(stopwords.words('indonesian'))

# remove stopwords from each token list
print("Mulai remove stopwords")
def stopword(text):
    tokens_without_stopword = [word for word in text if not word in list_stopwords]
    return tokens_without_stopword

DATA['Normal'] = DATA['Normal'].apply(stopword)

# create stemmer
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# stem every token
print("Mulai stem")
def stem(text):
    output = [stemmer.stem(token) for token in text]
    return output

ready = pd.DataFrame()
DATA['Normal'] = DATA['Normal'].apply(stem)
print("Selesai Stem")
ready['Normal'] = DATA['Normal']
ready['Status'] = DATA['Status']
ready.to_csv('readytfidf.csv', index=False)
def stemmer_fac(string):
    fac = StemmerFactory()
    stem_cr = fac.create_stemmer()
    return stem_cr.stem(string)
import pandas as pd
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

factoryStem = StemmerFactory()
stemmer = factoryStem.create_stemmer()
factoryStop = StopWordRemoverFactory()
stopper = factoryStop.create_stop_word_remover()

xData = []
yData = []
rawDatasets = pd.read_csv('dataset/dataset_pool.csv', delimiter=',')

count = 0
for k in rawDatasets['Kalimat']:
    if count % 100 == 0:
        print(count)
    sentStemmed = stemmer.stem(k)
    sentStopped = sentStemmed
    '''
    temp = stopper.remove(k)
    while temp != sentStopped:
        sentStopped = temp
        temp = stopper.remove(sentStopped)
    '''
def __init__(self):
    from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
    factory = StemmerFactory()
    self.sastrawi_stemmer = factory.create_stemmer()
def stemmer(text):
    # input: a text string
    factory = StemmerFactory()          # instantiate the Sastrawi factory
    stemmer = factory.create_stemmer()  # create stemmer
    text = text.lower()                 # lowercase the whole text
    stem_text = stemmer.stem(text)      # stemming
    return stem_text                    # output: each word reduced to its root form
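# Example based on Sastrawi's usual demo sentence; the expected output shows
# each word reduced to its root form.
print(stemmer("Perekonomian Indonesia sedang dalam pertumbuhan yang membanggakan"))
# -> 'ekonomi indonesia sedang dalam tumbuh yang bangga'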
import pandas as pd
import re
import numpy as np
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

stem_factory = StemmerFactory()
stemmer = stem_factory.create_stemmer()
stop_factory = StopWordRemoverFactory()
stopword = stop_factory.create_stop_word_remover()

data = pd.read_csv('hasil11.csv')
data = data[['Label', 'Isi']]

def convert(polarity):
    # map the sentiment label to an integer class
    if polarity == 'Positif':
        return 1
    elif polarity == 'Netral':
        return 0
    else:
        return -1
from pattern.it import lemma as lemma_it
from nltk.stem.isri import ISRIStemmer
from nltk.stem import RSLPStemmer
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk import word_tokenize
from nltk.stem.snowball import SnowballStemmer
import tinysegmenter
from analyzer.kg_export.language.kazlemmatizer import kazakh_lemma_tokenizer

use_compound_split_german = False
if use_compound_split_german:
    import LanguageDetection

stem_ar = ISRIStemmer()  # Arabic stemmer
factory = StemmerFactory()
sastrawi_stemmer = factory.create_stemmer()  # Indonesian stemmer
stem_pt = RSLPStemmer()  # Portuguese (Brazilian) stemmer
stem_ja = tinysegmenter.TinySegmenter()
stem_nl = SnowballStemmer('dutch')
stem_ru = SnowballStemmer('russian')
stem_sv = SnowballStemmer('swedish')
stem_fr = SnowballStemmer('french')
stem_de = SnowballStemmer('german')

def read_file(filename):
    try:
        with open(filename, "r") as file_dp:
            data = json.load(file_dp)
        return data
    except Exception:
        # body truncated in the source; returning None on failure is an assumption
        return None
def setup_library(self):
    stemmerFactory = StemmerFactory()
    self.stemmer = stemmerFactory.create_stemmer()
def stemmerFactory(text):
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    text = stemmer.stem(text)
    return text
class Sistem:
    factory = None
    stemmer = None

    def __init__(self):
        # initialise the Sastrawi stemmer
        self.factory = StemmerFactory()
        self.stemmer = self.factory.create_stemmer()

    # preprocessing step
    def clean_text(self, text):
        # tokenize
        words = word_tokenize(text.lower())
        temp = str(words)
        # remove numbers
        stripped = re.sub(r'\d+', '', temp)
        # remove '.com', 'www' and escaped newlines
        stripped = re.sub(r'\.com', '', stripped)
        stripped = re.sub(r'www', '', stripped)
        stripped = re.sub(r'\\n', '', stripped)
        # remove tags
        stripped = re.sub("</?.*?>", " <> ", stripped)
        # stemming
        temp = [self.stemmer.stem(stripped)]
        # stopword removal
        stop_words = set(stopwords.words('indonesian'))
        temp = [j for i in temp for j in i.split() if j not in stop_words]
        temp = ' '.join(temp)
        return temp

    # process the title
    def proses_judul(self, judul):
        judul = self.clean_text(judul)
        hasil = str(judul)
        remove = hasil.replace(':', '')
        kalimat = remove.replace('dtype', '')
        kalimat = kalimat.replace('Name', '')
        kalimat = kalimat.replace('Judul', '')
        kalimat = kalimat.replace('object', '')
        remove = kalimat.replace(',', '')
        remove = re.sub(r'\d+', '', remove)
        words = word_tokenize(remove)
        stop_words = set(stopwords.words('indonesian'))
        tampung_judul = []
        for x in words:
            if x not in stop_words:
                tampung_judul.append(x)
        return tampung_judul

    # process the body
    def proses_isi(self, isi):
        isi = self.clean_text(isi)
        hasil = str(isi)
        remove = hasil.replace(':', '')
        remove = re.sub(r'\d+', '', remove)
        words = word_tokenize(remove)
        stop_words = set(stopwords.words('indonesian'))
        tampung_isi = []
        for y in words:
            if y not in stop_words:
                tampung_isi.append(y)
        return tampung_isi

    # word-sense lookup: replace body words with the title word when they are synonyms
    def mencari_makna(self, judul, isi):
        judul = self.proses_judul(judul)
        isi_berita = self.proses_isi(isi)
        synonyms = []
        result = []
        list_sinonim = []
        for i in range(0, len(judul)):
            kata = judul[i]
            for syn in wn.synsets(kata, lang="ind"):
                for l in syn.lemmas(lang="ind"):
                    hasil1 = str(l.name())
                    stem = [self.stemmer.stem(hasil1)]
                    stop_words = set(stopwords.words('indonesian'))
                    # split each entry on spaces and keep only non-stopwords
                    temp = [j for i in stem for j in i.split() if j not in stop_words]
                    temp = ' '.join(temp)
                    pisah_kata = word_tokenize(temp)
                    for z in range(len(pisah_kata)):
                        synonyms.append(pisah_kata[z])
            for word in synonyms:
                if word not in result:
                    result.append(word)
            list_sinonim.append([])
            list_sinonim[i].append(judul[i])
            for j in range(len(result)):
                list_sinonim[i].append(result[j])
            synonyms = []
            result = []
        for a in range(len(isi_berita)):
            for b in range(len(list_sinonim)):
                for j in range(len(list_sinonim[b])):
                    if list_sinonim[b][j] == isi_berita[a]:
                        isi_berita[a] = list_sinonim[b][0]
        isi_bersih = ''
        for i in range(len(isi_berita)):
            if i == 0:
                isi_bersih = isi_bersih + str(isi_berita[i])
            else:
                isi_bersih = isi_bersih + ' ' + str(isi_berita[i])
        return isi_bersih

    # cosine similarity step
    def cosine_sim(self, text1, text2):
        vectorizer = TfidfVectorizer(analyzer='word')
        train_vectors = vectorizer.fit_transform([text1, text2])
        test_vectors = vectorizer.transform([text1, text2])
        return ((train_vectors * train_vectors.T).A)[0, 1]

    def checkup_single(self, params):
        judul = self.clean_text(params.get('judul'))
        # mencari_makna takes both the title and the body
        isi = self.mencari_makna(params.get('judul'), params.get('isi'))
        # format response
        fmt_response = {}
        # empty output
        hasil = []
        # hasil_cosine = self.cosine_sim(params.get('judul'), params.get('isi'))
        hasil_cosine = self.cosine_sim(judul, isi)
        hasil.append(hasil_cosine)
        for data in hasil:
            if data > 0.4:
                fmt_response['status'] = 'Non-clickbait'
            else:
                fmt_response['status'] = 'Clickbait'
            fmt_response['percentage'] = math.trunc(data * 100)
        return fmt_response
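# Hypothetical call (not in the source): checkup_single expects a dict with
# 'judul' (title) and 'isi' (body) keys and returns the clickbait verdict.
sistem = Sistem()
print(sistem.checkup_single({'judul': 'judul berita', 'isi': 'isi berita'}))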
def stem_words(words):
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    all_words = [stemmer.stem(word) for word in words]
    return all_words
def cleanText(T, fix={}, lemma=None, stops=set(), symbols_remove=False,
              min_charLen=2, fixTag=True, user_remove=True):
    # lang & stops have only 2 options: 'en' or 'id'
    # symbols: ASCII or alnum
    penerjemah = Translator()
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    pattern = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    t = re.sub(pattern, ' ', str(T))  # remove urls if any
    t = unescape(t)  # html entities fix
    if fixTag:
        t = fixTags(t)  # fix abcDef
    t = t.lower().strip()  # lowercase
    t = unidecode(t)
    # t = re.sub(r'[m]*m', 'm', t)
    # t = re.sub(r'[a]*a', 'a', t)
    '''
    t=re.sub(r'([a-z])\1+',r'\1',t)
    t=re.sub(r'gogle','google',t)
    t=re.sub(r'[weak]*wk[weak]*','',t)
    t=re.sub(r'(he){2,}','',t)
    #t=re.sub(r'[bw]*aha','ha',t)
    t=re.sub(r'(ha){2,}','',t)
    '''
    t = ''.join(''.join(s)[:2] for _, s in itertools.groupby(t))  # remove repetition
    t = t.replace('\n', ' ').replace('\r', ' ')
    t = sent_tokenize(t)  # sentence segmentation, string to list
    for i, K in enumerate(t):
        K = K.lower()
        # K = re.sub("^\d+\s|\s\d+\s|\s\d+$", " ", K)  # delete numbers
        K = re.sub(r"[0-9]*", " ", K)  # delete numbers
        if user_remove:
            K = re.sub(r'@[^\s]+', '', K)  # remove user mentions
            # K = re.sub('@[^\s]+','AT_USER',K)
        if symbols_remove:
            # K = re.sub(r'#[a-zA-Z0-9]*','',K)
            K = re.sub(r'[^\w]', ' ', K)
        try:
            listKata, cleanList = lemma(K), []
        except:
            listKata, cleanList = K.split(), []
        if len(listKata) != 0:
            if not isinstance(listKata[0], str):
                for token in listKata:
                    if token.text in list(fix.keys()):
                        token = fix[token.text]
                    if isinstance(token, str):
                        token = lemma(token)
                    try:
                        token = penerjemah.translate(token.text, dest='id').text
                    except:
                        pass
                    if not isinstance(token, str):
                        token = token.text
                    # try: token = stemmer.stem(token.text)
                    # except: token = stemmer.stem(token)
                    if not lemma:
                        try:
                            token = token.lemma_
                        except:
                            if len(token) != 0:
                                token = lemma(token)[0].lemma_
                    if stops:
                        if len(token) >= min_charLen and token not in stops:
                            if token.lower() != "pron":
                                cleanList.append(token)
                    else:
                        if len(token) >= min_charLen:
                            cleanList.append(token)
        t[i] = ' '.join(cleanList)
    return ' '.join(t)  # return a single string again
def _load_sastrawi():
    global factory, sastrawi_stemmer
    factory = StemmerFactory()
    sastrawi_stemmer = factory.create_stemmer()
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
import re
import string
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory, ArrayDictionary, StopWordRemover
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk.tokenize import word_tokenize, sent_tokenize
import heapq
from tqdm import tqdm
# import PyPDF2
# pip install pdfplumber
import pdfplumber

# stemming (reduce words to their root form)
stemmerFactory = StemmerFactory()
stemmer = stemmerFactory.create_stemmer()

def read_pdf(PATH, semua_halaman=True, halaman=0):
    """
    Read a PDF file page by page.
    input:
        PATH    : location of the PDF file
        halaman : the page to open when not reading all pages
    """
    if semua_halaman:
        content = ''
        with pdfplumber.open(PATH) as pdf:
            for pdf_page in pdf.pages:
                single_page_text = pdf_page.extract_text()
                # the source is truncated here; appending each page's text is
                # an assumption consistent with the accumulator above
                if single_page_text:
                    content += single_page_text + '\n'
def index(hashs, lists):
    for i in lists:
        if i in hashs:
            hashs[i] += 1
        else:
            hashs[i] = 1

# get the Indonesian stopword remover
get_stopword = StopWordRemoverFactory()
stopwords = get_stopword.create_stop_word_remover()
# get the Indonesian stemmer
get_stemmer = StemmerFactory()
stemmer = get_stemmer.create_stemmer()

# hashes for df, tf, idf, document bodies and titles
df, tf, idf, mains, titles = dict(), dict(), dict(), dict(), dict()
if os.path.exists('data/clean'):
    print('Directory : data/clean')
    for f in tqdm(Path('data/clean').glob("*.txt")):
        name = str(f).split('/')
        df[name[2]], mains[name[2]], titles[name[2]] = dict(), dict(), dict()
        File = open(f, 'r').read()
        File = stopwords.remove(File)
        sentence = File.split('\n')
        title = stemmer.stem(sentence[0].lower()).split()
def stemming(text):
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    return stemmer.stem(text)
def Preprocessing(teks):
    print("Preprocessing Mulai")
    df = pd.read_csv("../Dataframe Siap/Dataframe2.csv")
    dt = [{"Pesan": teks, "Status": "Belum"}]
    smt = pd.DataFrame(dt)
    DATA = pd.concat([smt, df], ignore_index=True)
    DATA.head()

    # case folding
    def case_folding(text):
        text = text.lower()
        return text

    DATA['Pesan'] = DATA['Pesan'].apply(case_folding)

    # ------ Tokenizing ---------
    def remove_tweet_special(text):
        # remove tabs, new lines and backslashes
        text = text.replace('\\t', " ").replace('\\n', " ").replace('\\u', " ").replace('\\', "")
        # remove non-ASCII characters (emoticons, Chinese characters, etc.)
        text = text.encode('ascii', 'replace').decode('ascii')
        # remove mentions, links, hashtags
        text = ' '.join(re.sub("([@#][A-Za-z0-9]+)|(\w+:\/\/\S+)", " ", text).split())
        # remove incomplete URLs
        return text.replace("http://", " ").replace("https://", " ")

    DATA['Hasil'] = DATA['Pesan'].apply(remove_tweet_special)

    # remove numbers
    def remove_number(text):
        return re.sub(r"\d+", "", text)

    DATA['Hasil'] = DATA['Hasil'].apply(remove_number)

    # remove punctuation; maketrans with two string arguments needs them to be
    # the same length, so every punctuation character maps to a space
    def remove_punctuation(text):
        return text.translate(
            str.maketrans(string.punctuation, " " * len(string.punctuation)))

    DATA['Hasil'] = DATA['Hasil'].apply(remove_punctuation)

    # remove leading & trailing whitespace
    def remove_whitespace_LT(text):
        return text.strip()

    DATA['Hasil'] = DATA['Hasil'].apply(remove_whitespace_LT)

    # collapse multiple whitespace into a single space
    def remove_whitespace_multiple(text):
        return re.sub('\s+', ' ', text)

    DATA['Hasil'] = DATA['Hasil'].apply(remove_whitespace_multiple)

    # remove single characters
    def remove_singl_char(text):
        return re.sub(r"\b[a-zA-Z]\b", "", text)

    DATA['Hasil'] = DATA['Hasil'].apply(remove_singl_char)

    # NLTK word tokenize
    def word_tokenize_wrapper(text):
        return word_tokenize(text)

    DATA['Hasil_tokens'] = DATA['Hasil'].apply(word_tokenize_wrapper)

    def unique(document):
        unique_word = set()
        for i in document:
            unique_word = unique_word.union(i)
        return unique_word

    normalizad_word = pd.read_excel("../Normalisasi.xlsx")
    normalizad_word_dict = {}
    for index, row in normalizad_word.iterrows():
        if row[0] not in normalizad_word_dict:
            normalizad_word_dict[row[0]] = row[1]

    def normalized_term(document):
        return [
            normalizad_word_dict[term] if term in normalizad_word_dict else term
            for term in document
        ]

    DATA['Normal'] = DATA['Hasil_tokens'].apply(normalized_term)

    # get the Indonesian stopword list
    list_stopwords = set(stopwords.words('indonesian'))
    list_stopwords.remove("naik")

    # remove stopwords from each token list
    def stopword(text):
        tokens_without_stopword = [word for word in text if not word in list_stopwords]
        return tokens_without_stopword

    DATA['Normal'] = DATA['Normal'].apply(stopword)

    # create stemmer
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()

    # stem every token
    def stem(text):
        output = [stemmer.stem(token) for token in text]
        return output

    ready = pd.DataFrame()
    DATA['Normal'] = DATA['Normal'].apply(stem)
    ready['Normal'] = DATA['Normal']
    ready['Status'] = DATA['Status']
    # return ready
    ready.to_csv('readytfidf.csv', index=False)
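# Illustrative call (an assumption): Preprocessing() prepends the new message
# to the saved dataframe and writes the cleaned result to readytfidf.csv.
Preprocessing("harga bbm naik lagi bulan ini")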
def _load_sastrawi():
    from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
    global factory, sastrawi_stemmer
    factory = StemmerFactory()
    sastrawi_stemmer = factory.create_stemmer()
from collections import Counter

akun = ['548904824', '255409050', '480224156', '63433517', '82552414',
        '61379637', '79994423', '47251716', '260043508']
# ['@IndosatCare','@Telkomsel','@myXLCare','@triindonesia','@myXL','@IM3Ooredoo','@AXISgsm','@ask_AXIS','@simPATI']
kata_kunci = ['lambat', 'lelet', 'lola', 'lemot', 'koneksi', 'gsm', '3g',
              '4g', 'hsdpa', 'edge', 'jaring', 'ganggu']

cred = credentials.Certificate('kunci2.json')
firebase_admin.initialize_app(cred)
db = firestore.client()
tweet_ref = db.collection('Tweet')
kata_ref = db.collection("kata_kunci")
last_ref = db.collection("lasttweet")

factory = StemmerFactory()
stemmer = factory.create_stemmer()

def tweetstruct(user, text, t):
    data = {
        'username': user,
        'text': text,
        'time': t,
    }
    return data

def storetweet(id, input):
    try:
        ref = tweet_ref.document(id)
        ref.set(input)
    except Exception:
        # the source is truncated here; ignoring write failures is an assumption
        pass
def rmStem(pars):
    factory = StemmerFactory()
    stripped = strip_tags(pars)
    stemmer = factory.create_stemmer()
    clean = stemmer.stem(str(stripped))  # stemming
    return clean