Example #1
def pre_processing(doc):
	datas = {}

	# stemming with Sastrawi
	factory = StemmerFactory()
	stemmer = factory.create_stemmer()

	# load the stopword list once from stopwords.txt
	# stopWords = nltk.corpus.stopwords.words('english') + ['yang','dengan']
	stopwords = open('stopwords.txt', 'r').read().split()

	# stopword removal and tokenization
	for index, kalimat in enumerate(doc):
		data = []
		# split the sentence into tokens using NLTK
		tokenisasi = nltk.word_tokenize(kalimat)
		for idx, word in enumerate(tokenisasi):
			# keep and stem the word only if it is not in the stopword list;
			# words that appear in stopwords.txt are simply dropped
			if word not in stopwords:
				data.append(stemmer.stem(word))
		datas[index] = " ".join(data)
		dataku = " ".join(data)
		# append the cleaned comment to komentar_bersih.txt
		with open("komentar_bersih.txt", "a") as file:
			file.write("%s\n" % dataku)
	# create a file to store the cleaned comment data (optional)
	# file = open("komentar_bersih.json", "w")
	# file.write("%s\n" %datas)
	# file.close()
	return datas
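A minimal usage sketch for pre_processing above (not part of the original example). It assumes pre_processing is defined as shown, that a stopwords.txt file exists in the working directory, and that NLTK's 'punkt' tokenizer data has been downloaded:

import nltk
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# nltk.download('punkt')  # uncomment on first run
komentar = [
    "Aplikasinya sangat membantu sekali",
    "Pengirimannya lambat dan mengecewakan",
]
hasil = pre_processing(komentar)
print(hasil)  # dict of index -> stemmed, stopword-free comment text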
    def test_fungsional(self):
        factory = StemmerFactory()
        stemmer = factory.create_stemmer()

        sentence = 'malaikat-malaikat-Nya'
        expected = 'malaikat'
        output = stemmer.stem(sentence)

        if output != expected:
            raise AssertionError(str.format('output is {} instead of {}', output, expected))
Example #3
    def post(self):
        data = json.loads(self.request.body)
        text = data['text'].encode('utf8')

        # create stemmer
        factory = StemmerFactory()
        stemmer = factory.create_stemmer()

        # stemming process
        output   = stemmer.stem(text)

        self.response.out.write(json.dumps({'output': output}))
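For reference, a hedged sketch of how a handler like the one above might be called once deployed; the endpoint URL is hypothetical and not part of the original example:

import json
import requests  # assumes the requests package is available

# hypothetical URL where the stemming handler is mounted
resp = requests.post('http://localhost:8080/stem',
                     data=json.dumps({'text': 'perekonomian yang membanggakan'}))
print(resp.json()['output'])  # the stemmed sentence returned by the handler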
class Test_StemmerFactoryTest(unittest.TestCase):
    def setUp(self):
        self.factory = StemmerFactory()
        return super(Test_StemmerFactoryTest, self).setUp()

    def test_createStemmerReturnStemmer(self):
        stemmer = self.factory.create_stemmer()
        self.assertIsNotNone(stemmer)
        #self.assertIsInstance(stemmer, Stemmer)

    def test_fungsional(self):
        factory = StemmerFactory()
        stemmer = factory.create_stemmer()

        sentence = 'malaikat-malaikat-Nya'
        expected = 'malaikat'
        output = stemmer.stem(sentence)

        if output != expected:
            raise AssertionError(str.format('output is {} instead of {}', output, expected))

    def test_getWordsFromFile(self):
        factory = StemmerFactory()
        factory.get_words_from_file()
Example #5
def load_stemmer():
    factory = StemmerFactory()
    return factory.create_stemmer()
rawdata = []
for j in range(0, 8):
    x = open(str(j + 1) + '.txt', 'r').read()
    rawdata.append(x.replace('\n', ' '))

import nltk
from nltk.tokenize import word_tokenize as token
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import string, numpy as np

ST = StemmerFactory()
stemmer = ST.create_stemmer()
SW = StopWordRemoverFactory()
stop_word = SW.get_stop_words()

#rawdata
print('rawdata')
print(rawdata)

doc = []
for i in rawdata:
    temp = []
    for j in token(i):
        word = stemmer.stem(str.lower(j))
        #if word not in stop_word and len(word) > 2 and not word.startswith(tuple(string.punctuation)+tuple([str(k) for k in range(10)])+tuple('¿')):
        temp.append(word)
    doc.append(temp)

dictionary = []
for i in doc:
Example #7
class SpellCorrector:

    NEWLINE = '\n'
    SKIP_FILES = {'cmds'}
    CORPUS_PATH = os.path.join(os.path.dirname(__file__), 'corpus/questions/')

    __control_dict = {}

    def __init__(self,
                 train=False,
                 save=False,
                 corpus_path=CORPUS_PATH,
                 threshold=2):

        self.slang_dict = pickle.load(
            open(
                os.path.join(os.path.dirname(__file__),
                             "pickled/_slang_words.p"), "rb"))
        self.slang_dict['dr'] = 'dari'
        self.slang_dict['k'] = 'ke'
        self.slang_dict['sc'] = 'sesar'

        if train:
            create_dictionary.main()
            self.words = self.__words(corpus_path)
            self.counter = self.__counter(self.words)
            self.model = model.LanguageModel(corpus_path=corpus_path)
        else:
            self.words = pickle.load(
                open(
                    os.path.join(os.path.dirname(__file__),
                                 "pickled/_spell_words.p"), "rb"))
            self.counter = pickle.load(
                open(
                    os.path.join(os.path.dirname(__file__),
                                 "pickled/_spell_counter.p"), "rb"))
            self.model = model.LanguageModel(load=True)

        try:
            for key in self.counter:
                if self.counter[key] <= threshold:
                    self.words.remove(key)
        except:
            pass

        self.candidates_dict = {}

        # maximum edit distance per dictionary precalculation
        max_edit_distance_dictionary = 2
        prefix_length = 7

        # create object
        self.sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
        self.factory = StemmerFactory()
        self.stemmer = self.factory.create_stemmer()
        # load dictionary
        dictionary_path = os.path.join(os.path.dirname(__file__),
                                       "corpus/dictionary/dictionary.txt")
        # dictionary_path = os.path.join(os.path.dirname(__file__), "corpus/symspellpy/frequency_dictionary_en_82_765.txt")
        term_index = 0  # column of the term in the dictionary text file
        count_index = 1  # column of the term frequency in the dictionary text file
        if not self.sym_spell.load_dictionary(
                dictionary_path, term_index, count_index, encoding="utf-8"):
            print("Dictionary file not found")
            return

        if save:
            self.save()

    def __read_files(self, path):
        for root, dir_names, file_names in os.walk(path):
            for path in dir_names:
                self.__read_files(os.path.join(root, path))
            for file_name in file_names:
                if file_name not in SpellCorrector.SKIP_FILES:
                    file_path = os.path.join(root, file_name)
                    if os.path.isfile(file_path):
                        lines = []
                        f = open(file_path, encoding='latin-1')
                        for line in f:
                            lines.append(line)
                        f.close()
                        content = SpellCorrector.NEWLINE.join(lines)
                        yield file_path, content

    def __words(self, corpus_path):
        words = []
        for file_name, text in self.__read_files(corpus_path):
            print("process data => " + file_name)
            words += re.findall(r'\w+', text.lower())
        return words

    def __counter(self, words):
        return Counter(words)

    def __wordProb(self, word):
        "Probability of `word`."
        return self.counter[word] / sum(self.counter.values())

    def correction(self, word):
        "Most probable spelling correction for word."
        return max(self.candidates(word), key=self.__wordProb)

    def candidates(self, word, debug=False):
        "Generate possible spelling corrections for word."
        if self.candidates_dict.get(word):
            return self.candidates_dict[word]
        else:
            # max edit distance per lookup
            # (max_edit_distance_lookup <= max_edit_distance_dictionary)
            max_edit_distance_lookup = 2
            suggestion_verbosity = Verbosity.CLOSEST  # TOP, CLOSEST, ALL
            suggestions = self.sym_spell.lookup(word, suggestion_verbosity,
                                                max_edit_distance_lookup)

            # cache it
            if SpellCorrector.__control_dict.get(word) is not None:
                candidates_0 = (self.__known([word])
                                | self.__known(self.__edits1(word))
                                | self.__known(self.__edits2(word))
                                | self.__known(self.__edits3(word))
                                | {SpellCorrector.__control_dict.get(word)}
                                | {word})
            else:
                candidates_0 = (self.__known([word])
                                | self.__known(self.__edits1(word))
                                | self.__known(self.__edits2(word))
                                | self.__known(self.__edits3(word)) | {word})
            candidates_1 = set(suggestion.term for suggestion in suggestions)
            candidates = candidates_0.union(candidates_1)

            # print(candidates)

            self.candidates_dict[word] = candidates
            return candidates

    def __known(self, words):
        "The subset of `words` that appear in the dictionary of WORDS."
        return set(w for w in words if w in self.counter)

    def __edits1(self, word):
        "All edits that are one edit away from `word`."
        letters = 'aiueon'
        splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
        inserts = [L + c + R for L, R in splits for c in letters]
        return set(inserts)

    def __edits2(self, word):
        "All edits that are two edits away from `word`."
        return (e2 for e1 in self.__edits1(word) for e2 in self.__edits1(e1))

    def __edits3(self, word):
        return (e3 for e1 in self.__edits1(word) for e2 in self.__edits1(e1)
                for e3 in self.__edits1(e2))

    def save(self, python2=False):
        if python2 is False:
            pickle.dump(
                self.words,
                open(
                    os.path.join(os.path.dirname(__file__),
                                 "pickled/_spell_words.p"), "wb"))
            pickle.dump(
                self.counter,
                open(
                    os.path.join(os.path.dirname(__file__),
                                 "pickled/_spell_counter.p"), "wb"))
            self.model.save()
        else:
            pickle.dump(self.words,
                        open(
                            os.path.join(os.path.dirname(__file__),
                                         "pickled/_spell_words.p"), "wb"),
                        protocol=2)
            pickle.dump(self.counter,
                        open(
                            os.path.join(os.path.dirname(__file__),
                                         "pickled/_spell_counter.p"), "wb"),
                        protocol=2)
            self.model.save()

    # TODO: implement mechanism to calculate lambda for interpolation
    def __trigram_interpolation(self, w1, w2, w3):
        lambda1 = 0.75
        lambda2 = 0.20
        lambda3 = 0.05
        return (lambda1 * self.model.sentence_prob('{} {} {}'.format(
            w1, w2, w3))) + (lambda2 * self.model.sentence_prob('{} {}'.format(
                w2, w3))) + (lambda3 * self.model.unigram_prob(w3))

    # TODO: implement mechanism to calculate lambda for interpolation
    def __bigram_interpolation(self, w1, w2):
        lambda1 = 0.80
        lambda2 = 0.20
        return (lambda1 * self.model.sentence_prob('{} {}'.format(w1, w2))) + (
            lambda2 * self.model.unigram_prob(w2))

    def __clean_text(self, words):

        cleaned_words = []
        for word in words:
            if word in self.slang_dict:
                word = self.clean_punc(self.slang_dict[word])
            word = re.sub('^days$', 'hari', word)
            word = re.sub('^day$', 'hari', word)
            word = re.sub('^weeks$', 'minggu', word)
            word = re.sub('^week$', 'minggu', word)
            word = re.sub('^months$', 'bulan', word)
            word = re.sub('^month$', 'bulan', word)
            word = re.sub('^years$', 'tahun', word)
            word = re.sub('^year$', 'tahun', word)
            word = re.sub('(?<=\d)tahun', ' tahun ', word).strip()
            word = re.sub('(?<=\d)bulan', ' bulan ', word).strip()
            word = re.sub('(?<=\d)minggu', ' minggu ', word).strip()
            word = re.sub('(?<=\d)hari', ' hari ', word).strip()
            word = re.sub('(?<=\d)jam', ' jam ', word).strip()
            word = re.sub('(?<=\d)detik', ' detik ', word).strip()
            word = re.sub('(?<=\d)th(?=($|\d+))', ' tahun ', word).strip()
            word = re.sub('(?<=\d)thn(?=($|\d+))', ' tahun ', word).strip()
            word = re.sub('(?<=\d)yrs(?=($|\d+))', ' tahun ', word).strip()
            word = re.sub('(?<=\d)bln(?=($|\d+))', ' bulan ', word).strip()
            word = re.sub('(?<=\d)mggu(?=($|\d+))', ' minggu ', word).strip()
            word = re.sub('(?<=\d)mg(?=($|\d+))', ' minggu ', word).strip()
            word = re.sub('(?<=\d)d(?=($|\d+))', ' hari ', word).strip()
            word = re.sub('(?<=\d)w(?=($|\d+))', ' minggu ', word).strip()
            word = re.sub('(?<=\d)wk(?=($|\d+))', ' minggu ', word).strip()
            word = re.sub('(?<=\d)m(?=($|\d+))', ' bulan ', word).strip()
            word = re.sub('(?<=\d)jm(?=($|\d+))', ' jam ', word).strip()
            word = re.sub('(?<=\d)h(?=($|\d+))', ' hari ', word).strip()

            # separate time expressions from the surrounding word, e.g. hariini --> hari ini
            if re.match("(tahun|bulan|minggu|hari|menit|detik)\w+",
                        word) is not None:
                word = re.search(
                    "(tahun|bulan|minggu|hari|menit|detik)(?=\w+)", word
                ).group(0) + ' ' + re.search(
                    "(?:(?<=tahun)|(?<=bulan)|(?<=minggu)|(?<=hari)|(?<=menit)|(?<=detik))\w+",
                    word).group(0)

            # change ke2/k2 or ke(number)/k(number) --> ke 2 / k 2
            if re.match("(ke)\d", word) is not None:
                word = re.search("(ke)(?=\d)",
                                 word).group(0) + ' ' + re.search(
                                     "(?<=ke)\d", word).group(0)

            if re.match("(k)\d", word) is not None:
                word = re.search("(k)(?=\d)", word).group(0) + ' ' + re.search(
                    "(?<=k)\d", word).group(0)

            # expand reduplication, e.g. kata2 --> kata kata
            if re.match("[a-z]+2$", word) is not None:
                word = word[:-1] + ' ' + word[:-1]

            # split a leading number from the word, e.g. 2kata --> 2 kata
            if re.match("^\d+[a-z]+$", word) is not None:
                word = re.search("\d+(?=\w+)",
                                 word).group(0) + ' ' + re.search(
                                     "(?<=\d)\w+", word).group(0)

            # expand reduplication with a suffix, e.g. kata2nya --> kata katanya
            if re.match("^\w+2\w+$", word) is not None:
                word = re.search("^\w+(?=2)", word).group(0) + ' ' + re.search(
                    "^\w+(?=2)", word).group(0) + re.search("(?<=2)\w+",
                                                            word).group(0)

            # split words ending in dok (except halodok, alodok, sendok, gondok), e.g. sayadok --> saya dok
            if re.match("(?<!halo)(?<!alo)(?<!sen)(?<!gon)dok$",
                        word) is not None:
                word = word[:-3] + ' ' + word[-3:]

            # split words ending in dokter (except halodokter, alodokter), e.g. sayadokter --> saya dokter
            if re.match("(?<!halo)(?<!alo)dokter$", word) is not None:
                word = word[:-6] + ' ' + word[-6:]

            # split words starting with dok, e.g. doksaya --> dok saya
            if re.match("^dok(?!ter)\w+", word) is not None:
                word = word[:3] + ' ' + word[3:]

            # split words starting with dokter, e.g. doktersaya --> dokter saya
            if re.match("^dokter\w+", word) is not None:
                word = word[:6] + ' ' + word[6:]

            # change 20x or (number)x --> 20 kali / (number) kali
            if re.match("\d+x$", word) is not None:
                word = word[:-1] + ' kali'

            if re.match("\w+x$", word) is not None:
                word = word[:-1] + 'nya'

            cleaned_words.append(word)

        cleaned = ' '.join(cleaned_words).split()

        return cleaned

    def normalize(self, sentence):

        cleaned = sentence

        if re.match("[a-zA-Z0-9 ]+\d \d bulan [a-zA-Z0-9 ]+",
                    cleaned) is not None:
            cleaned = re.search("[a-zA-Z0-9 ]+\d (?=\d bulan [a-zA-Z0-9 ]+)",cleaned).group(0) +\
            re.search("(?<=[a-zA-Z0-9 ]\d \d )bulan [a-zA-Z0-9 ]+",cleaned).group(0)

        if re.match("(?<=\w\s)x(?=\s)", cleaned) is not None:
            cleaned = re.search("[a-zA-Z ]+(?=\sx\s)",cleaned).group(0) + 'nya ' +\
            re.search("(?<=\sx\s)[a-zA-Z ]+",cleaned).group(0)

        cleaned = self.stemmer.stem(cleaned)

        return cleaned

    def clean_punc(self, sentence):
        translator = str.maketrans({key: ' ' for key in string.punctuation})
        words = [
            token.translate(translator).strip()
            for token in sentence.lower().split()
        ]
        words = ' '.join(words)
        words = [x.strip().lower() for x in words.split() if x.strip()]

        return ' '.join(words)

    def generate_candidates(self, sentence):
        # The method translate() returns a copy of the string in which all characters have been translated
        # using table (constructed with the maketrans() function in the str module),
        # optionally deleting all characters found in the string deletechars.
        translator = str.maketrans({key: ' ' for key in string.punctuation})
        words = [
            token.translate(translator).strip()
            for token in sentence.lower().split()
        ]
        words = ' '.join(words)
        words = [x.strip().lower() for x in words.split()
                 if x.strip()]  # drop all empty strings from the list

        valid = {}
        for idx, word in enumerate(words):
            if word not in self.words:
                valid[word.lower()] = 'correction_here'

        return valid

    def validate(self,
                 sentence,
                 debug=False,
                 return_candidates=False,
                 return_full_words=False):
        # The method translate() returns a copy of the string in which all characters have been translated
        # using table (constructed with the maketrans() function in the str module),
        # optionally deleting all characters found in the string deletechars.
        translator = str.maketrans({key: ' ' for key in string.punctuation})
        words = [
            token.translate(translator).strip()
            for token in sentence.lower().split()
        ]
        words = ' '.join(words)
        words = [x.strip().lower() for x in words.split()
                 if x.strip()]  # drop all empty strings from the list

        full_words = {}
        prediction_candidates = {}
        valid = []
        for word in words:
            if word in self.words:
                valid.append(word.lower())

                full_words[word] = word
            else:
                list_words = self.__clean_text([word])
                valid_ = []

                for idx, word_ in enumerate(list_words):
                    candidates = self.candidates(word_.lower())
                    if idx == 0:
                        max_word = max(
                            [w for w in candidates],
                            key=lambda word_: self.model.unigram_prob(word_))
                        valid_.append(max_word)
                        if debug:
                            print('candidates for ' + word_ + ': ' +
                                  str(candidates) + ', max prob word is ' +
                                  max_word.lower())

                    elif idx == 1:
                        max_word = max(
                            [w for w in candidates],
                            key=lambda word_: self.__bigram_interpolation(
                                valid_[0], word_))
                        valid_.append(max_word)
                        if debug:
                            print('candidates for ' + word_ + ': ' +
                                  str(candidates) + ', max prob word is ' +
                                  max_word.lower())

                    else:
                        max_word = max(
                            [w for w in candidates],
                            key=lambda word_: self.__trigram_interpolation(
                                valid_[idx - 2], valid_[idx - 1], word_))
                        valid_.append(max_word)
                        if debug:
                            print('candidates for ' + word_ + ': ' +
                                  str(candidates) + ', max prob word is ' +
                                  max_word.lower())

                if ' '.join(valid_) == 'terimakasih':
                    valid.append('terima kasih')
                    prediction_candidates[word] = 'terima kasih'
                else:
                    valid.append(' '.join(valid_))
                    prediction_candidates[word] = ' '.join(valid_)

                full_words[word] = ' '.join(valid_)

        if return_candidates:
            return prediction_candidates
        if return_full_words:
            return full_words
        else:
            return ' '.join(valid)
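A hedged usage sketch for the SpellCorrector class above (not part of the original example); it assumes the pickled models and dictionary files referenced in __init__ are present alongside the module:

# load the pre-trained models (train=False is the default) and normalize a noisy sentence
sc = SpellCorrector()
print(sc.validate('dok sya sudah 2minggu batuk', debug=True))
print(sc.generate_candidates('dok sya sudah 2minggu batuk'))  # out-of-vocabulary words needing correction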
Example #8
    def stem(self, string):
        # create stemmer
        factory = StemmerFactory()
        stemmer = factory.create_stemmer()
        output = stemmer.stem(string)
        return output
Example #9
import re
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
indo = stopwords.words('indonesian')

# preprocessing
dataset = pd.read_csv('training_gojek_yy.csv')
# create the stemmer once, outside the loop
psi = StemmerFactory()
ps = psi.create_stemmer()
corpus = []
for i in range(0, len(dataset)):
    # re.sub removes / replaces every non-letter character
    review = re.sub('[^a-zA-Z]', ' ', dataset['komentar'][i])
    review = review.lower()
    review = review.split()
    # drop stopwords and stem the remaining words
    review = [ps.stem(word) for word in review if not word in indo]
    print(i)
    review = ' '.join(review)
    corpus.append(review)


class Analis:
    def __init__(self, training):
        self.training = training

        #tf-idf
        articles = np.array(corpus)
        labels = np.array(dataset['sentimen'])

        self.tf_vectorizer = TfidfVectorizer(min_df=4,
Example #10
    def setUp(self):
        stemmerFactory = StemmerFactory()
        self.stemmer = stemmerFactory.create_stemmer()
        return super(Test_StemmerTest, self).setUp()
Example #11
def stemming(document):
    # stemming process
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    return stemmer.stem(document)
Example #12
    def stemming(text):
        factory = StemmerFactory()
        stemmer = factory.create_stemmer()
        return [stemmer.stem(x) for x in text]
Example #13
def stemming(tweet):
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    tweetClean = stemmer.stem(tweet)
    return tweetClean
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

import string

factory_stopwords = StopWordRemoverFactory()
stopwords = factory_stopwords.get_stop_words()

factory_stemmer = StemmerFactory()
stemmer = factory_stemmer.create_stemmer()


def clean_text(text):

    # removing punctuation
    for c in string.punctuation:
        text = text.replace(c, "")

    # removing excessive whitespace
    text = " ".join(text.split())

    # split the text into a list of words
    words = text.split()

    # removing stopwords
    words = [word for word in words if word not in stopwords]

    # stem each remaining word in the query
    words = [stemmer.stem(word) for word in words]

    return words
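A short usage sketch for clean_text above (not in the original source), with an arbitrary Indonesian sentence:

print(clean_text("Pemerintah sedang mempercepat pembangunan jalan!"))
# a list of stemmed tokens with stopwords removed; exact output depends on the Sastrawi word lists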
Example #15
def stem(data):
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    return data.apply(lambda x: [stemmer.stem(item) for item in x])
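A minimal usage sketch for stem(data) above; it assumes pandas is installed and that every cell already contains a list of tokens:

import pandas as pd

tokens = pd.Series([['pembangunan', 'berjalan'], ['kebijakan', 'pemerintahan']])
print(stem(tokens))  # each token replaced by its Sastrawi base form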
Example #16
def chatbot():
    # create stemmer
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()

    model = load_model('chatbot_model.h5')

    intents = json.loads(open('intents.json').read())
    words = pickle.load(open('words.pkl', 'rb'))
    classes = pickle.load(open('classes.pkl', 'rb'))

    def clean_up_sentence(sentence):
        # tokenize the pattern - split words into array
        sentence_words = nltk.word_tokenize(sentence)
        # stem each word - create short form for word
        sentence_words = [
            stemmer.stem(word.lower()) for word in sentence_words
        ]
        return sentence_words

    # return bag of words array: 0 or 1 for each word in the bag that exists in the sentence

    def bow(sentence, words, show_details=True):
        # tokenize the pattern
        sentence_words = clean_up_sentence(sentence)
        # bag of words - matrix of N words, vocabulary matrix
        bag = [0] * len(words)
        for s in sentence_words:
            for i, w in enumerate(words):
                if w == s:
                    # assign 1 if current word is in the vocabulary position
                    bag[i] = 1
                    if show_details:
                        print("found in bag: %s" % w)
        return (np.array(bag))

    def predict_class(sentence, model):
        # filter out predictions below a threshold
        p = bow(sentence, words, show_details=False)
        res = model.predict(np.array([p]))[0]
        ERROR_THRESHOLD = 0.25
        results = [[i, r] for i, r in enumerate(res) if r > ERROR_THRESHOLD]
        # sort by strength of probability
        results.sort(key=lambda x: x[1], reverse=True)
        return_list = []
        for r in results:
            return_list.append({
                "intent": classes[r[0]],
                "probability": str(r[1])
            })
        return return_list

    def getResponse(ints, intents_json):
        tag = ints[0]['intent']
        list_of_intents = intents_json['intents']
        for i in list_of_intents:
            if (i['tag'] == tag):
                result = random.choice(i['responses'])
                break
        return result

    def chatbot_response(msg):
        ints = predict_class(msg, model)
        res = getResponse(ints, intents)
        return res

    return chatbot_response(request.json['message'])
Example #17



# get the Indonesian stop words
list_stopwords = set(stopwords.words('indonesian'))
# remove stopwords from the token list
print("Mulai remove stopwords")
def stopword(text):
  tokens_without_stopword = [word for word in text if not word in list_stopwords]
  return tokens_without_stopword

DATA['Normal'] = DATA['Normal'].apply(stopword)

# create stemmer
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# stem
print("Mulai stem")
def stem(text):
  output   = [stemmer.stem(token) for token in text]
  return output

ready = pd.DataFrame()

DATA['Normal'] = DATA['Normal'].apply(stem)
print("Selesai Stem")
ready['Normal'] = DATA['Normal']
ready['Status'] = DATA['Status']
ready.to_csv('readytfidf.csv', index=False)
Example #18
def stemmer_fac(string):
    fac = StemmerFactory()
    stem_cr = fac.create_stemmer()
    return stem_cr.stem(string)
Example #19
import pandas as pd
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

factoryStem = StemmerFactory()
stemmer = factoryStem.create_stemmer()

factoryStop = StopWordRemoverFactory()
stopper = factoryStop.create_stop_word_remover()

xData = []
yData = []

rawDatasets = pd.read_csv('dataset/dataset_pool.csv', delimiter=',')

count = 0

for k in rawDatasets['Kalimat']:
    if count % 100 == 0:
        print(count)
    sentStemmed = stemmer.stem(k)

    sentStopped = sentStemmed
    '''
    temp = stopper.remove(k)

    while temp != sentStopped:
        sentStopped = temp
        temp = stopper.remove(sentStopped)
    '''
Example #20
    def __init__(self):
        from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

        factory = StemmerFactory()
        self.sastrawi_stemmer = factory.create_stemmer()
def stemmer(text):  # input: a text string
    factory = StemmerFactory()  # instantiate the stemmer factory
    stemmer = factory.create_stemmer()  # create stemmer
    text = text.lower()  # convert the whole text to lowercase
    stem_text = stemmer.stem(text)  # stemming
    return stem_text  # output: stem_text (each word reduced to its base form)
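A short usage sketch for the stemmer() helper above, using a sentence commonly seen in Sastrawi documentation; the commented output is what the Sastrawi stemmer typically produces:

print(stemmer('Perekonomian Indonesia sedang dalam pertumbuhan yang membanggakan'))
# typically: 'ekonomi indonesia sedang dalam tumbuh yang bangga'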
import pandas as pd
import re
import numpy as np
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

stem_factory = StemmerFactory()
stemmer = stem_factory.create_stemmer()

stop_factory = StopWordRemoverFactory()
stopword = stop_factory.create_stop_word_remover()

data = pd.read_csv('hasil11.csv')
data = data[['Label', 'Isi']]


def convert(polarity):
    if polarity == 'Positif':
        return 1
    elif polarity == 'Netral':
        return 0
    else:
        return -1
from pattern.it import lemma as lemma_it
from nltk.stem.isri import ISRIStemmer
from nltk.stem import RSLPStemmer
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk import word_tokenize
from nltk.stem.snowball import SnowballStemmer
import tinysegmenter
from analyzer.kg_export.language.kazlemmatizer import kazakh_lemma_tokenizer

use_compound_split_german = False
if use_compound_split_german:
    import LanguageDetection

stem_ar = ISRIStemmer()  # Arabic stemmer
factory = StemmerFactory()
sastrawi_stemmer = factory.create_stemmer()  # Indonesian stemmer
stem_pt = RSLPStemmer()  # Portuguese (Brazilian) stemmer
stem_ja = tinysegmenter.TinySegmenter()
stem_nl = SnowballStemmer('dutch')
stem_ru = SnowballStemmer('russian')
stem_sv = SnowballStemmer('swedish')
stem_fr = SnowballStemmer('french')
stem_de = SnowballStemmer('german')


def read_file(filename):
    try:
        with open(filename, "r") as file_dp:
            data = json.load(file_dp)
            return data
    except Exception:
Example #24
    def setup_library(self):
        stemmerFactory = StemmerFactory()
        self.stemmer = stemmerFactory.create_stemmer()
Example #25
def stemmerFactory(text):
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    text = stemmer.stem(text)

    return text
Example #26
class Sistem:
    factory = None
    stemmer = None

    def __init__(self):
        # initialize the Sastrawi stemmer
        self.factory = StemmerFactory()
        self.stemmer = self.factory.create_stemmer()

    # Preprocessing step
    def clean_text(self, text):
        # Tokenize
        words = word_tokenize(text.lower())
        temp = str(words)
        # Remove numbers
        stripped = re.sub(r'\d+', '', temp)
        # Remove '.com', 'www', and escaped newlines
        stripped = re.sub(r'\.com', '', stripped)
        stripped = re.sub(r'www', '', stripped)
        stripped = re.sub(r'\\n', '', stripped)
        # Remove HTML tags
        stripped = re.sub("&lt;/?.*?&gt;", " &lt;&gt; ", stripped)
        # Stemming
        temp = [self.stemmer.stem(stripped)]
        # Stopword removal
        stop_words = set(stopwords.words('indonesian'))
        temp = [j for i in temp for j in i.split() if j not in stop_words]
        temp = ' '.join(temp)
        return temp

    # Title processing step
    def proses_judul(self, judul):
        judul = self.clean_text(judul)
        hasil = str(judul)
        # print(hasil)
        remove = hasil.replace(':', '')
        kalimat = remove.replace('dtype', '')
        kalimat = kalimat.replace('Name', '')
        kalimat = kalimat.replace('Judul', '')
        kalimat = kalimat.replace('object', '')
        remove = kalimat.replace(',', '')
        remove = re.sub(r'\d+', '', remove)
        words = word_tokenize(remove)

        stop_words = set(stopwords.words('indonesian'))

        tampung_judul = []
        for x in words:
            if x not in stop_words:
                tampung_judul.append(x)
        return tampung_judul
        # print(tampung_judul)

    # Content processing step
    def proses_isi(self, isi):
        isi = self.clean_text(isi)
        hasil = str(isi)
        remove = hasil.replace(':', '')
        remove = re.sub(r'\d+', '', remove)
        words = word_tokenize(remove)

        stop_words = set(stopwords.words('indonesian'))

        tampung_isi = []
        for y in words:
            if y not in stop_words:
                tampung_isi.append(y)
        return tampung_isi
        # print(tampung_isi)

    # Look up word meanings (synonyms) for the title words
    def mencari_makna(self, judul, isi):
        judul = self.proses_judul(judul)
        # print(judul)
        isi_berita = self.proses_isi(isi)
        # print(isi_berita)
        synonyms = []
        result = []
        # hasil = []
        list_sinonim = []
        for i in range(0, len(judul)):
            kata = judul[i]
            for syn in wn.synsets(kata, lang="ind"):
                for l in syn.lemmas(lang="ind"):
                    hasil1 = str(l.name())
                    stem = [self.stemmer.stem(hasil1)]
                    stop_words = set(stopwords.words('indonesian'))
                    temp = [
                        j for i in stem for j in i.split()
                        if j not in stop_words
                    ]  # each word is split on spaces and kept only if it is not a stopword
                    temp = ' '.join(temp)
                    pisah_kata = word_tokenize(temp)
                    for z in range(len(pisah_kata)):
                        synonyms.append(pisah_kata[z])

            for word in synonyms:
                if word not in result:
                    result.append(word)

            list_sinonim.append([])
            list_sinonim[i].append(judul[i])
            for j in range(len(result)):
                list_sinonim[i].append(result[j])

            synonyms = []
            result = []

        for a in range(len(isi_berita)):
            for b in range(len(list_sinonim)):
                for j in range(len(list_sinonim[b])):
                    if list_sinonim[b][j] == isi_berita[a]:
                        isi_berita[a] = list_sinonim[b][0]

        isi_bersih = ''
        for i in range(len(isi_berita)):
            if (i == 0):
                isi_bersih = isi_bersih + str(isi_berita[i])
            else:
                isi_bersih = isi_bersih + ' ' + str(isi_berita[i])

        return isi_bersih

    # Cosine similarity step
    def cosine_sim(self, text1, text2):
        vectorizer = TfidfVectorizer(analyzer='word')
        train_vectors = vectorizer.fit_transform([text1, text2])
        #print(train_vectors)
        test_vectors = vectorizer.transform([text1, text2])
        return ((train_vectors * train_vectors.T).A)[0, 1]

    def checkup_single(self, params):

        judul = self.clean_text(params.get('judul'))
        isi = self.mencari_makna(params.get('judul'), params.get('isi'))

        # format response
        fmt_response = {}

        # empty output
        hasil = []
        #hasil_cosine = self.cosine_sim(params.get('judul'), params.get('isi'))
        hasil_cosine = self.cosine_sim(judul, isi)
        hasil.append(hasil_cosine)

        #y_pred = []
        for data in hasil:
            if data > 0.4:
                fmt_response['status'] = 'Non-clickbait'
                #temp = 0
            else:
                fmt_response['status'] = 'Clickbait'
        #temp = 1
        fmt_response['procentage'] = math.trunc(data * 100)

        return fmt_response
def stem_words(words):
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    all_words = [stemmer.stem(word) for word in words]
    return all_words
Example #28
def cleanText(T,
              fix={},
              lemma=None,
              stops=set(),
              symbols_remove=False,
              min_charLen=2,
              fixTag=True,
              user_remove=True):
    # lang & stops: only 2 options, 'en' or 'id'
    # symbols: ASCII or alnum
    penerjemah = Translator()
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    pattern = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )
    t = re.sub(pattern, ' ', str(T))  #remove urls if any
    t = unescape(t)  # html entities fix
    if fixTag:
        t = fixTags(t)  # fix abcDef
    t = t.lower().strip()  # lowercase
    t = unidecode(t)
    #t=re.sub(r'[m]*m','m',t)
    #t=re.sub(r'[a]*a','a',t)
    '''
    t=re.sub(r'([a-z])\1+',r'\1',t)
    t=re.sub(r'gogle','google',t)
    t=re.sub(r'[weak]*wk[weak]*','',t)
    t=re.sub(r'(he){2,}','',t)
    #t=re.sub(r'[bw]*aha','ha',t)
    t=re.sub(r'(ha){2,}','',t)
    '''
    t = ''.join(''.join(s)[:2]
                for _, s in itertools.groupby(t))  # remove repetition
    t = t.replace('\n', ' ').replace('\r', ' ')
    t = sent_tokenize(t)  # sentence segmentation. String to list
    for i, K in enumerate(t):
        K = K.lower()
        #K=re.sub("^\d+\s|\s\d+\s|\s\d+$", " ", K) #Delete Number
        K = re.sub(r"[0-9]*", " ", K)  #Delete Number
        if user_remove:
            K = re.sub('@[^\s]+', '', K)  #remove user
        #K = re.sub('@[^\s]+','AT_USER',K)
        if symbols_remove:
            #K = re.sub(r'#[a-zA-Z0-9]*','',K)
            K = re.sub(r'[^\w]', ' ', K)
        try:
            listKata, cleanList = lemma(K), []
        except:
            listKata, cleanList = K.split(), []
        if len(listKata) != 0:
            if not isinstance(listKata[0], str):
                for token in listKata:
                    if token.text in list(fix.keys()):
                        token = fix[token.text]
                    #try:
                    if isinstance(token, str): token = lemma(token)
                    try:
                        token = penerjemah.translate(token.text,
                                                     dest='id').text
                    except:
                        pass
                    if not isinstance(token, str): token = token.text
                    #try:token=stemmer.stem(token.text)
                    #except:token=stemmer.stem(token)
                    if not lemma:
                        try:
                            token = token.lemma_
                        except:
                            if len(token) != 0:
                                token = lemma(token)[0].lemma_
                    if stops:
                        if len(token) >= min_charLen and token not in stops:
                            if token.lower() != "pron":
                                cleanList.append(token)
                    else:
                        if len(token) >= min_charLen:
                            cleanList.append(token)
        t[i] = ' '.join(cleanList)
    return ' '.join(t)  # return the sentences joined back into a single string
Example #29
def _load_sastrawi():
    global factory, sastrawi_stemmer
    factory = StemmerFactory()
    sastrawi_stemmer = factory.create_stemmer()
Example #30
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
import re
import string
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory, ArrayDictionary, StopWordRemover
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk.tokenize import word_tokenize, sent_tokenize
import heapq
from tqdm import tqdm
#import PyPDF2

#pip install pdfplumber
import pdfplumber

# stemming (reduce words to their base form)
stemmerFactory = StemmerFactory()
stemmer = stemmerFactory.create_stemmer()


def read_pdf(PATH, semua_halaman=True, halaman=0):
    """
    membaca file pdf per halamana

    input :
    PATH : lokasi file pdf
    halaman : halaman yg ingin dibuka
    """
    if semua_halaman:
        content = '' # new line
        with pdfplumber.open(PATH) as pdf:
            for pdf_page in pdf.pages:
                single_page_text = pdf_page.extract_text()

def index(hashs, lists):
    for i in lists:
        if i in hashs:
            hashs[i] += 1
        else:
            hashs[i] = 1


# get the Indonesian stop word remover
get_stopword = StopWordRemoverFactory()
stopwords = get_stopword.create_stop_word_remover()
# get the Indonesian stemmer
get_stemmer = StemmerFactory()
stemmer = get_stemmer.create_stemmer()

# make hash
df, tf, idf, mains, titles = dict(), dict(), dict(), dict(), dict()

if os.path.exists('data/clean'):
    print(f'Directory : data/clean')
    for f in tqdm(Path('data/clean').glob("*.txt")):
        name = str(f).split('/')
        df[name[2]], mains[name[2]], titles[name[2]] = dict(), dict(), dict()

        File = open(f, 'r').read()
        File = stopwords.remove(File)

        sentence = File.split('\n')
        title = stemmer.stem(sentence[0].lower()).split()
def stemming(text):
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    return stemmer.stem(text)
Example #33
def Preprocessing(teks):
    print("Preprocessing Mulai")
    df = pd.read_csv("../Dataframe Siap/Dataframe2.csv")
    dt = [{"Pesan": teks, "Status": "Belum"}]
    smt = pd.DataFrame(dt)
    DATA = pd.concat([smt, df], ignore_index=True)

    DATA.head()

    #Case Folding

    def case_folding(text):
        text = text.lower()
        return text

    DATA['Pesan'] = DATA['Pesan'].apply(case_folding)

    # ------ Tokenizing ---------

    def remove_tweet_special(text):
        # remove tab, new line, and backslash
        text = text.replace('\\t',
                            " ").replace('\\n',
                                         " ").replace('\\u',
                                                      " ").replace('\\', "")
        # remove non-ASCII characters (emoticons, Chinese characters, etc.)
        text = text.encode('ascii', 'replace').decode('ascii')
        # remove mention, link, hashtag
        text = ' '.join(
            re.sub("([@#][A-Za-z0-9]+)|(\w+:\/\/\S+)", " ", text).split())
        # remove incomplete URL
        return text.replace("http://", " ").replace("https://", " ")

    DATA['Hasil'] = DATA['Pesan'].apply(remove_tweet_special)

    #remove number
    def remove_number(text):
        return re.sub(r"\d+", "", text)

    DATA['Hasil'] = DATA['Hasil'].apply(remove_number)

    #remove punctuation
    def remove_punctuation(text):
        return text.translate(
            str.maketrans(string.punctuation,
                          "                                "))

    # string.punctuation,"                                "
    DATA['Hasil'] = DATA['Hasil'].apply(remove_punctuation)

    #remove whitespace leading & trailing
    def remove_whitespace_LT(text):
        return text.strip()

    DATA['Hasil'] = DATA['Hasil'].apply(remove_whitespace_LT)

    #remove multiple whitespace into single whitespace
    def remove_whitespace_multiple(text):
        return re.sub('\s+', ' ', text)

    DATA['Hasil'] = DATA['Hasil'].apply(remove_whitespace_multiple)

    # remove single char
    def remove_singl_char(text):
        return re.sub(r"\b[a-zA-Z]\b", "", text)

    DATA['Hasil'] = DATA['Hasil'].apply(remove_singl_char)

    # NLTK word tokenize
    def word_tokenize_wrapper(text):
        return word_tokenize(text)

    DATA['Hasil_tokens'] = DATA['Hasil'].apply(word_tokenize_wrapper)

    def unique(document):
        unique_word = set()
        for i in document:
            unique_word = unique_word.union(i)
        return (unique_word)

    normalizad_word = pd.read_excel("../Normalisasi.xlsx")

    normalizad_word_dict = {}

    for index, row in normalizad_word.iterrows():
        if row[0] not in normalizad_word_dict:
            normalizad_word_dict[row[0]] = row[1]

    def normalized_term(document):
        return [
            normalizad_word_dict[term]
            if term in normalizad_word_dict else term for term in document
        ]

    DATA['Normal'] = DATA['Hasil_tokens'].apply(normalized_term)

    # get the Indonesian stop words
    list_stopwords = set(stopwords.words('indonesian'))
    list_stopwords.remove("naik")

    # remove stopwords from the token list
    def stopword(text):
        tokens_without_stopword = [
            word for word in text if not word in list_stopwords
        ]
        return tokens_without_stopword

    DATA['Normal'] = DATA['Normal'].apply(stopword)

    # create stemmer
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()

    # stem
    def stem(text):
        output = [stemmer.stem(token) for token in text]
        return output

    ready = pd.DataFrame()

    DATA['Normal'] = DATA['Normal'].apply(stem)
    ready['Normal'] = DATA['Normal']
    ready['Status'] = DATA['Status']
    # return ready
    ready.to_csv('readytfidf.csv', index=False)
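A hedged usage sketch for Preprocessing above (not part of the original example); it assumes the relative CSV and XLSX paths referenced inside the function exist:

Preprocessing("Harga bensin naik lagi minggu ini")
# writes the cleaned, tokenized, and stemmed data to readytfidf.csv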
Example #34
def _load_sastrawi():
    from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

    global factory, sastrawi_stemmer
    factory = StemmerFactory()
    sastrawi_stemmer = factory.create_stemmer()
Example #35
File: main.py Project: irs37/nlp
from collections import Counter

akun = ['548904824', '255409050', '480224156', '63433517', '82552414', '61379637', '79994423', '47251716',
        '260043508']  # ['@IndosatCare','@Telkomsel','@myXLCare','@triindonesia','@myXL','@IM3Ooredoo','@AXISgsm','@ask_AXIS','@simPATI']
kata_kunci = ['lambat', 'lelet', 'lola', 'lemot', 'koneksi', 'gsm', '3g', '4g', 'hsdpa', 'edge', 'jaring', 'ganggu']

cred = credentials.Certificate('kunci2.json')
firebase_admin.initialize_app(cred)

db = firestore.client()
tweet_ref = db.collection('Tweet')
kata_ref = db.collection("kata_kunci")
last_ref = db.collection("lasttweet")

factory = StemmerFactory()
stemmer = factory.create_stemmer()


def tweetstruct(user, text, t):
    data = {
        'username': user,
        'text': text,
        'time': t,
    }
    return data


def storetweet(id, input):
    try:
        ref = tweet_ref.document(id)
        ref.set(input)
def rmStem(pars):
    factory = StemmerFactory()
    stripped = strip_tags(pars)
    stemmer = factory.create_stemmer()
    clean = stemmer.stem(str(stripped))  # stemming
    return clean
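A hedged usage sketch for rmStem above; strip_tags is not defined in the snippet, so this assumes Django's django.utils.html.strip_tags (any equivalent HTML-stripping helper would work):

from django.utils.html import strip_tags  # assumed source of strip_tags
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

print(rmStem('<p>Pembangunan jalan sedang dipercepat</p>'))
# HTML tags removed, then every word stemmed to its base form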