import nltk
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

def pre_processing(doc):
    datas = {}
    # Sastrawi stemmer
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    # load the stopword corpus once, from stopwords.txt
    with open('stopwords.txt', 'r') as f:
        stopwords = f.read().split()
    # stopword removal and tokenisation
    for index, kalimat in enumerate(doc):
        data = []
        # split the sentence into tokens using NLTK
        tokenisasi = nltk.word_tokenize(kalimat)
        # stopWords = nltk.corpus.stopwords.words('english') + ['yang','dengan']
        for word in tokenisasi:
            # keep and stem the word only if it is not in stopwords.txt;
            # stopwords are simply dropped
            if word not in stopwords:
                data.append(stemmer.stem(word))
        datas[index] = " ".join(data)
        # append the cleaned comment to komentar_bersih.txt
        with open("komentar_bersih.txt", "a") as file:
            file.write("%s\n" % datas[index])
    # file = open("komentar_bersih.json", "w")
    # file.write("%s\n" %datas)
    # file.close()
    return datas
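# A usage sketch for pre_processing (an assumption, not part of the original
# source): it presumes stopwords.txt exists in the working directory and that
# NLTK's 'punkt' tokenizer data has been downloaded.
if __name__ == "__main__":
    komentar = ["pelayanan aplikasinya sangat memuaskan",
                "pengirimannya lambat dan mengecewakan"]
    print(pre_processing(komentar))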
def test_fungsional(self):
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    sentence = 'malaikat-malaikat-Nya'
    expected = 'malaikat'
    output = stemmer.stem(sentence)
    if output != expected:
        raise AssertionError('output is {} instead of {}'.format(output, expected))
def post(self):
    data = json.loads(self.request.body)
    # Sastrawi's stem() expects a unicode string, so no utf-8 encode here
    text = data['text']
    # create stemmer
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    # stemming process
    output = stemmer.stem(text)
    self.response.out.write(json.dumps({'output': output}))
class Test_StemmerFactoryTest(unittest.TestCase):
    def setUp(self):
        self.factory = StemmerFactory()
        return super(Test_StemmerFactoryTest, self).setUp()

    def test_createStemmerReturnStemmer(self):
        stemmer = self.factory.create_stemmer()
        self.assertIsNotNone(stemmer)
        # self.assertIsInstance(stemmer, Stemmer)

    def test_fungsional(self):
        factory = StemmerFactory()
        stemmer = factory.create_stemmer()
        sentence = 'malaikat-malaikat-Nya'
        expected = 'malaikat'
        output = stemmer.stem(sentence)
        if output != expected:
            raise AssertionError('output is {} instead of {}'.format(output, expected))

    def test_getWordsFromFile(self):
        factory = StemmerFactory()
        factory.get_words_from_file()
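# Standard unittest entry point (an addition for completeness): running the
# module directly executes all three tests above.
if __name__ == '__main__':
    unittest.main()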
def load_stemmer():
    factory = StemmerFactory()
    return factory.create_stemmer()
import string

import nltk
import numpy as np
from nltk.tokenize import word_tokenize as token
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

rawdata = []
for j in range(0, 8):
    x = open(str(j + 1) + '.txt', 'r').read()
    rawdata.append(x.replace('\n', ' '))

ST = StemmerFactory()
stemmer = ST.create_stemmer()
SW = StopWordRemoverFactory()
stop_word = SW.get_stop_words()

print('rawdata')
print(rawdata)

doc = []
for i in rawdata:
    temp = []
    for j in token(i):
        word = stemmer.stem(str.lower(j))
        # if word not in stop_word and len(word) > 2 and not word.startswith(tuple(string.punctuation) + tuple([str(k) for k in range(10)]) + tuple('¿')):
        temp.append(word)
    doc.append(temp)

# the loop body below is truncated in the source; collecting the unique
# vocabulary is an assumption consistent with the variable name
dictionary = []
for i in doc:
    for word in i:
        if word not in dictionary:
            dictionary.append(word)
class SpellCorrector:
    NEWLINE = '\n'
    SKIP_FILES = {'cmds'}
    CORPUS_PATH = os.path.join(os.path.dirname(__file__), 'corpus/questions/')
    __control_dict = {}

    def __init__(self, train=False, save=False, corpus_path=CORPUS_PATH, threshold=2):
        self.slang_dict = pickle.load(
            open(os.path.join(os.path.dirname(__file__), "pickled/_slang_words.p"), "rb"))
        self.slang_dict['dr'] = 'dari'
        self.slang_dict['k'] = 'ke'
        self.slang_dict['sc'] = 'sesar'
        if train:
            create_dictionary.main()
            self.words = self.__words(corpus_path)
            self.counter = self.__counter(self.words)
            self.model = model.LanguageModel(corpus_path=corpus_path)
        else:
            self.words = pickle.load(
                open(os.path.join(os.path.dirname(__file__), "pickled/_spell_words.p"), "rb"))
            self.counter = pickle.load(
                open(os.path.join(os.path.dirname(__file__), "pickled/_spell_counter.p"), "rb"))
            self.model = model.LanguageModel(load=True)
        # drop rare words below the frequency threshold
        try:
            for key in self.counter:
                if self.counter[key] <= threshold:
                    self.words.remove(key)
        except:
            pass
        self.candidates_dict = {}
        # maximum edit distance per dictionary precalculation
        max_edit_distance_dictionary = 2
        prefix_length = 7
        # create object
        self.sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
        self.factory = StemmerFactory()
        self.stemmer = self.factory.create_stemmer()
        # load dictionary
        dictionary_path = os.path.join(os.path.dirname(__file__),
                                       "corpus/dictionary/dictionary.txt")
        # dictionary_path = os.path.join(os.path.dirname(__file__), "corpus/symspellpy/frequency_dictionary_en_82_765.txt")
        term_index = 0  # column of the term in the dictionary text file
        count_index = 1  # column of the term frequency in the dictionary text file
        if not self.sym_spell.load_dictionary(
                dictionary_path, term_index, count_index, encoding="utf-8"):
            print("Dictionary file not found")
            return
        if save:
            self.save()

    def __read_files(self, path):
        for root, dir_names, file_names in os.walk(path):
            for path in dir_names:
                self.__read_files(os.path.join(root, path))
            for file_name in file_names:
                if file_name not in SpellCorrector.SKIP_FILES:
                    file_path = os.path.join(root, file_name)
                    if os.path.isfile(file_path):
                        lines = []
                        f = open(file_path, encoding='latin-1')
                        for line in f:
                            lines.append(line)
                        f.close()
                        content = SpellCorrector.NEWLINE.join(lines)
                        yield file_path, content

    def __words(self, corpus_path):
        words = []
        for file_name, text in self.__read_files(corpus_path):
            print("process data => " + file_name)
            words += re.findall(r'\w+', text.lower())
        return words

    def __counter(self, words):
        return Counter(words)

    def __wordProb(self, word):
        "Probability of `word`."
        return self.counter[word] / sum(self.counter.values())

    def correction(self, word):
        "Most probable spelling correction for word."
        return max(self.candidates(word), key=self.__wordProb)

    def candidates(self, word, debug=False):
        "Generate possible spelling corrections for word."
        if self.candidates_dict.get(word):
            return self.candidates_dict[word]
        else:
            # max edit distance per lookup
            # (max_edit_distance_lookup <= max_edit_distance_dictionary)
            max_edit_distance_lookup = 2
            suggestion_verbosity = Verbosity.CLOSEST  # TOP, CLOSEST, ALL
            suggestions = self.sym_spell.lookup(word, suggestion_verbosity,
                                                max_edit_distance_lookup)
            # cache it
            if SpellCorrector.__control_dict.get(word) is not None:
                candidates_0 = (self.__known([word])
                                | self.__known(self.__edits1(word))
                                | self.__known(self.__edits2(word))
                                | self.__known(self.__edits3(word))
                                | {SpellCorrector.__control_dict.get(word)}
                                | {word})
            else:
                candidates_0 = (self.__known([word])
                                | self.__known(self.__edits1(word))
                                | self.__known(self.__edits2(word))
                                | self.__known(self.__edits3(word))
                                | {word})
            candidates_1 = set(suggestion.term for suggestion in suggestions)
            candidates = candidates_0.union(candidates_1)
            # print(candidates)
            self.candidates_dict[word] = candidates
            return candidates

    def __known(self, words):
        "The subset of `words` that appear in the dictionary of WORDS."
        return set(w for w in words if w in self.counter)

    def __edits1(self, word):
        "All edits that are one edit away from `word`."
        letters = 'aiueon'
        splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
        inserts = [L + c + R for L, R in splits for c in letters]
        return set(inserts)

    def __edits2(self, word):
        "All edits that are two edits away from `word`."
        return (e2 for e1 in self.__edits1(word) for e2 in self.__edits1(e1))

    def __edits3(self, word):
        return (e3 for e1 in self.__edits1(word)
                for e2 in self.__edits1(e1)
                for e3 in self.__edits1(e2))

    def save(self, python2=False):
        if python2 is False:
            pickle.dump(self.words,
                        open(os.path.join(os.path.dirname(__file__),
                                          "pickled/_spell_words.p"), "wb"))
            pickle.dump(self.counter,
                        open(os.path.join(os.path.dirname(__file__),
                                          "pickled/_spell_counter.p"), "wb"))
            self.model.save()
        else:
            pickle.dump(self.words,
                        open(os.path.join(os.path.dirname(__file__),
                                          "pickled/_spell_words.p"), "wb"),
                        protocol=2)
            pickle.dump(self.counter,
                        open(os.path.join(os.path.dirname(__file__),
                                          "pickled/_spell_counter.p"), "wb"),
                        protocol=2)
            self.model.save()

    # TODO: implement mechanism to calculate lambda for interpolation
    def __trigram_interpolation(self, w1, w2, w3):
        lambda1 = 0.75
        lambda2 = 0.20
        lambda3 = 0.05
        return (lambda1 * self.model.sentence_prob('{} {} {}'.format(w1, w2, w3))) + \
               (lambda2 * self.model.sentence_prob('{} {}'.format(w2, w3))) + \
               (lambda3 * self.model.unigram_prob(w3))

    # TODO: implement mechanism to calculate lambda for interpolation
    def __bigram_interpolation(self, w1, w2):
        lambda1 = 0.80
        lambda2 = 0.20
        return (lambda1 * self.model.sentence_prob('{} {}'.format(w1, w2))) + \
               (lambda2 * self.model.unigram_prob(w2))

    def __clean_text(self, words):
        cleaned_words = []
        for word in words:
            if word in self.slang_dict:
                word = self.clean_punc(self.slang_dict[word])
            word = re.sub('^days$', 'hari', word)
            word = re.sub('^day$', 'hari', word)
            word = re.sub('^weeks$', 'minggu', word)
            word = re.sub('^week$', 'minggu', word)
            word = re.sub('^months$', 'bulan', word)
            word = re.sub('^month$', 'bulan', word)
            word = re.sub('^years$', 'tahun', word)
            word = re.sub('^year$', 'tahun', word)
            word = re.sub(r'(?<=\d)tahun', ' tahun ', word).strip()
            word = re.sub(r'(?<=\d)bulan', ' bulan ', word).strip()
            word = re.sub(r'(?<=\d)minggu', ' minggu ', word).strip()
            word = re.sub(r'(?<=\d)hari', ' hari ', word).strip()
            word = re.sub(r'(?<=\d)jam', ' jam ', word).strip()
            word = re.sub(r'(?<=\d)detik', ' detik ', word).strip()
            word = re.sub(r'(?<=\d)th(?=($|\d+))', ' tahun ', word).strip()
            word = re.sub(r'(?<=\d)thn(?=($|\d+))', ' tahun ', word).strip()
            word = re.sub(r'(?<=\d)yrs(?=($|\d+))', ' tahun ', word).strip()
            word = re.sub(r'(?<=\d)bln(?=($|\d+))', ' bulan ', word).strip()
            word = re.sub(r'(?<=\d)mggu(?=($|\d+))', ' minggu ', word).strip()
            word = re.sub(r'(?<=\d)mg(?=($|\d+))', ' minggu ', word).strip()
            word = re.sub(r'(?<=\d)d(?=($|\d+))', ' hari ', word).strip()
            word = re.sub(r'(?<=\d)w(?=($|\d+))', ' minggu ', word).strip()
            word = re.sub(r'(?<=\d)wk(?=($|\d+))', ' minggu ', word).strip()
            word = re.sub(r'(?<=\d)m(?=($|\d+))', ' bulan ', word).strip()
            word = re.sub(r'(?<=\d)jm(?=($|\d+))', ' jam ', word).strip()
            word = re.sub(r'(?<=\d)h(?=($|\d+))', ' hari ', word).strip()
            # split a time expression glued to the next word, e.g. hariini -> hari ini
            if re.match(r"(tahun|bulan|minggu|hari|menit|detik)\w+", word) is not None:
                word = re.search(r"(tahun|bulan|minggu|hari|menit|detik)(?=\w+)", word).group(0) \
                    + ' ' \
                    + re.search(r"(?:(?<=tahun)|(?<=bulan)|(?<=minggu)|(?<=hari)|(?<=menit)|(?<=detik))\w+", word).group(0)
            # split ordinal prefixes: ke2 or ke(number) -> ke 2, k2 -> k 2
            if re.match(r"(ke)\d", word) is not None:
                word = re.search(r"(ke)(?=\d)", word).group(0) + ' ' + re.search(r"(?<=ke)\d", word).group(0)
            if re.match(r"(k)\d", word) is not None:
                word = re.search(r"(k)(?=\d)", word).group(0) + ' ' + re.search(r"(?<=k)\d", word).group(0)
            # expand reduplication, e.g. kata2 -> kata kata
            if re.match(r"[a-z]+2$", word) is not None:
                word = word[:-1] + ' ' + word[:-1]
            # split a leading number, e.g. 2kata -> 2 kata
            if re.match(r"^\d+[a-z]+$", word) is not None:
                word = re.search(r"\d+(?=\w+)", word).group(0) + ' ' + re.search(r"(?<=\d)\w+", word).group(0)
            # expand reduplication with a suffix, e.g. kata2nya -> kata katanya
            if re.match(r"^\w+2\w+$", word) is not None:
                word = re.search(r"^\w+(?=2)", word).group(0) + ' ' \
                    + re.search(r"^\w+(?=2)", word).group(0) \
                    + re.search(r"(?<=2)\w+", word).group(0)
            # split a trailing 'dok' except in halodok, alodok, sendok, gondok,
            # e.g. sayadok -> saya dok (re.search, since re.match only anchors at the start)
            if re.search(r"(?<!halo)(?<!alo)(?<!sen)(?<!gon)dok$", word) is not None:
                word = word[:-3] + ' ' + word[-3:]
            # split a trailing 'dokter' except in halodokter, alodokter, e.g. sayadokter -> saya dokter
            if re.search(r"(?<!halo)(?<!alo)dokter$", word) is not None:
                word = word[:-6] + ' ' + word[-6:]
            # split a leading 'dok', e.g. doksaya -> dok saya
            if re.match(r"^dok(?!ter)\w+", word) is not None:
                word = word[:3] + ' ' + word[3:]
            # split a leading 'dokter', e.g. doktersaya -> dokter saya
            if re.match(r"^dokter\w+", word) is not None:
                word = word[:6] + ' ' + word[6:]
            # expand 20x or (number)x -> 20 kali / (number) kali
            if re.match(r"\d+x$", word) is not None:
                word = word[:-1] + ' kali'
            if re.match(r"\w+x$", word) is not None:
                word = word[:-1] + 'nya'
            cleaned_words.append(word)
        cleaned = ' '.join(cleaned_words).split()
        return cleaned

    def normalize(self, sentence):
        cleaned = sentence
        if re.match(r"[a-zA-Z0-9 ]+\d \d bulan [a-zA-Z0-9 ]+", cleaned) is not None:
            cleaned = re.search(r"[a-zA-Z0-9 ]+\d (?=\d bulan [a-zA-Z0-9 ]+)", cleaned).group(0) + \
                re.search(r"(?<=[a-zA-Z0-9 ]\d \d )bulan [a-zA-Z0-9 ]+", cleaned).group(0)
        # re.search here, since a lookbehind can never succeed at position 0
        if re.search(r"(?<=\w\s)x(?=\s)", cleaned) is not None:
            cleaned = re.search(r"[a-zA-Z ]+(?=\sx\s)", cleaned).group(0) + 'nya ' + \
                re.search(r"(?<=\sx\s)[a-zA-Z ]+", cleaned).group(0)
        cleaned = self.stemmer.stem(cleaned)
        return cleaned

    def clean_punc(self, sentence):
        translator = str.maketrans({key: ' ' for key in string.punctuation})
        words = [
            token.translate(translator).strip()
            for token in sentence.lower().split()
        ]
        words = ' '.join(words)
        words = [x.strip().lower() for x in words.split() if x.strip()]
        return ' '.join(words)

    def generate_candidates(self, sentence):
        # translate() returns a copy of the string in which every character is
        # mapped through the table built with str.maketrans(); here every
        # punctuation character becomes a space
        translator = str.maketrans({key: ' ' for key in string.punctuation})
        words = [
            token.translate(translator).strip()
            for token in sentence.lower().split()
        ]
        words = ' '.join(words)
        words = [x.strip().lower() for x in words.split() if x.strip()]  # drop all empty strings from the list
        valid = {}
        for idx, word in enumerate(words):
            if word not in self.words:
                valid[word.lower()] = 'correction_here'
        return valid

    def validate(self, sentence, debug=False, return_candidates=False, return_full_words=False):
        # same punctuation-to-space translation as generate_candidates
        translator = str.maketrans({key: ' ' for key in string.punctuation})
        words = [
            token.translate(translator).strip()
            for token in sentence.lower().split()
        ]
        words = ' '.join(words)
        words = [x.strip().lower() for x in words.split() if x.strip()]  # drop all empty strings from the list
        full_words = {}
        prediction_candidates = {}
        valid = []
        for word in words:
            if word in self.words:
                valid.append(word.lower())
                full_words[word] = word
            else:
                list_words = self.__clean_text([word])
                valid_ = []
                for idx, word_ in enumerate(list_words):
                    candidates = self.candidates(word_.lower())
                    if idx == 0:
                        # first token: rank candidates by unigram probability
                        max_word = max([w for w in candidates],
                                       key=lambda word_: self.model.unigram_prob(word_))
                    elif idx == 1:
                        # second token: rank by interpolated bigram probability
                        max_word = max([w for w in candidates],
                                       key=lambda word_: self.__bigram_interpolation(valid_[0], word_))
                    else:
                        # later tokens: rank by interpolated trigram probability
                        max_word = max([w for w in candidates],
                                       key=lambda word_: self.__trigram_interpolation(
                                           valid_[idx - 2], valid_[idx - 1], word_))
                    valid_.append(max_word)
                    if debug:
                        print('candidates for ' + word_ + ': ' + str(candidates)
                              + ', max prob word is ' + max_word.lower())
                if ' '.join(valid_) == 'terimakasih':
                    valid.append('terima kasih')
                    prediction_candidates[word] = 'terima kasih'
                else:
                    valid.append(' '.join(valid_))
                    prediction_candidates[word] = ' '.join(valid_)
                    full_words[word] = ' '.join(valid_)
        if return_candidates:
            return prediction_candidates
        if return_full_words:
            return full_words
        else:
            return ' '.join(valid)
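# A hypothetical usage sketch, not part of the original source: it assumes the
# pickled word lists, the symspellpy dictionary file, and the saved language
# model referenced in __init__ are all present on disk.
corrector = SpellCorrector()
print(corrector.validate('sy skit kepala', debug=True))  # illustrative input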
def stem(self, string):
    # create stemmer
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    output = stemmer.stem(string)
    return output
import re

import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

indo = stopwords.words('indonesian')

# preprocessing
dataset = pd.read_csv('training_gojek_yy.csv')
corpus = []
# build the stemmer once instead of re-creating it on every row
psi = StemmerFactory()
ps = psi.create_stemmer()
for i in range(0, len(dataset)):
    # re.sub: keep letters only, everything else becomes a space
    review = re.sub('[^a-zA-Z]', ' ', dataset['komentar'][i])
    review = review.lower()
    review = review.split()
    # drop Indonesian stopwords, then stem the remaining words
    review = [ps.stem(word) for word in review if not word in indo]
    print(i)
    review = ' '.join(review)
    corpus.append(review)

class Analis:
    def __init__(self, training):
        self.training = training
        # tf-idf
        articles = np.array(corpus)
        labels = np.array(dataset['sentimen'])
        self.tf_vectorizer = TfidfVectorizer(min_df=4,
def setUp(self):
    stemmerFactory = StemmerFactory()
    self.stemmer = stemmerFactory.create_stemmer()
    return super(Test_StemmerTest, self).setUp()
def stemming(document):
    # stemming step
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    return stemmer.stem(document)
def stemming(text):
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    return [stemmer.stem(x) for x in text]
def stemming(tweet):
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    tweetClean = stemmer.stem(tweet)
    return tweetClean
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import string

factory_stopwords = StopWordRemoverFactory()
stopwords = factory_stopwords.get_stop_words()
factory_stemmer = StemmerFactory()
stemmer = factory_stemmer.create_stemmer()

def clean_text(text):
    # removing punctuation
    for c in string.punctuation:
        text = text.replace(c, "")
    # removing excessive whitespace
    text = " ".join(text.split())
    # text to array of words
    words = text.split()
    # removing stopwords
    words = [word for word in words if word not in stopwords]
    # stemming each word in the query
    words = [stemmer.stem(word) for word in words]
    return words
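# Example call for clean_text (illustrative, not from the original source):
# Sastrawi's default stopword list drops function words such as 'yang', and
# the stemmer reduces inflected forms to their roots.
print(clean_text("Perekonomian yang bertumbuh!"))  # e.g. ['ekonomi', 'tumbuh']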
def stem(data):
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    return data.apply(lambda x: [stemmer.stem(item) for item in x])
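# Illustrative only: stem() above expects a pandas Series whose rows are token
# lists. Note that the factory is rebuilt on every call; for large frames it
# is cheaper to hoist the stemmer out of the function.
import pandas as pd
tokens = pd.Series([["pelayanan", "memuaskan"], ["pengiriman", "terlambat"]])
print(stem(tokens))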
def chatbot():
    # create stemmer
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()

    model = load_model('chatbot_model.h5')
    intents = json.loads(open('intents.json').read())
    words = pickle.load(open('words.pkl', 'rb'))
    classes = pickle.load(open('classes.pkl', 'rb'))

    def clean_up_sentence(sentence):
        # tokenize the pattern - split words into array
        sentence_words = nltk.word_tokenize(sentence)
        # stem each word - create short form for word
        sentence_words = [stemmer.stem(word.lower()) for word in sentence_words]
        return sentence_words

    # return bag of words array: 0 or 1 for each word in the bag that exists in the sentence
    def bow(sentence, words, show_details=True):
        # tokenize the pattern
        sentence_words = clean_up_sentence(sentence)
        # bag of words - matrix of N words, vocabulary matrix
        bag = [0] * len(words)
        for s in sentence_words:
            for i, w in enumerate(words):
                if w == s:
                    # assign 1 if current word is in the vocabulary position
                    bag[i] = 1
                    if show_details:
                        print("found in bag: %s" % w)
        return np.array(bag)

    def predict_class(sentence, model):
        # filter out predictions below a threshold
        p = bow(sentence, words, show_details=False)
        res = model.predict(np.array([p]))[0]
        ERROR_THRESHOLD = 0.25
        results = [[i, r] for i, r in enumerate(res) if r > ERROR_THRESHOLD]
        # sort by strength of probability
        results.sort(key=lambda x: x[1], reverse=True)
        return_list = []
        for r in results:
            return_list.append({"intent": classes[r[0]], "probability": str(r[1])})
        return return_list

    def getResponse(ints, intents_json):
        tag = ints[0]['intent']
        list_of_intents = intents_json['intents']
        for i in list_of_intents:
            if i['tag'] == tag:
                result = random.choice(i['responses'])
                break
        return result

    def chatbot_response(msg):
        ints = predict_class(msg, model)
        res = getResponse(ints, intents)
        return res

    return chatbot_response(request.json['message'])
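# Hypothetical Flask wiring for the chatbot view above; the original snippet
# only shows the handler body, so the route and response shape are assumptions.
from flask import Flask, request, jsonify

app = Flask(__name__)

@app.route('/chat', methods=['POST'])
def chat():
    # expects a JSON body such as {"message": "halo"}
    return jsonify({'response': chatbot()})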
# get the Indonesian stopword list
list_stopwords = set(stopwords.words('indonesian'))

# remove stopwords from each token list
print("Mulai remove stopwords")
def stopword(text):
    tokens_without_stopword = [word for word in text if not word in list_stopwords]
    return tokens_without_stopword

DATA['Normal'] = DATA['Normal'].apply(stopword)

# create stemmer
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# stem every token
print("Mulai stem")
def stem(text):
    output = [stemmer.stem(token) for token in text]
    return output

ready = pd.DataFrame()
DATA['Normal'] = DATA['Normal'].apply(stem)
print("Selesai Stem")
ready['Normal'] = DATA['Normal']
ready['Status'] = DATA['Status']
ready.to_csv('readytfidf.csv', index=False)
def stemmer_fac(string):
    fac = StemmerFactory()
    stem_cr = fac.create_stemmer()
    return stem_cr.stem(string)
import pandas as pd
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

factoryStem = StemmerFactory()
stemmer = factoryStem.create_stemmer()
factoryStop = StopWordRemoverFactory()
stopper = factoryStop.create_stop_word_remover()

xData = []
yData = []
rawDatasets = pd.read_csv('dataset/dataset_pool.csv', delimiter=',')

count = 0
for k in rawDatasets['Kalimat']:
    if count % 100 == 0:
        print(count)
    sentStemmed = stemmer.stem(k)
    sentStopped = sentStemmed
    '''
    temp = stopper.remove(k)
    while temp != sentStopped:
        sentStopped = temp
        temp = stopper.remove(sentStopped)
    '''
def __init__(self):
    from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
    factory = StemmerFactory()
    self.sastrawi_stemmer = factory.create_stemmer()
def stemmer(text):
    # input: a text string
    factory = StemmerFactory()          # instantiate the Sastrawi factory
    stemmer = factory.create_stemmer()  # create stemmer
    text = text.lower()                 # lowercase the whole text
    stem_text = stemmer.stem(text)      # stemming
    return stem_text                    # output: each word reduced to its root form
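# Example based on Sastrawi's usual demo sentence; the expected output shows
# each word reduced to its root form.
print(stemmer("Perekonomian Indonesia sedang dalam pertumbuhan yang membanggakan"))
# -> 'ekonomi indonesia sedang dalam tumbuh yang bangga'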
import pandas as pd
import re
import numpy as np
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

stem_factory = StemmerFactory()
stemmer = stem_factory.create_stemmer()
stop_factory = StopWordRemoverFactory()
stopword = stop_factory.create_stop_word_remover()

data = pd.read_csv('hasil11.csv')
data = data[['Label', 'Isi']]

def convert(polarity):
    # map the sentiment label to an integer class
    if polarity == 'Positif':
        return 1
    elif polarity == 'Netral':
        return 0
    else:
        return -1
from pattern.it import lemma as lemma_it
from nltk.stem.isri import ISRIStemmer
from nltk.stem import RSLPStemmer
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk import word_tokenize
from nltk.stem.snowball import SnowballStemmer
import tinysegmenter
from analyzer.kg_export.language.kazlemmatizer import kazakh_lemma_tokenizer

use_compound_split_german = False
if use_compound_split_german:
    import LanguageDetection

stem_ar = ISRIStemmer()  # Arabic stemmer
factory = StemmerFactory()
sastrawi_stemmer = factory.create_stemmer()  # Indonesian stemmer
stem_pt = RSLPStemmer()  # Portuguese (Brazilian) stemmer
stem_ja = tinysegmenter.TinySegmenter()
stem_nl = SnowballStemmer('dutch')
stem_ru = SnowballStemmer('russian')
stem_sv = SnowballStemmer('swedish')
stem_fr = SnowballStemmer('french')
stem_de = SnowballStemmer('german')

def read_file(filename):
    try:
        with open(filename, "r") as file_dp:
            data = json.load(file_dp)
        return data
    except Exception:
        # body truncated in the source; returning None on failure is an assumption
        return None
def setup_library(self):
    stemmerFactory = StemmerFactory()
    self.stemmer = stemmerFactory.create_stemmer()
def stemmerFactory(text):
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    text = stemmer.stem(text)
    return text
class Sistem:
    factory = None
    stemmer = None

    def __init__(self):
        # initialise the Sastrawi stemmer
        self.factory = StemmerFactory()
        self.stemmer = self.factory.create_stemmer()

    # preprocessing step
    def clean_text(self, text):
        # tokenize
        words = word_tokenize(text.lower())
        temp = str(words)
        # remove numbers
        stripped = re.sub(r'\d+', '', temp)
        # remove '.com', 'www' and escaped newlines
        stripped = re.sub(r'\.com', '', stripped)
        stripped = re.sub(r'www', '', stripped)
        stripped = re.sub(r'\\n', '', stripped)
        # remove tags
        stripped = re.sub("</?.*?>", " <> ", stripped)
        # stemming
        temp = [self.stemmer.stem(stripped)]
        # stopword removal
        stop_words = set(stopwords.words('indonesian'))
        temp = [j for i in temp for j in i.split() if j not in stop_words]
        temp = ' '.join(temp)
        return temp

    # process the title
    def proses_judul(self, judul):
        judul = self.clean_text(judul)
        hasil = str(judul)
        remove = hasil.replace(':', '')
        kalimat = remove.replace('dtype', '')
        kalimat = kalimat.replace('Name', '')
        kalimat = kalimat.replace('Judul', '')
        kalimat = kalimat.replace('object', '')
        remove = kalimat.replace(',', '')
        remove = re.sub(r'\d+', '', remove)
        words = word_tokenize(remove)
        stop_words = set(stopwords.words('indonesian'))
        tampung_judul = []
        for x in words:
            if x not in stop_words:
                tampung_judul.append(x)
        return tampung_judul

    # process the body
    def proses_isi(self, isi):
        isi = self.clean_text(isi)
        hasil = str(isi)
        remove = hasil.replace(':', '')
        remove = re.sub(r'\d+', '', remove)
        words = word_tokenize(remove)
        stop_words = set(stopwords.words('indonesian'))
        tampung_isi = []
        for y in words:
            if y not in stop_words:
                tampung_isi.append(y)
        return tampung_isi

    # word-sense lookup: replace body words with the title word when they are synonyms
    def mencari_makna(self, judul, isi):
        judul = self.proses_judul(judul)
        isi_berita = self.proses_isi(isi)
        synonyms = []
        result = []
        list_sinonim = []
        for i in range(0, len(judul)):
            kata = judul[i]
            for syn in wn.synsets(kata, lang="ind"):
                for l in syn.lemmas(lang="ind"):
                    hasil1 = str(l.name())
                    stem = [self.stemmer.stem(hasil1)]
                    stop_words = set(stopwords.words('indonesian'))
                    # split each entry on spaces and keep only non-stopwords
                    temp = [j for i in stem for j in i.split() if j not in stop_words]
                    temp = ' '.join(temp)
                    pisah_kata = word_tokenize(temp)
                    for z in range(len(pisah_kata)):
                        synonyms.append(pisah_kata[z])
            for word in synonyms:
                if word not in result:
                    result.append(word)
            list_sinonim.append([])
            list_sinonim[i].append(judul[i])
            for j in range(len(result)):
                list_sinonim[i].append(result[j])
            synonyms = []
            result = []
        for a in range(len(isi_berita)):
            for b in range(len(list_sinonim)):
                for j in range(len(list_sinonim[b])):
                    if list_sinonim[b][j] == isi_berita[a]:
                        isi_berita[a] = list_sinonim[b][0]
        isi_bersih = ''
        for i in range(len(isi_berita)):
            if i == 0:
                isi_bersih = isi_bersih + str(isi_berita[i])
            else:
                isi_bersih = isi_bersih + ' ' + str(isi_berita[i])
        return isi_bersih

    # cosine similarity step
    def cosine_sim(self, text1, text2):
        vectorizer = TfidfVectorizer(analyzer='word')
        train_vectors = vectorizer.fit_transform([text1, text2])
        test_vectors = vectorizer.transform([text1, text2])
        return ((train_vectors * train_vectors.T).A)[0, 1]

    def checkup_single(self, params):
        judul = self.clean_text(params.get('judul'))
        # mencari_makna takes both the title and the body
        isi = self.mencari_makna(params.get('judul'), params.get('isi'))
        # format response
        fmt_response = {}
        # empty output
        hasil = []
        # hasil_cosine = self.cosine_sim(params.get('judul'), params.get('isi'))
        hasil_cosine = self.cosine_sim(judul, isi)
        hasil.append(hasil_cosine)
        for data in hasil:
            if data > 0.4:
                fmt_response['status'] = 'Non-clickbait'
            else:
                fmt_response['status'] = 'Clickbait'
            fmt_response['percentage'] = math.trunc(data * 100)
        return fmt_response
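# Hypothetical call (not in the source): checkup_single expects a dict with
# 'judul' (title) and 'isi' (body) keys and returns the clickbait verdict.
sistem = Sistem()
print(sistem.checkup_single({'judul': 'judul berita', 'isi': 'isi berita'}))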
def stem_words(words):
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    all_words = [stemmer.stem(word) for word in words]
    return all_words
def cleanText(T, fix={}, lemma=None, stops=set(), symbols_remove=False,
              min_charLen=2, fixTag=True, user_remove=True):
    # lang & stops have only 2 options: 'en' or 'id'
    # symbols: ASCII or alnum
    penerjemah = Translator()
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    pattern = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    t = re.sub(pattern, ' ', str(T))  # remove urls if any
    t = unescape(t)  # html entities fix
    if fixTag:
        t = fixTags(t)  # fix abcDef
    t = t.lower().strip()  # lowercase
    t = unidecode(t)
    # t = re.sub(r'[m]*m', 'm', t)
    # t = re.sub(r'[a]*a', 'a', t)
    '''
    t=re.sub(r'([a-z])\1+',r'\1',t)
    t=re.sub(r'gogle','google',t)
    t=re.sub(r'[weak]*wk[weak]*','',t)
    t=re.sub(r'(he){2,}','',t)
    #t=re.sub(r'[bw]*aha','ha',t)
    t=re.sub(r'(ha){2,}','',t)
    '''
    t = ''.join(''.join(s)[:2] for _, s in itertools.groupby(t))  # remove repetition
    t = t.replace('\n', ' ').replace('\r', ' ')
    t = sent_tokenize(t)  # sentence segmentation, string to list
    for i, K in enumerate(t):
        K = K.lower()
        # K = re.sub("^\d+\s|\s\d+\s|\s\d+$", " ", K)  # delete numbers
        K = re.sub(r"[0-9]*", " ", K)  # delete numbers
        if user_remove:
            K = re.sub(r'@[^\s]+', '', K)  # remove user mentions
            # K = re.sub('@[^\s]+','AT_USER',K)
        if symbols_remove:
            # K = re.sub(r'#[a-zA-Z0-9]*','',K)
            K = re.sub(r'[^\w]', ' ', K)
        try:
            listKata, cleanList = lemma(K), []
        except:
            listKata, cleanList = K.split(), []
        if len(listKata) != 0:
            if not isinstance(listKata[0], str):
                for token in listKata:
                    if token.text in list(fix.keys()):
                        token = fix[token.text]
                    if isinstance(token, str):
                        token = lemma(token)
                    try:
                        token = penerjemah.translate(token.text, dest='id').text
                    except:
                        pass
                    if not isinstance(token, str):
                        token = token.text
                    # try: token = stemmer.stem(token.text)
                    # except: token = stemmer.stem(token)
                    if not lemma:
                        try:
                            token = token.lemma_
                        except:
                            if len(token) != 0:
                                token = lemma(token)[0].lemma_
                    if stops:
                        if len(token) >= min_charLen and token not in stops:
                            if token.lower() != "pron":
                                cleanList.append(token)
                    else:
                        if len(token) >= min_charLen:
                            cleanList.append(token)
        t[i] = ' '.join(cleanList)
    return ' '.join(t)  # return a single string again
def _load_sastrawi():
    global factory, sastrawi_stemmer
    factory = StemmerFactory()
    sastrawi_stemmer = factory.create_stemmer()
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
import re
import string
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory, ArrayDictionary, StopWordRemover
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk.tokenize import word_tokenize, sent_tokenize
import heapq
from tqdm import tqdm
# import PyPDF2
# pip install pdfplumber
import pdfplumber

# stemming (reduce words to their root form)
stemmerFactory = StemmerFactory()
stemmer = stemmerFactory.create_stemmer()

def read_pdf(PATH, semua_halaman=True, halaman=0):
    """
    Read a PDF file page by page.
    input:
        PATH    : location of the PDF file
        halaman : the page to open when not reading all pages
    """
    if semua_halaman:
        content = ''
        with pdfplumber.open(PATH) as pdf:
            for pdf_page in pdf.pages:
                single_page_text = pdf_page.extract_text()
                # the source is truncated here; appending each page's text is
                # an assumption consistent with the accumulator above
                if single_page_text:
                    content += single_page_text + '\n'
def index(hashs, lists):
    for i in lists:
        if i in hashs:
            hashs[i] += 1
        else:
            hashs[i] = 1

# get the Indonesian stopword remover
get_stopword = StopWordRemoverFactory()
stopwords = get_stopword.create_stop_word_remover()
# get the Indonesian stemmer
get_stemmer = StemmerFactory()
stemmer = get_stemmer.create_stemmer()

# hashes for df, tf, idf, document bodies and titles
df, tf, idf, mains, titles = dict(), dict(), dict(), dict(), dict()
if os.path.exists('data/clean'):
    print('Directory : data/clean')
    for f in tqdm(Path('data/clean').glob("*.txt")):
        name = str(f).split('/')
        df[name[2]], mains[name[2]], titles[name[2]] = dict(), dict(), dict()
        File = open(f, 'r').read()
        File = stopwords.remove(File)
        sentence = File.split('\n')
        title = stemmer.stem(sentence[0].lower()).split()
def stemming(text):
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    return stemmer.stem(text)
def Preprocessing(teks):
    print("Preprocessing Mulai")
    df = pd.read_csv("../Dataframe Siap/Dataframe2.csv")
    dt = [{"Pesan": teks, "Status": "Belum"}]
    smt = pd.DataFrame(dt)
    DATA = pd.concat([smt, df], ignore_index=True)
    DATA.head()

    # case folding
    def case_folding(text):
        text = text.lower()
        return text

    DATA['Pesan'] = DATA['Pesan'].apply(case_folding)

    # ------ Tokenizing ---------
    def remove_tweet_special(text):
        # remove tabs, new lines and backslashes
        text = text.replace('\\t', " ").replace('\\n', " ").replace('\\u', " ").replace('\\', "")
        # remove non-ASCII characters (emoticons, Chinese characters, etc.)
        text = text.encode('ascii', 'replace').decode('ascii')
        # remove mentions, links, hashtags
        text = ' '.join(re.sub("([@#][A-Za-z0-9]+)|(\w+:\/\/\S+)", " ", text).split())
        # remove incomplete URLs
        return text.replace("http://", " ").replace("https://", " ")

    DATA['Hasil'] = DATA['Pesan'].apply(remove_tweet_special)

    # remove numbers
    def remove_number(text):
        return re.sub(r"\d+", "", text)

    DATA['Hasil'] = DATA['Hasil'].apply(remove_number)

    # remove punctuation; maketrans with two string arguments needs them to be
    # the same length, so every punctuation character maps to a space
    def remove_punctuation(text):
        return text.translate(
            str.maketrans(string.punctuation, " " * len(string.punctuation)))

    DATA['Hasil'] = DATA['Hasil'].apply(remove_punctuation)

    # remove leading & trailing whitespace
    def remove_whitespace_LT(text):
        return text.strip()

    DATA['Hasil'] = DATA['Hasil'].apply(remove_whitespace_LT)

    # collapse multiple whitespace into a single space
    def remove_whitespace_multiple(text):
        return re.sub('\s+', ' ', text)

    DATA['Hasil'] = DATA['Hasil'].apply(remove_whitespace_multiple)

    # remove single characters
    def remove_singl_char(text):
        return re.sub(r"\b[a-zA-Z]\b", "", text)

    DATA['Hasil'] = DATA['Hasil'].apply(remove_singl_char)

    # NLTK word tokenize
    def word_tokenize_wrapper(text):
        return word_tokenize(text)

    DATA['Hasil_tokens'] = DATA['Hasil'].apply(word_tokenize_wrapper)

    def unique(document):
        unique_word = set()
        for i in document:
            unique_word = unique_word.union(i)
        return unique_word

    normalizad_word = pd.read_excel("../Normalisasi.xlsx")
    normalizad_word_dict = {}
    for index, row in normalizad_word.iterrows():
        if row[0] not in normalizad_word_dict:
            normalizad_word_dict[row[0]] = row[1]

    def normalized_term(document):
        return [
            normalizad_word_dict[term] if term in normalizad_word_dict else term
            for term in document
        ]

    DATA['Normal'] = DATA['Hasil_tokens'].apply(normalized_term)

    # get the Indonesian stopword list
    list_stopwords = set(stopwords.words('indonesian'))
    list_stopwords.remove("naik")

    # remove stopwords from each token list
    def stopword(text):
        tokens_without_stopword = [word for word in text if not word in list_stopwords]
        return tokens_without_stopword

    DATA['Normal'] = DATA['Normal'].apply(stopword)

    # create stemmer
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()

    # stem every token
    def stem(text):
        output = [stemmer.stem(token) for token in text]
        return output

    ready = pd.DataFrame()
    DATA['Normal'] = DATA['Normal'].apply(stem)
    ready['Normal'] = DATA['Normal']
    ready['Status'] = DATA['Status']
    # return ready
    ready.to_csv('readytfidf.csv', index=False)
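# Illustrative call (an assumption): Preprocessing() prepends the new message
# to the saved dataframe and writes the cleaned result to readytfidf.csv.
Preprocessing("harga bbm naik lagi bulan ini")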
def _load_sastrawi():
    from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
    global factory, sastrawi_stemmer
    factory = StemmerFactory()
    sastrawi_stemmer = factory.create_stemmer()
from collections import Counter

akun = ['548904824', '255409050', '480224156', '63433517', '82552414',
        '61379637', '79994423', '47251716', '260043508']
# ['@IndosatCare','@Telkomsel','@myXLCare','@triindonesia','@myXL','@IM3Ooredoo','@AXISgsm','@ask_AXIS','@simPATI']
kata_kunci = ['lambat', 'lelet', 'lola', 'lemot', 'koneksi', 'gsm', '3g',
              '4g', 'hsdpa', 'edge', 'jaring', 'ganggu']

cred = credentials.Certificate('kunci2.json')
firebase_admin.initialize_app(cred)
db = firestore.client()
tweet_ref = db.collection('Tweet')
kata_ref = db.collection("kata_kunci")
last_ref = db.collection("lasttweet")

factory = StemmerFactory()
stemmer = factory.create_stemmer()

def tweetstruct(user, text, t):
    data = {
        'username': user,
        'text': text,
        'time': t,
    }
    return data

def storetweet(id, input):
    try:
        ref = tweet_ref.document(id)
        ref.set(input)
    except Exception:
        # the source is truncated here; ignoring write failures is an assumption
        pass
def rmStem(pars):
    factory = StemmerFactory()
    stripped = strip_tags(pars)
    stemmer = factory.create_stemmer()
    clean = stemmer.stem(str(stripped))  # stemming
    return clean