import os
import re
from collections import Counter
from string import punctuation as punct

import numpy as np
from nltk.tokenize import word_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

# Stop-word list obtained from the Sastrawi stop-word remover factory.
factory = StopWordRemoverFactory()
stop_words = factory.get_stop_words()

# Build the stemmer once. `factory` is deliberately left bound to the
# StemmerFactory, which is what the original sequence of rebindings ended on.
factory = StemmerFactory()
stemmer = factory.create_stemmer()


def punct_except(p):
    """Return ``string.punctuation`` without the characters found in *p*.

    The surviving characters keep the order they have in
    ``string.punctuation``, so the result is identical on every run
    (joining a ``set`` gives an arbitrary, hash-dependent order).

    Args:
        p: Iterable of characters (typically a string) to leave out.

    Returns:
        A string of the remaining punctuation characters.
    """
    excluded = set(p)
    return ''.join(ch for ch in punct if ch not in excluded)


def remove_punc(match):
    """Strip the outer characters of a match and space out '-' and '/'.

    Presumably used as a replacement callback for ``re.sub`` -- TODO confirm
    against the call site, which is not visible here.

    Args:
        match: A regex match object; only ``match.group(0)`` is read.

    Returns:
        The matched text without its first and last character, with every
        '-' and '/' replaced by a space, padded with one space on each side.
    """
    word = match.group(0)
    # Drop the first and last matched characters, then split on separators.
    inner = word[1:-1].replace('-', ' ').replace('/', ' ')
    return ' ' + inner + ' '


def remove_dash(match):
    """Return the matched text without its first character.

    Args:
        match: A regex match object; only ``match.group(0)`` is read.

    Returns:
        ``match.group(0)`` with the leading character removed.
    """
    return match.group(0)[1:]
# tokenization
from nltk.tokenize import sent_tokenize, word_tokenize

# stemming
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

factory = StemmerFactory()
stemmer = factory.create_stemmer()

# stopword removal
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

factory = StopWordRemoverFactory()
stopword = factory.create_stop_word_remover()
getStopWord = factory.get_stop_words()

# import custom lexicon
from lexiconCustom import lexCustom


# -------------import excel dataset-------------
def importExcelDataSet(selectedSheet):
    """Collect column-7 and column-11 cell values from rows 2 to 104.

    Reading stops at the first row whose column-7 cell value is ``None``.

    Args:
        selectedSheet: Object exposing ``cell(row=..., column=...)`` whose
            result has a ``.value`` attribute (looks like an openpyxl
            worksheet -- TODO confirm).
    """
    hasil = []
    labelManual = []
    for i in range(2, 105):
        value = selectedSheet.cell(row=i, column=7).value
        # An empty column-7 cell marks the end of the data.
        if value is None:
            break
        hasil.append(value)
        labelManual.append(selectedSheet.cell(row=i, column=11).value)
    # NOTE(review): no return statement is visible in this fragment, so the
    # two lists are not handed back here -- confirm against the full file.
document = re.sub(r'\W', ' ', text) document1 = re.sub(r'\s+[a-zA-Z]\s+', '', document) document2 = re.sub(r'\^[a-zA-Z]\s+', '', document1) document3 = re.sub(r"\d+", "", document2) document4 = re.sub(r'\s+', ' ', document3, flags=re.I) document5 = re.sub(r'^b\s+', '', document4) document5 = remove_all_extra_spaces(document5) document6 = document5.lower() document7 = document6.split() document8 = stemmer.stem(document6) document9 = ''.join(document8) documents.append(document9) Wtd = documents factory = StopWordRemoverFactory() stopword1 = factory.get_stop_words() more_stopwords = [ 'mohon', 'yang', 'berdasarkan', 'ada', 'kepala', 'timur', 'kantor', 'pt', ' pt', 'pihak', 'bidang', 'ata', 'ii', 'iii', 'tertanggal', 'melalui', 'jo', 'menjadi', 'terletak', 'tidak', 'ganti', 'di', 'pn', 'sdr', 'res', 'tgl', 'mengenai', 'tahun', 'su', 'ri', 'ix', 'atas', 'melalui', 'tanggapan', 'tentang', 'diduga', 'kec', 'adanya', 'ada', 'tengah', 'pernyataan', 'tembusan', 'sesuai', 'ii', 'iii', 'iiiin', 'iv', 'ix', 'vi', 'vii', 'viii', 'xii', 'xiii', 'selua', 'sh', 'bapak', '', 'hgu', 'ma', 'su', 'ham', 'perihal', 'milik', 'satu', 'tidak', ' narada', ' di', 'narada ', 'nomor', 'atas', 'pk', 'okt', 'agustus', 'juli', 'april', 'terhadap', 'kedua', 'jaya', 'untuk', 'bin', 'upaya', 'melalui', 'tentang', 'februari', 'dilakukan', 'pusat', 'selatan', 'atas', 'data', 'lp', 'dalam', 'juni', 'adanya', 'mengenai', 'jkt', 'atau', 'jawaban', 'tinggi', 'telah', 'maret', 'bapak', 'oktober', 'januari', 'juli', 'mei', 'september', 'xi', 'agung', 'ada', 'dengan',