class Test_StopWordRemoverFactoryTest(unittest.TestCase):
    """Tests for StopWordRemoverFactory and the removers it creates."""

    def setUp(self):
        self.factory = StopWordRemoverFactory()
        return super(Test_StopWordRemoverFactoryTest, self).setUp()

    def test_createStopWordRemover(self):
        self.assertIsInstance(self.factory.create_stop_word_remover(),
                              StopWordRemover)

    def test_stopwordRemoval(self):
        sremover = self.factory.create_stop_word_remover()
        self.assertEqual('pergi sekolah', sremover.remove('pergi ke sekolah yang'))
        self.assertEqual('makan rumah', sremover.remove('makan di rumah yang'))

    def test_tokens_stopwordRemoval(self):
        tokens = ['pergi', 'ke', 'sekolah', 'yang', 'bagus', 'adalah', 'impian']
        sremover = self.factory.create_stop_word_remover()
        clean_tokens = sremover.remove_tokens(tokens)
        text = ' '.join(clean_tokens)
        # FIX: assertEquals is a deprecated alias (removed in Python 3.12);
        # use assertEqual like the rest of this class.
        self.assertEqual('pergi sekolah bagus impian', text)
        self.assertEqual('pergi', clean_tokens[0])
        self.assertEqual('sekolah', clean_tokens[1])
        self.assertEqual('bagus', clean_tokens[2])
        self.assertEqual('impian', clean_tokens[3])

    def test_execution_time(self):
        # Smoke-check that removal of a few sentences completes in under 1s.
        start = time.time()
        sentence = 'Rakyat memenuHi halaMan geDung DPR unTuk menyuarakan isi hatinya. Saat Itu, situasi sangat genting sekali. Terjadi kerusuhan yang mengiringi pergerakan mahasiswa yang memperjuangkan reformasi.'
        sremover = self.factory.create_stop_word_remover()
        sremover.remove(sentence)
        end = time.time()
        execution_time = end - start
        self.assertTrue(execution_time < 1)
def preproses(request):
    """Stem and stopword-clean rows 498-500 of CrawlDetikNews, persist the
    results on each row, and render the last processed article.

    NOTE(review): the 497 < kounter <= 500 window looks like a manual
    batching hack — confirm the intended range before changing it.
    """
    baca_db = CrawlDetikNews.objects.all()
    # FIX: initialise the values passed to the template so the view no
    # longer raises NameError when no row falls inside the window.
    sentence = ''
    output = ''
    # Hoist factory construction out of the loop (stateless; pure perf win).
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    fa = StopWordRemoverFactory()
    stopword = fa.create_stop_word_remover()
    for kounter, baca in enumerate(baca_db, start=1):
        if 497 < kounter <= 500:
            # Stemming: reduce headline + body to base words.
            sentence = baca.headline + " " + baca.content
            output = stemmer.stem(sentence)
            baca.stemming = output
            # Stopword removal on the stemmed text.
            stop = stopword.remove(output)
            stop = stop.replace(' - ', ' ')
            output = stop
            baca.stopword = output
            baca.save()
    return render(request, 'beranda/preprocessing.html', {
        "rootword": output,
        "ori": sentence
    })
def stopwords_removal(self, text, stopwords, output_stopwords):
    """Read the corpus from dataOutputPath, remove Indonesian stopwords,
    write the result to pathStopwords, and return it.

    NOTE(review): the `text` parameter is immediately overwritten by the
    file contents, and `output_stopwords` is never used — confirm intended.
    """
    # FIX: dropped the redundant f.close() calls — `with` already closes.
    with open(dataOutputPath, encoding='utf-8') as f:
        text = f.read()
    with open(stopwords, encoding='utf-8') as f:
        list_stopwords = f.read()
    stop_factory = StopWordRemoverFactory()
    more_stopwords = list_stopwords.split("\n")  # add new stopwords
    # BUG(flagged): `data` merges the extra stopwords but is never passed to
    # the remover, so the custom stopword file has no effect on removal.
    # Kept as-is to preserve behaviour; wire it into a StopWordRemover with
    # an ArrayDictionary if the extra words are meant to apply.
    data = stop_factory.get_stop_words() + more_stopwords
    stopword = stop_factory.create_stop_word_remover()
    remove_stopwords = stopword.remove(text)
    with open(pathStopwords, 'w', encoding='utf-8') as f:
        f.write(remove_stopwords)
    print(
        "Stopwords Removal success!\nCount Words Frequency on process...")
    return remove_stopwords
def predict():
    '''
    For rendering results on HTML GUI
    '''
    # Load the trained classifier, count-vectorizer and label encoder.
    model = pickle.load(open('model_news.pkl', 'rb'))
    cv_ = pickle.load(open('news_cv.pkl', 'rb'))
    enc_ = pickle.load(open('enc_news.pkl', 'rb'))

    y_for_test = pd.Series(request.form['news_'])

    # Indonesian stopword remover and stemmer.
    stopword = StopWordRemoverFactory().create_stop_word_remover()
    stemmer = StemmerFactory().create_stemmer()

    xy = []
    for raw_text in y_for_test.values:
        stripped = stopword.remove(stemmer.stem(raw_text))
        # Keep alphabetic tokens only; each kept token carries a trailing
        # space, matching the original concatenation behaviour.
        xy.append(''.join(tok + ' ' for tok in stripped.split(' ') if tok.isalpha()))

    resu = model.predict(cv_.transform(xy))
    print('prediction:')
    res = ", ".join(str(label) for label in list(enc_.inverse_transform(resu)))
    return render_template(
        'index.html',
        prediction_text='Topiknya adalah {}. Ya kan?'.format(res))
def normalisasi(self, dokumen):
    """Tokenise and sort each article, drop stopwords, and keep only words
    that contain no digits, no hyphen, and are 3-13 characters long.

    Returns a list of token lists, one per input article.
    """
    store = []
    for artikel in dokumen:
        split = artikel.split()
        split.sort()
        store.append(split)
    factory = StopWordRemoverFactory()
    stopword = factory.create_stop_word_remover()
    store_n = []
    for x, sentence in enumerate(store):
        store_n.append([])
        for word in sentence:
            plus = stopword.remove(word)  # '' when word is a stopword
            has = self.hasNumbers(plus)
            if plus != '' and not has and '-' not in plus and 3 <= len(plus) <= 13:
                store_n[x].append(plus)
    # FIX: the original deleted elements from store_n[i] while iterating it,
    # which silently skips the element after every deletion. Filter with a
    # comprehension instead. (The `plus != ''` guard above already keeps
    # empty strings out, so results are unchanged.)
    return [[kata for kata in kalimat if kata != ''] for kalimat in store_n]
def preprocessing_text(text):
    """Normalise Indonesian text: strip non-ASCII, lowercase, drop digits
    and URLs, remove stopwords, stem, then strip punctuation."""
    encoded_string = text.encode("ascii", "ignore")  # remove non-ASCII
    text = encoded_string.decode()
    text = text.lower()  # lowercase
    text = ''.join([i for i in text if not i.isdigit()])  # remove numbers
    text = re.sub(r'http\S+', '', text)  # remove URLs
    factory = StopWordRemoverFactory()
    stopword = factory.create_stop_word_remover()
    text = stopword.remove(text)
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    text = stemmer.stem(text)
    # FIX: use a raw string — "[^\w\s]" relied on invalid string escapes
    # (\w, \s) that emit a SyntaxWarning on modern Python. Pattern value is
    # unchanged.
    text = re.sub(r"[^\w\s]", '', text)  # remove punctuation
    text = re.sub(r'[/(){}\[\]\|@,;#_]', '', text)  # remove punctuation
    return text
class Test_StopWordRemoverFactoryTest(unittest.TestCase):
    """Verifies that the factory produces StopWordRemover instances."""

    def setUp(self):
        self.factory = StopWordRemoverFactory()
        return super(Test_StopWordRemoverFactoryTest, self).setUp()

    def test_createStopWordRemover(self):
        remover = self.factory.create_stop_word_remover()
        self.assertIsInstance(remover, StopWordRemover)
def search():
    """Search endpoint: preprocess the combined query (stopword removal +
    stemming) and answer with TF-IDF (algo='1') or LSA results as JSON."""
    if request.method == 'GET':
        # FIX: removed the duplicated `data = data = ...` assignment.
        data = json.load(open('data/testing_data.json', encoding="utf-8"))
        algo = request.args.get('algo', '1')
        query1 = request.args.get('q1', '')
        query2 = request.args.get('q2', '')
        query3 = request.args.get('q3', '')
        max_response = request.args.get("max_resp", 10)
        start = time.time()
        print("timer start")
        queries = query1 + " " + query2 + " " + query3
        # remove stopwords
        factory = StopWordRemoverFactory()
        stopword = factory.create_stop_word_remover()
        # FIX: the original discarded stopword.remove()'s return value, so
        # stopwords were never actually removed from the query.
        queries = stopword.remove(queries)
        # stemming
        factory = StemmerFactory()
        stemmer = factory.create_stemmer()
        queries = stemmer.stem(queries)
        if algo == '1':
            response = jsonify(tfidf(data, queries, max_response))
        else:
            response = jsonify(lsa(data, queries, max_response))
        end = time.time()
        print("timer stop. Runtime: ")
        print(end - start)
        return response
def preprocessing_text(text):
    """Normalise Indonesian text: strip non-ASCII, URLs, digits and
    punctuation, then remove stopwords and stem."""
    encoded_string = text.encode("ascii", "ignore")  # remove non-ASCII
    text = encoded_string.decode()
    text = re.sub(r'http\S+', '', text)  # remove URLs
    text = text.lower()  # lowercase
    text = ''.join([i for i in text if not i.isdigit()])  # remove numbers
    punctuations = '''!()-[]{};:'"\,<>./?@#$%^&*“”‘’_~+=|\t\n'''
    # FIX: the original scanned the string character by character and called
    # str.replace per punctuation hit (quadratic). str.translate removes the
    # identical character set in one C-level pass.
    text = text.translate(str.maketrans('', '', punctuations))
    factory = StopWordRemoverFactory()
    stopword = factory.create_stop_word_remover()
    text = stopword.remove(text)
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    text = stemmer.stem(text)
    return text
def clean_data(self, path):
    """Clean a CSV of tweets (usernames, hashtags, digits, punctuation,
    links removed; stopwords removed; stemmed) and write lines with at
    least 5 words to ujaranbaru.txt, updating the progress bar."""
    path = str(path)
    # FIX: manage both file handles with context managers so they are
    # closed even if an exception is raised mid-way.
    with open('ujaranbaru.txt', 'w+') as bye, open(path, 'r') as readFile:
        lines = list(csv.reader(readFile))
        # Hoist factory construction out of the per-row loop (stateless).
        stop_factory = StopWordRemoverFactory()
        stopword = stop_factory.create_stop_word_remover()
        stem_factory = StemmerFactory()
        stemmer = stem_factory.create_stemmer()
        for i in range(len(lines)):
            str1 = ''.join(lines[i])
            caseF = str1.casefold()
            # FIX: raw strings for the regex patterns ('\s' is an invalid
            # string escape); pattern values are unchanged.
            Runame = re.sub(r'@[^\s]+', '', caseF)       # strip @mentions
            Rhashtag = re.sub(r'#[^\s]+', '', Runame)    # strip #hashtags
            CleanNumber = ''.join([c for c in Rhashtag if not c.isdigit()])
            line = re.sub(r"[(),'.!$]", '', CleanNumber)  # strip punctuation
            link = re.sub(r'https[^\s]+', '', line)       # strip links
            garing = re.sub(r'\\[^\s]+', '', link)        # strip \-escapes
            # NOTE(review): this removes the substring "rt" anywhere in the
            # text, not just a leading retweet marker — confirm intended.
            removeRT = garing.replace("rt", "")
            removespace = removeRT.lstrip()
            stopw = stopword.remove(removespace)
            steam = stemmer.stem(stopw)
            if len(steam.split()) >= 5:
                bye.write(steam + '\n')
            self.progressBar_6.setValue((i + 1) / len(lines) * 100)
def blogging():
    """Persist a new blog post plus a preprocessed copy of its content,
    then re-render the index page."""
    # Pull the form fields of the new entry.
    _title = request.form["title"]
    _content = request.form["content"]
    _link = request.form["link"]
    _date = request.form["date"]
    # Normalise quotes before storage.
    _title = _title.replace("'", "\"")
    _content2 = _content.replace("'", "")
    _content = _content.replace("'", "")
    # Pre-processing: stopword removal, then stemming.
    stopword_remover = StopWordRemoverFactory().create_stop_word_remover()
    _content2 = stopword_remover.remove(_content2)
    stemmer = StemmerFactory().create_stemmer()
    _content2 = stemmer.stem(_content2)
    BlogRepository().insert(Blog(_title, _content, _link, _content2, _date))
    return render_template('index.html')
def preprocess(text):
    """Clean Indonesian text: drop digits, expand common shorthands, remove
    stopwords, drop 1-2 letter words, tokenize and stem. Returns the
    processed string."""
    factory = StemmerFactory()
    stop_factory = StopWordRemoverFactory()
    ps = factory.create_stemmer()
    stopword = stop_factory.create_stop_word_remover()
    text = re.sub(r'[0-9]', '', text)  # remove digits
    # Shorthand -> full-word expansions (padded with spaces so only whole
    # tokens match).
    shorthand = [" no "," sy "," dg "," kk "," ga "," sdh "," bgn "," klw "," bbrp "," kbr "," dri "," dr "," jgn "," yg "," tdk ", " jt "," gk ", " atw "," klu ", " tsb "," utk "," tlh "]
    replacement= [" nomor "," saya "," dengan "," kakak "," tidak "," sudah "," bangun "," kalau "," beberapa "," kabar " ," dari "," dari "," jangan "," yang "," tidak "," juta "," tidak "," atau "," kalau "," tersebut "," untuk ", " telah"]
    for short, full in zip(shorthand, replacement):
        text = text.replace(short, full)
    text = stopword.remove(text)
    text = re.sub(r'\b\w{1,2}\b', '', text)  # drop 1-2 letter residue
    processed_text = ' '.join(ps.stem(token) for token in word_tokenize(text))
    # FIX: removed the TfidfVectorizer that was fitted on a single-document
    # list — its output (`x`) and `token_list` were never used (dead code).
    return processed_text
def run(self):
    """Worker entry point: read data.csv, stopword-remove + tokenize + stem
    every Indonesian row, emit progress, and write the results to
    post-preprocessing.csv. Runs exactly once (status flips to False)."""
    status = True
    while status:
        df = pd.read_csv('data.csv', sep=",")
        data = list(df["indonesia"].astype(str).str.lower())
        kd = []
        # Hoist factory construction out of the per-row loop (stateless;
        # pure perf win — the originals were rebuilt for every row).
        factory = StopWordRemoverFactory()
        stopword = factory.create_stop_word_remover()
        factory = StemmerFactory()
        stemmer = factory.create_stemmer()
        i = 1
        for d in data:
            # Tokenize after stopword removal.
            stop = nltk.tokenize.word_tokenize(stopword.remove(str(d)))
            # NOTE(review): str(stop) stems the *repr* of the token list
            # (including brackets and quotes) — confirm intended.
            katadasar = stemmer.stem(str(stop))
            kd.append(katadasar)
            # NOTE(review): emits 2..n+1 since i starts at 1 — off by one?
            self.update_progressbar.emit(i + 1)
            i = i + 1
        with open('post-preprocessing.csv', 'w', newline='', encoding='utf-8-sig') as csvfile:
            spamwriter = csv.writer(csvfile)
            spamwriter.writerow(["teks"])
            # FIX: the original iterated kd but indexed with a separate `no`
            # counter, ignoring the loop variable; iterate kd directly.
            for katadasar in kd:
                spamwriter.writerow([katadasar])
        status = False
def transform(self, X):
    """Clean a batch of tweets (URLs, non-alphanumerics, digits, short
    words, 'rt' markers removed; stopwords removed; stemmed) and return
    them as a numpy array. X is an iterable of 1-element sequences."""
    # Hoist compiled patterns and factories out of the per-tweet loop; the
    # originals were rebuilt for every tweet. FIX: patterns are raw strings
    # now ('\s' is an invalid str escape); pattern values are unchanged.
    url_re = re.compile(r'\shttp.+\s')
    https_re = re.compile(r'\shttps.+\s')
    pic_re = re.compile(r'\spic.+\s')
    ftp_re = re.compile(r'\sftp.+\s')
    nonalnum_re = re.compile('[^a-zA-Z0-9]')
    digit_re = re.compile('[0-9]+')
    short_re = re.compile(r'\W*\b\w{1,3}\b')
    rt_re = re.compile(r'rt\s')
    stop_factory = StopWordRemoverFactory()
    stopword = stop_factory.create_stop_word_remover()
    stem_factory = StemmerFactory()
    stemmer = stem_factory.create_stemmer()
    tweet_final = []
    for tweet in X:
        tweet = tweet[0].lower()
        tweet = url_re.sub('', tweet)
        tweet = https_re.sub('', tweet)
        tweet = pic_re.sub('', tweet)
        tweet = ftp_re.sub('', tweet)
        tweet = nonalnum_re.sub(' ', tweet)
        tweet = digit_re.sub('', tweet)
        tweet = short_re.sub('', tweet)  # drop words of 1-3 chars
        tweet = rt_re.sub(' ', tweet)    # drop retweet marker
        tweet = stopword.remove(tweet)
        tweet = stemmer.stem(tweet)
        tweet_final.append(tweet)
    return np.array(tweet_final)
def pre_processing(doc):
    """NLP pre-processing: lowercase, tokenize (digits/punctuation dropped
    by the \\w+ tokenizer), remove stopwords, stem, then detokenize.

    NOTE(review): every document in `doc` is processed but only the FIRST
    result is returned — confirm intended.
    """
    tokenizer = RegexpTokenizer(r'\w+')
    texts = []
    # Stemmer - reduce inflected words to their base form
    stFactory = StemmerFactory()
    st = stFactory.create_stemmer()
    # Stopword remover - drop common meaningless words
    swFactory = StopWordRemoverFactory()
    sw = swFactory.create_stop_word_remover()
    for i in doc:
        raw = i.lower()
        tokens = tokenizer.tokenize(raw)
        # FIX: the original `t in sw.remove(t)` was an obscure substring
        # check; for a single token sw.remove() returns either the token
        # itself or '', so plain truthiness expresses the same filter.
        stopped_tokens = [t for t in tokens if sw.remove(t)]
        stemmed_tokens = [st.stem(t) for t in stopped_tokens]
        texts.append(stemmed_tokens)
    # Detokenize, preserving the original leading-space join behaviour.
    detok = [''.join(' ' + word for word in row) for row in texts]
    return detok[0]
def stopword(self):
    """Return each article body in self.dokumen with stopwords removed."""
    remover = StopWordRemoverFactory().create_stop_word_remover()
    return [remover.remove(artikel.isi) for artikel in self.dokumen]
def StopwordRemoval(self, text):
    """Join the token list, strip stopwords, and split back on spaces."""
    remover = StopWordRemoverFactory().create_stop_word_remover()
    joined = ' '.join(text)
    cleaned = remover.remove(joined)
    return cleaned.split(' ')
def remove_stop_word(self, X):
    """Remove Indonesian stopwords from every element of X in place.

    X: indexable string collection exposing .shape (e.g. a numpy array of
    objects/str) — modified in place and also returned.
    """
    factory = StopWordRemoverFactory()
    stopword = factory.create_stop_word_remover()
    for i in range(X.shape[0]):
        X[i] = stopword.remove(X[i])
        # FIX: dropped the original `if len(X[i]) == 0: X[i] = ''` branch —
        # remove() returns a str, and a zero-length str is already '', so
        # the assignment was a no-op.
    return X
def remove_stopword(self):
    """Strip stopwords from every word in the sentence, leaving the target
    word (self.word) untouched."""
    factory = StopWordRemoverFactory()
    stopword = factory.create_stop_word_remover()
    cleaned = []
    for word in self.__sentence:
        if word == self.word:
            cleaned.append(word)  # never filter the word under analysis
        else:
            cleaned.append(stopword.remove(word))
    self.__sentence = cleaned
class Test_StopWordRemoverFactoryTest(unittest.TestCase):
    """Factory smoke test: the factory must build StopWordRemover objects."""

    def setUp(self):
        self.factory = StopWordRemoverFactory()
        return super(Test_StopWordRemoverFactoryTest, self).setUp()

    def test_createStopWordRemover(self):
        self.assertIsInstance(
            self.factory.create_stop_word_remover(), StopWordRemover)
def main():
    """Interactive entry point: option 1 scrapes crime-related tweets from
    Indonesian news portals into tweets.txt; option 2 stems and
    stopword-cleans crime_news.txt and prints the result."""
    usrinp = int(input("1. Get data, 2. Crime Word Cloud, :"))
    if usrinp == 1:
        # SECURITY(flagged): hard-coded Twitter API credentials committed to
        # source — revoke these and load them from environment variables or
        # a config file instead.
        consumer_key = 'fLnf3WIuilQI8XDAy36L4HCXp'
        consumer_secret = 'Qi2rtmYVSm1t6ATC7J4McALCHgXHGLUnWgoHHBvU4q9JQraiCv'
        access_token = '255792021-VQMmhleyYXcLGXXFXcf3cGwkM0FCKOKpwBpxe6fb'
        access_secret = '1RpbrPaaSd92x8dNt0wupVNx0MsyGqSxsfqRmIu44XcSN'
        auth = OAuthHandler(consumer_key, consumer_secret)
        auth.set_access_token(access_token, access_secret)
        api = tweepy.API(auth)
        portal = [
            'detikcom', 'kompascom', 'CNNIndonesia', 'Metro_TV', 'Beritasatu',
            'liputan6dotcom', 'SINDOnews',
        ]
        crime_text = [
            'begal', 'hacker', 'korupsi', 'tipu', 'koruptor', 'tewas',
            'leceh', 'seksual', 'curi', 'tembak', 'curi', 'sabu', 'ganja',
            'narkoba'
        ]
        # FIX: use a context manager so the output file is closed even if a
        # timeline fetch raises.
        with open('tweets.txt', 'w') as text_file:
            for i in portal:
                stuff = api.user_timeline(screen_name=i, count=10000,
                                          include_rts=True)
                for j in stuff:
                    # Keep only tweets mentioning a crime keyword.
                    word = j.text.lower()
                    if any(ext in word for ext in crime_text):
                        text_file.write(j.text)
                        text_file.write("\n")
    elif usrinp == 2:
        # FIX: close the input file deterministically with `with`.
        with open('crime_news.txt') as news_file:
            wc_text_file = news_file.readlines()
        factory_stem = StemmerFactory()
        stemmer = factory_stem.create_stemmer()
        factory_stop = StopWordRemoverFactory()
        stopword = factory_stop.create_stop_word_remover()
        for i in wc_text_file:
            output_stem = stemmer.stem(i)
            output_stop = stopword.remove(output_stem)
            print(output_stop)
def debug():
    """Admin-only debug route: sanity-check stopword removal on a fixed
    sentence and print the result."""
    if 'admin' not in session:
        return redirect(url_for("index"))
    # FIX: renamed the local variable — it was named `csv`, shadowing the
    # stdlib csv module. The frame is loaded but otherwise unused here.
    dataset = pd.read_excel("dataset.xlsx")
    factory = StopWordRemoverFactory()
    stopword = factory.create_stop_word_remover()
    print(stopword.remove("Jika untuk aku anak testing"))
    return "lalala"
def process(txtString):
    """Strip digits and non-alphanumerics, lowercase, then drop stopwords."""
    remover = StopWordRemoverFactory().create_stop_word_remover()
    no_digits = txtString.translate(str.maketrans('', '', digits))
    cleaned = re.sub('[^A-Za-z0-9]+', ' ', no_digits)
    return remover.remove(cleaned.lower())
def removeStopWord(self):
    """Apply stopword removal to every channel's text in the corpus.
    Returns self for chaining."""
    remover = StopWordRemoverFactory().create_stop_word_remover()
    for channel, text in self.corpus.items():
        self.corpus[channel] = remover.remove(text)
    return self
def debug():
    """Admin-only debug route: sanity-check stopword removal on a fixed
    sentence and print the result."""
    if 'admin' not in session:
        return redirect(url_for("index"))
    # FIX: renamed the local variable — it was named `csv`, shadowing the
    # stdlib csv module. The frame is loaded but otherwise unused here.
    dataset = pd.read_excel("dataset.xlsx")
    factory = StopWordRemoverFactory()
    stopword = factory.create_stop_word_remover()
    print(stopword.remove("Kepala otak kau lek, medan aman atau berkah?"))
    return "sarjana kombur"
def word_features(self, words):
    """Strip markup/non-word chars, remove stopwords, stem, and return a
    {lowercased_token: True} feature dict for a classifier."""
    cleaned = re.sub(r'(<.+>)|\W', ' ', words)
    stop_remover = StopWordRemoverFactory().create_stop_word_remover()
    stemmer = StemmerFactory().create_stemmer()
    stemmed = stemmer.stem(stop_remover.remove(cleaned))
    return {token.lower(): True for token in stemmed.split()}
def transform(self, X):
    """Clean a batch of tweets (URLs, non-alphanumerics, digits, short
    words, 'rt' markers removed; slang replaced from a word list; stemmed;
    stopwords removed) and return them as a numpy array."""
    # Hoist compiled patterns out of the per-tweet loop; the originals were
    # rebuilt for every tweet. Patterns use raw strings ('\s' is an invalid
    # str escape); pattern values are unchanged.
    url_re = re.compile(r'\shttp.+\s')
    https_re = re.compile(r'\shttps.+\s')
    pic_re = re.compile(r'\spic.+\s')
    ftp_re = re.compile(r'\sftp.+\s')
    nonalnum_re = re.compile('[^a-zA-Z0-9]')
    digit_re = re.compile('[0-9]+')
    short_re = re.compile(r'\W*\b\w{1,3}\b')
    rt_re = re.compile(r'rt\s')
    # FIX: the replacement-word file was re-opened and re-parsed for EVERY
    # tweet, and the handle was never closed; load it once up front.
    with open('replacement_word_list.txt') as word_file:
        replacement_words_list = [
            line.rstrip('\n').rstrip('\r') for line in word_file
        ]
    replacement_words = {}
    for replacement_word in replacement_words_list:
        parts = replacement_word.split(',')
        replacement_words[parts[0]] = parts[1]
    # Factories are stateless — build once.
    stem_factory = StemmerFactory()
    stemmer = stem_factory.create_stemmer()
    stopword_factory = StopWordRemoverFactory()
    stopword = stopword_factory.create_stop_word_remover()
    tweet_final = []
    for tweet in X:
        tweet = tweet[0].lower()
        tweet = url_re.sub('', tweet)
        tweet = https_re.sub('', tweet)
        tweet = pic_re.sub('', tweet)
        tweet = ftp_re.sub('', tweet)
        tweet = nonalnum_re.sub(' ', tweet)
        tweet = digit_re.sub('', tweet)
        tweet = short_re.sub('', tweet)  # drop words of 1-3 chars
        tweet = rt_re.sub(' ', tweet)    # drop retweet marker
        # Replace slang/abbreviations with their dictionary form.
        new_string = []
        for word in tweet.split():
            new_string.append(replacement_words.get(word, word))
        tweet = ' '.join(new_string)
        tweet = stemmer.stem(tweet)
        tweet = stopword.remove(tweet)
        tweet_final.append(tweet)
    return np.array(tweet_final)
def cleanStopWord(sentence):
    """Remove stopwords and punctuation, drop a few extra filler words,
    and return the remaining words lowercased as a list."""
    factory = StopWordRemoverFactory()
    extra_ignore = ['lah', 'eh', 'ini', 'itu', 'loh']  # extra stop words
    remover = factory.create_stop_word_remover()
    # Strip stopwords first, then split on non-word characters.
    without_stopwords = remover.remove(sentence)
    tokens = re.sub(r"[^\w]", " ", without_stopwords).split()
    # Filter the extras (checked against the original casing, as before),
    # lowercasing whatever survives.
    return [token.lower() for token in tokens if token not in extra_ignore]
def preprosesing(txt):
    """Per-token preprocessing: keep alphabetic tokens only, strip
    stopwords, stem, and rejoin (each token keeps a trailing space)."""
    stop_remover = StopWordRemoverFactory().create_stop_word_remover()
    stemmer = StemmerFactory().create_stemmer()
    # Each processed token is emitted with a trailing space, exactly like
    # the original accumulation loop.
    return ''.join(
        stemmer.stem(stop_remover.remove(token)) + ' '
        for token in txt.split()
        if token.isalpha()
    )
def preprosesing(txt):
    """Remove unimportant (stop)words, then stem the result to base form."""
    # Stopword removal
    stopword_remover = StopWordRemoverFactory().create_stop_word_remover()
    without_stopwords = stopword_remover.remove(txt)
    # Stemming to the root word
    stemmer = StemmerFactory().create_stemmer()
    return stemmer.stem(without_stopwords)
def preprocessingtext(text):
    """Clean Indonesian tweet text: remove stopwords, strip mentions,
    hashtags, URLs, emoticons and punctuation via a fixed chain of regex
    substitutions, collapse whitespace, then lowercase."""
    factory = StopWordRemoverFactory()
    stopword = factory.create_stop_word_remover()
    # Stopword removal first.
    satu = stopword.remove(text)
    # Strip mentions, hashtags, URLs and assorted punctuation/emoticons.
    # The substitution order is intentional — do not reorder.
    dua = re.sub(r"@[^\s]+", " ", satu)
    dua = re.sub(r"#[^\s]+", " ", dua)
    dua = re.sub(r"\.", " ", dua)
    dua = re.sub(r"http[^\s]+", " ", dua)
    dua = re.sub(r"\?", " ", dua)
    dua = re.sub(r",", " ", dua)
    dua = re.sub(r"”", " ", dua)
    dua = re.sub(r"co/[^\s]+", " ", dua)
    dua = re.sub(r":'\)", " ", dua)
    dua = re.sub(r":\)", "", dua)
    dua = re.sub(r"&", " ", dua)
    # FIX: the "\g<1>" replacement strings are raw now — \g is an invalid
    # string escape and raises a SyntaxWarning on modern Python. The
    # replacement value is unchanged (unwrap quoted/parenthesised text).
    dua = re.sub(r'\"([^\"]+)\"', r"\g<1>", dua)
    dua = re.sub(r'\([^\)]+\"', "", dua)
    dua = re.sub(r'\((.+)\)', r"\g<1>", dua)
    dua = re.sub(r'-', " ", dua)
    dua = re.sub(r':\(', " ", dua)
    dua = re.sub(r':', " ", dua)
    dua = re.sub(r'\(', " ", dua)
    dua = re.sub(r'\)', " ", dua)
    dua = re.sub(r"'", " ", dua)
    dua = re.sub(r'"', " ", dua)
    dua = re.sub(r';', " ", dua)
    dua = re.sub(r':v', " ", dua)
    dua = re.sub(r'²', " ", dua)
    dua = re.sub(r':"\)', " ", dua)
    dua = re.sub(r'\[\]', " ", dua)
    dua = re.sub(r'“', "", dua)
    dua = re.sub(r'_', " ", dua)
    dua = re.sub(r'—', " ", dua)
    dua = re.sub(r'…', " ", dua)
    dua = re.sub(r'=', " ", dua)
    dua = re.sub(r'\/', " ", dua)
    dua = re.sub(r'\[\w+\]', " ", dua)
    dua = re.sub(r'!', " ", dua)
    # NOTE(review): duplicate of the single-quote substitution above —
    # kept to preserve the original chain exactly.
    dua = re.sub(r"'", " ", dua)
    # Collapse whitespace, drop a leading RT marker, and trim.
    dua = re.sub(r'\s+', " ", dua)
    dua = re.sub(r'^RT', "", dua)
    dua = re.sub(r'\s+$', "", dua)
    dua = re.sub(r'^\s+', "", dua)
    # Lowercase, then remove backslash-escape residue.
    tiga = dua.lower()
    tiga = re.sub(r"\\[^\s]+", " ", tiga)
    return tiga