# Common imports assumed by these snippets (standard PySastrawi paths):
#   from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
#   from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
#   from Sastrawi.StopWordRemover.StopWordRemover import StopWordRemover
#   from Sastrawi.Dictionary.ArrayDictionary import ArrayDictionary

def normalisasi(self, dokumen):
    # Split each article into sorted tokens.
    store = []
    for artikel in dokumen:
        split = artikel.split()
        split.sort()
        store.append(split)
    factory = StopWordRemoverFactory()
    stopword = factory.create_stop_word_remover()
    store_n = []
    for x, sentence in enumerate(store):
        store_n.append([])
        for word in sentence:
            plus = stopword.remove(word)
            # Keep words of 3-13 characters with no digits or hyphens.
            if plus != '' and not self.hasNumbers(plus) \
                    and '-' not in plus and 3 <= len(plus) <= 13:
                store_n[x].append(plus)
    # Filter out empty strings without mutating the lists while iterating
    # (the original `del store_n[i][j]` inside the loop skipped elements).
    store_n = [[kata for kata in kalimat if kata != ''] for kalimat in store_n]
    return store_n
def stopwords_removal(self, text, stopwords, output_stopwords):
    # Read the input text and the extra stopword list (the module-level
    # paths dataOutputPath and pathStopwords come from the original code).
    with open(dataOutputPath, encoding='utf-8') as f:
        text = f.read()
    with open(stopwords, encoding='utf-8') as f:
        list_stopwords = f.read()
    stop_factory = StopWordRemoverFactory()
    more_stopwords = list_stopwords.split("\n")
    # Add the new stopwords to Sastrawi's defaults and build a remover from
    # the merged list (the original computed `data` but then used the
    # default remover, so the extra stopwords were never applied).
    data = stop_factory.get_stop_words() + more_stopwords
    stopword = StopWordRemover(ArrayDictionary(data))
    remove_stopwords = stopword.remove(text)
    with open(pathStopwords, 'w', encoding='utf-8') as f:
        f.write(remove_stopwords)
    print("Stopwords Removal success!\nCount Words Frequency on process...")
    return remove_stopwords
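# A minimal, self-contained sketch of the merge pattern above, assuming the
# standard PySastrawi import paths (the extra words are illustrative):
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.StopWordRemover.StopWordRemover import StopWordRemover
from Sastrawi.Dictionary.ArrayDictionary import ArrayDictionary

factory = StopWordRemoverFactory()
custom = factory.get_stop_words() + ['kak', 'dong']  # defaults + extra words
remover = StopWordRemover(ArrayDictionary(custom))
# Drops both the default and the custom stopwords.
print(remover.remove('kak saya pergi ke pasar dong'))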
def transform(self, X):
    # Build the Sastrawi tools once; creating the stemmer per tweet is very
    # slow because it reloads its dictionary each time.
    stop_factory = StopWordRemoverFactory()
    stopword = stop_factory.create_stop_word_remover()
    stem_factory = StemmerFactory()
    stemmer = stem_factory.create_stemmer()
    tweet_final = []
    for tweet in X:
        tweet = tweet[0]
        tweet = tweet.lower()
        # Strip URLs, twitter picture links, and ftp links (raw strings
        # avoid invalid-escape warnings; the http pattern also covers https).
        tweet = re.sub(r'\shttp.+\s', '', tweet)
        tweet = re.sub(r'\spic.+\s', '', tweet)
        tweet = re.sub(r'\sftp.+\s', '', tweet)
        # Keep alphanumerics, then drop digits and words of 1-3 letters.
        tweet = re.sub(r'[^a-zA-Z0-9]', ' ', tweet)
        tweet = re.sub(r'[0-9]+', '', tweet)
        tweet = re.sub(r'\W*\b\w{1,3}\b', '', tweet)
        tweet = re.sub(r'rt\s', ' ', tweet)  # drop the retweet marker
        # Remove stopwords, then stem.
        tweet = stopword.remove(tweet)
        tweet = stemmer.stem(tweet)
        tweet_final.append(tweet)
    return np.array(tweet_final)
def clean_data(self, path):
    path = str(path)
    # Build the Sastrawi tools once, outside the loop.
    stop_factory = StopWordRemoverFactory()
    stopword = stop_factory.create_stop_word_remover()
    stem_factory = StemmerFactory()
    stemmer = stem_factory.create_stemmer()
    with open(path, 'r') as readFile, open('ujaranbaru.txt', 'w+') as bye:
        reader = csv.reader(readFile)
        lines = list(reader)
        for i in range(len(lines)):
            str1 = ''.join(lines[i])
            caseF = str1.casefold()
            Runame = re.sub(r'@[^\s]+', '', caseF)        # drop @mentions
            Rhashtag = re.sub(r'#[^\s]+', '', Runame)     # drop hashtags
            CleanNumber = ''.join(c for c in Rhashtag if not c.isdigit())
            line = re.sub(r"[(),'.!$]", '', CleanNumber)  # drop punctuation
            link = re.sub(r'https[^\s]+', '', line)       # drop URLs
            garing = re.sub(r'\\[^\s]+', '', link)        # drop backslash escapes
            # Remove only the standalone "rt" marker; the original
            # replace("rt", "") also mangled "rt" inside words.
            removeRT = re.sub(r'\brt\b', '', garing)
            removespace = removeRT.lstrip()
            stopw = stopword.remove(removespace)
            steam = stemmer.stem(stopw)
            text = steam.split()
            if len(text) >= 5:
                bye.write(steam + '\n')
            # setValue expects an int.
            self.progressBar_6.setValue(int((i + 1) / len(lines) * 100))
import re
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

def preprocessing_text(text):
    text = text.encode("ascii", "ignore").decode()          # drop non-ASCII
    text = re.sub(r'http\S+', '', text)                     # remove URLs
    text = text.lower()                                     # lowercase
    text = ''.join(ch for ch in text if not ch.isdigit())   # remove digits
    # Remove punctuation.
    punctuations = '''!()-[]{};:'"\,<>./?@#$%^&*“”‘’_~+=|\t\n'''
    text = ''.join(ch for ch in text if ch not in punctuations)
    factory = StopWordRemoverFactory()
    stopword = factory.create_stop_word_remover()
    text = stopword.remove(text)
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    text = stemmer.stem(text)
    return text
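# A quick usage sketch for the pipeline above; the sample sentence is
# illustrative, and the exact output depends on Sastrawi's dictionaries:
if __name__ == '__main__':
    sample = 'Saya membaca 2 artikel di https://example.com!'
    print(preprocessing_text(sample))  # e.g. 'baca artikel'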
def predict():
    '''Render prediction results on the HTML GUI.'''
    # Load the pickled model, vectorizer, and label encoder.
    with open('model_news.pkl', 'rb') as f:
        model = pickle.load(f)
    with open('news_cv.pkl', 'rb') as f:
        cv_ = pickle.load(f)
    with open('enc_news.pkl', 'rb') as f:
        enc_ = pickle.load(f)
    y_for_test = pd.Series(request.form['news_'])
    factory = StopWordRemoverFactory()
    stopword = factory.create_stop_word_remover()
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    xy = []
    for i in y_for_test.values:
        stops_ = stopword.remove(stemmer.stem(i))
        # Keep only purely alphabetic tokens.
        wordy_ = ' '.join(st for st in stops_.split(' ') if st.isalpha())
        xy.append(wordy_)
    x_t = cv_.transform(xy)
    resu = model.predict(x_t)
    print('prediction:')
    s = [str(i) for i in enc_.inverse_transform(resu)]
    res = ", ".join(s)
    # "Topiknya adalah {}. Ya kan?" = "The topic is {}. Right?"
    return render_template(
        'index.html',
        prediction_text='Topiknya adalah {}. Ya kan?'.format(res))
def blogging():
    # Read the blog fields from the form.
    _title = request.form["title"]
    _content = request.form["content"]
    _link = request.form["link"]
    _date = request.form["date"]
    # Normalize quotes so the values can be stored safely.
    _title = _title.replace("'", "\"")
    _content2 = _content.replace("'", "")
    _content = _content.replace("'", "")
    # *** PRE-PROCESSING ***
    # Stopword removal.
    factory = StopWordRemoverFactory()
    stopword = factory.create_stop_word_remover()
    _content2 = stopword.remove(_content2)
    # Stemming.
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    _content2 = stemmer.stem(_content2)
    # Insert into the Blog DB (and, downstream, the Tf DB).
    BlogRepository().insert(Blog(_title, _content, _link, _content2, _date))
    return render_template('index.html')
def preprocess(text):
    factory = StemmerFactory()
    stop_factory = StopWordRemoverFactory()
    ps = factory.create_stemmer()
    stopword = stop_factory.create_stop_word_remover()
    text = re.sub(r'[0-9]', '', text)  # remove digits
    # Expand common Indonesian shorthand; the surrounding spaces matter so
    # that only whole words are replaced (the original's last entry,
    # " telah", was missing its trailing space).
    shorthand = [" no ", " sy ", " dg ", " kk ", " ga ", " sdh ", " bgn ",
                 " klw ", " bbrp ", " kbr ", " dri ", " dr ", " jgn ",
                 " yg ", " tdk ", " jt ", " gk ", " atw ", " klu ",
                 " tsb ", " utk ", " tlh "]
    replacement = [" nomor ", " saya ", " dengan ", " kakak ", " tidak ",
                   " sudah ", " bangun ", " kalau ", " beberapa ", " kabar ",
                   " dari ", " dari ", " jangan ", " yang ", " tidak ",
                   " juta ", " tidak ", " atau ", " kalau ", " tersebut ",
                   " untuk ", " telah "]
    for short, full in zip(shorthand, replacement):
        text = text.replace(short, full)
    text = stopword.remove(text)
    text = re.sub(r'\b\w{1,2}\b', '', text)  # drop 1-2 letter tokens
    # Stem each token (the original also ran a TfidfVectorizer here, but
    # its result was never used, so it is dropped).
    processed_text = ' '.join(ps.stem(token) for token in word_tokenize(text))
    return processed_text
def __init__(self):
    # Load extra stopwords from file and merge them with Sastrawi's defaults.
    with open('./stopwords.txt', encoding='utf-8') as f:
        more_stopword = f.read().split('\n')
    SWfactory = StopWordRemoverFactory()
    stopword_data = ArrayDictionary(more_stopword + SWfactory.get_stop_words())
    self.stopword = StopWordRemover(stopword_data)
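# Hedged usage sketch for the class above; the class name Preprocessor is
# assumed, since the original snippet only shows its __init__:
# pre = Preprocessor()
# clean = pre.stopword.remove('saya pergi ke pasar')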
def Preprocessing(data):
    print("Preprocessing")
    cleanData = []
    tokenizer = RegexpTokenizer(r'\w+')
    factory_stopwords = StopWordRemoverFactory()
    # A set makes the per-token membership test O(1).
    stopwordsFact = set(factory_stopwords.get_stop_words())
    stemmer = StemmerFactory().create_stemmer()
    count = 0
    for kalimat in data:
        removedHttp = re.sub(r"http\S+", '', kalimat)            # remove http links
        removedPic = re.sub(r"pic\.twitter\S+", '', removedHttp)  # remove pic.twitter links
        lower = removedPic.lower()                               # case folding
        tokenized = tokenizer.tokenize(lower)                    # tokenize + drop punctuation
        # Stopword removal.
        stopwords = [kata for kata in tokenized if kata not in stopwordsFact]
        # Stemming.
        stemmed = [stemmer.stem(kata) for kata in stopwords]
        cleanData.append(stemmed)
        count += 1
        print(count)
    return cleanData
def pre_processing(doc):
    # NLP text pre-processing.
    # Regular expression tokenizer - drops digits and punctuation.
    tokenizer = RegexpTokenizer(r'\w+')
    texts = []
    # Stemmer - reduces inflected/affixed words to their base form.
    stFactory = StemmerFactory()
    st = stFactory.create_stemmer()
    # Stopword remover - drops common words that carry no meaning.
    swFactory = StopWordRemoverFactory()
    sw = swFactory.create_stop_word_remover()
    for i in doc:
        raw = i.lower()  # lowercase
        tokens = tokenizer.tokenize(raw)
        # sw.remove(t) returns '' for a stopword and the word itself
        # otherwise (clearer than the original `if i in sw.remove(i)`).
        stopped_tokens = [t for t in tokens if sw.remove(t) != '']
        stemmed_tokens = [st.stem(t) for t in stopped_tokens]
        texts.append(stemmed_tokens)
    # Detokenize each document back into a single string.
    texts = [' '.join(row) for row in texts]
    return texts[0]
def modif(kalimat):
    # nltk.download('punkt')  # needed once for word_tokenize
    # Case folding and punctuation removal.
    kalimat = kalimat.translate(str.maketrans('', '', string.punctuation)).lower()
    tokens = nltk.tokenize.word_tokenize(kalimat)  # tokenization
    # Build a custom stopword list.
    fac = StopWordRemoverFactory()
    stop = fac.get_stop_words()
    stop.append("kak")     # add "kak" to the stopword list
    stop.remove("tidak")   # keep the negation "tidak"
    stop.remove("boleh")   # keep "boleh"
    stop.remove("bisa")    # keep "bisa"
    stop.remove("dimana")  # keep "dimana"
    # Stopword removal.
    removed = [t for t in tokens if t not in stop]
    return ' '.join(removed)
class Test_StopWordRemoverFactoryTest(unittest.TestCase):
    def setUp(self):
        self.factory = StopWordRemoverFactory()
        return super(Test_StopWordRemoverFactoryTest, self).setUp()

    def test_createStopWordRemover(self):
        self.assertIsInstance(self.factory.create_stop_word_remover(),
                              StopWordRemover)

    def test_stopwordRemoval(self):
        sremover = self.factory.create_stop_word_remover()
        self.assertEqual('pergi sekolah', sremover.remove('pergi ke sekolah yang'))
        self.assertEqual('makan rumah', sremover.remove('makan di rumah yang'))

    def test_tokens_stopwordRemoval(self):
        tokens = ['pergi', 'ke', 'sekolah', 'yang', 'bagus', 'adalah', 'impian']
        sremover = self.factory.create_stop_word_remover()
        clean_tokens = sremover.remove_tokens(tokens)
        text = ' '.join(clean_tokens)
        # assertEquals is deprecated; use assertEqual.
        self.assertEqual('pergi sekolah bagus impian', text)
        self.assertEqual('pergi', clean_tokens[0])
        self.assertEqual('sekolah', clean_tokens[1])
        self.assertEqual('bagus', clean_tokens[2])
        self.assertEqual('impian', clean_tokens[3])

    def test_execution_time(self):
        start = time.time()
        sentence = ('Rakyat memenuHi halaMan geDung DPR unTuk menyuarakan isi '
                    'hatinya. Saat Itu, situasi sangat genting sekali. Terjadi '
                    'kerusuhan yang mengiringi pergerakan mahasiswa yang '
                    'memperjuangkan reformasi.')
        sremover = self.factory.create_stop_word_remover()
        sremover.remove(sentence)
        end = time.time()
        execution_time = end - start
        self.assertTrue(execution_time < 1)
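# Standard unittest entry point so the test case above can be run directly:
if __name__ == '__main__':
    unittest.main()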
def preprocessing_text(text):
    text = text.encode("ascii", "ignore").decode()          # drop non-ASCII
    text = text.lower()                                     # lowercase
    text = ''.join(ch for ch in text if not ch.isdigit())   # remove digits
    text = re.sub(r'http\S+', '', text)                     # remove URLs
    factory = StopWordRemoverFactory()
    stopword = factory.create_stop_word_remover()
    text = stopword.remove(text)
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    text = stemmer.stem(text)
    text = re.sub(r'[^\w\s]', '', text)             # remove punctuation
    text = re.sub(r'[/(){}\[\]\|@,;#_]', '', text)  # also strip underscores
    return text
def run(self):
    df = pd.read_csv('data.csv', sep=",")
    data = list(df["indonesia"].astype(str).str.lower())
    # Build the Sastrawi tools once; the original rebuilt both factories on
    # every row, which is very slow.
    stop_factory = StopWordRemoverFactory()
    stopword = stop_factory.create_stop_word_remover()
    stem_factory = StemmerFactory()
    stemmer = stem_factory.create_stemmer()
    kd = []
    for i, d in enumerate(data):
        # Stopword removal, then tokenization.
        stop = nltk.tokenize.word_tokenize(stopword.remove(str(d)))
        # Stem the joined tokens (the original stemmed str(stop), i.e. the
        # list's repr, brackets and quotes included).
        katadasar = stemmer.stem(' '.join(stop))
        kd.append(katadasar)
        self.update_progressbar.emit(i + 1)
    with open('post-preprocessing.csv', 'w', newline='',
              encoding='utf-8-sig') as csvfile:
        spamwriter = csv.writer(csvfile)
        spamwriter.writerow(["teks"])
        for d in kd:
            spamwriter.writerow([d])
def preproses(request):
    baca_db = CrawlDetikNews.objects.all()
    # Initialize so the render call below cannot raise a NameError when no
    # row falls in the processed range.
    sentence = output = ""
    kounter = 0
    for baca in baca_db:
        kounter += 1
        # Process only rows 498-500 of the crawl results.
        if kounter > 497 and kounter <= 500:
            # Stemming.
            factory = StemmerFactory()
            stemmer = factory.create_stemmer()
            sentence = baca.headline + " " + baca.content
            output = stemmer.stem(sentence)
            baca.stemming = output
            # Stopword removal.
            fa = StopWordRemoverFactory()
            stopword = fa.create_stop_word_remover()
            stop = stopword.remove(output)
            stop = stop.replace(' - ', ' ')
            output = stop
            baca.stopword = output
            baca.save()
    return render(request, 'beranda/preprocessing.html', {
        "rootword": output,
        "ori": sentence
    })
def search():
    if request.method == 'GET':
        data = json.load(open('data/testing_data.json', encoding="utf-8"))
        algo = request.args.get('algo', '1')
        query1 = request.args.get('q1', '')
        query2 = request.args.get('q2', '')
        query3 = request.args.get('q3', '')
        max_response = request.args.get("max_resp", 10, type=int)
        start = time.time()
        print("timer start")
        queries = query1 + " " + query2 + " " + query3
        # Remove stopwords (the original discarded the result of
        # stopword.remove(queries), so the removal had no effect).
        factory = StopWordRemoverFactory()
        stopword = factory.create_stop_word_remover()
        queries = stopword.remove(queries)
        # Stemming.
        factory = StemmerFactory()
        stemmer = factory.create_stemmer()
        queries = stemmer.stem(queries)
        if algo == '1':
            response = jsonify(tfidf(data, queries, max_response))
        else:
            response = jsonify(lsa(data, queries, max_response))
        end = time.time()
        print("timer stop. Runtime: ")
        print(end - start)
        return response
def stopword(self):
    factory = StopWordRemoverFactory()
    stopword = factory.create_stop_word_remover()
    # Remove stopwords from the body of every article.
    filtering = []
    for artikel in self.dokumen:
        plus = stopword.remove(artikel.isi)
        filtering.append(plus)
    return filtering
def remove_stop_word(self, X):
    factory = StopWordRemoverFactory()
    stopword = factory.create_stop_word_remover()
    for i in range(X.shape[0]):
        X[i] = stopword.remove(X[i])
        # (The original reset empty results to '', which is a no-op.)
    return X
def stopword(self):
    # Extend Sastrawi's default stopword list with extra words (the
    # original also rebound stop_factory to an unused factory instance).
    stop_words = StopWordRemoverFactory().get_stop_words()
    more_stopword = ['diatur', 'perjodohan', 'dengan', 'ia', 'bahwa',
                     'oleh', 'nya']
    dictionary = ArrayDictionary(stop_words + more_stopword)
    self.stopword = StopWordRemover(dictionary)
def StopwordRemoval(self, text):
    factory = StopWordRemoverFactory()
    stopword = factory.create_stop_word_remover()
    # Join the token list, remove stopwords, and split back into tokens.
    text = ' '.join(text)
    removed = stopword.remove(text)
    return removed.split(' ')
def __init__(self, title: str, plot: str, human_synopsis: str):
    self.title = title
    self.plot = plot
    self.human_synopsis = human_synopsis
    self.stopwords = StopWordRemoverFactory().create_stop_word_remover()
    self.stemmer = StemmerFactory().create_stemmer()
    self.ringkasan = ""  # the generated summary
def remove_stopword(self):
    # Remove stopwords, but leave the target word itself untouched.
    factory = StopWordRemoverFactory()
    stopword = factory.create_stop_word_remover()
    self.__sentence = [
        stopword.remove(word) if word != self.word else word
        for word in self.__sentence
    ]
def generate_sastrawi_stopwords():
    # Get the Sastrawi stopwords as a list.
    factory = StopWordRemoverFactory()
    stopwords = factory.get_stop_words()
    # Write them to a txt file, one word per line.
    with open(stopwords_list_path + '/sastrawi-stopwords.txt', 'w') as file:
        for word in stopwords:
            file.write(word + "\n")
def word_tokenizer(text):
    # Tokenize the text, drop stopwords, and stem each remaining token.
    tokens = word_tokenize(text)
    stemmer = StemmerFactory().create_stemmer()
    # Fetch the stopword list once; the original called get_stop_words()
    # inside the comprehension, once per token.
    stop_words = set(StopWordRemoverFactory().get_stop_words())
    return [stemmer.stem(t) for t in tokens if t not in stop_words]
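# The tokenizer above is shaped for scikit-learn vectorizers; a minimal
# sketch of that wiring (the corpus is illustrative):
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = ['Saya suka membaca buku', 'Dia pergi ke sekolah']
vectorizer = TfidfVectorizer(tokenizer=word_tokenizer)
X = vectorizer.fit_transform(corpus)  # stemmed, stopword-free features
print(vectorizer.get_feature_names_out())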
def main():
    usrinp = int(input("1. Get data, 2. Crime Word Cloud, :"))
    if usrinp == 1:
        consumer_key = 'fLnf3WIuilQI8XDAy36L4HCXp'
        consumer_secret = 'Qi2rtmYVSm1t6ATC7J4McALCHgXHGLUnWgoHHBvU4q9JQraiCv'
        access_token = '255792021-VQMmhleyYXcLGXXFXcf3cGwkM0FCKOKpwBpxe6fb'
        access_secret = '1RpbrPaaSd92x8dNt0wupVNx0MsyGqSxsfqRmIu44XcSN'
        auth = OAuthHandler(consumer_key, consumer_secret)
        auth.set_access_token(access_token, access_secret)
        api = tweepy.API(auth)
        portal = ['detikcom', 'kompascom', 'CNNIndonesia', 'Metro_TV',
                  'Beritasatu', 'liputan6dotcom', 'SINDOnews']
        crime_text = ['begal', 'hacker', 'korupsi', 'tipu', 'koruptor',
                      'tewas', 'leceh', 'seksual', 'curi', 'tembak', 'curi',
                      'sabu', 'ganja', 'narkoba']
        # Collect tweets from each news portal that mention a crime keyword.
        with open('tweets.txt', 'w') as text_file:
            for i in portal:
                stuff = api.user_timeline(screen_name=i, count=10000,
                                          include_rts=True)
                for j in stuff:
                    word = j.text.lower()
                    if any(ext in word for ext in crime_text):
                        text_file.write(j.text)
                        text_file.write("\n")
    elif usrinp == 2:
        # wc_text_file = open('tweets.txt').readlines()
        wc_text_file = open('crime_news.txt').readlines()
        factory_stem = StemmerFactory()
        stemmer = factory_stem.create_stemmer()
        factory_stop = StopWordRemoverFactory()
        stopword = factory_stop.create_stop_word_remover()
        # Stem each line, then remove stopwords.
        for i in wc_text_file:
            output_stem = stemmer.stem(i)
            output_stop = stopword.remove(output_stem)
            print(output_stop)
def process(txtString):
    factory = StopWordRemoverFactory()
    stopwords = factory.create_stop_word_remover()
    # Drop digits, keep alphanumerics only, and lowercase before removal.
    remove_digits = str.maketrans('', '', digits)
    doc = txtString.translate(remove_digits)
    doc = re.sub('[^A-Za-z0-9]+', ' ', doc)
    doc = doc.lower()
    return stopwords.remove(doc)
def Stopword_removal(sentence):
    stopword_factory = StopWordRemoverFactory()
    stopwords = set(stopword_factory.get_stop_words())
    # Keep only the words that are not in the stopword list (joining with
    # ' '.join also avoids the leading space the original produced).
    words = sentence.split()
    return ' '.join(word for word in words if word not in stopwords)
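# Example call (the sentence is illustrative; the exact output depends on
# the bundled stopword list):
# >>> Stopword_removal('saya pergi ke pasar')
# 'pergi pasar'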
def debug():
    if 'admin' not in session:
        return redirect(url_for("index"))
    # Renamed from `csv` so the dataframe does not shadow the csv module.
    df = pd.read_excel("dataset.xlsx")
    factory = StopWordRemoverFactory()
    stopword = factory.create_stop_word_remover()
    print(stopword.remove("Kepala otak kau lek, medan aman atau berkah?"))
    return "sarjana kombur"
def __init__(self, min_cut=0.1, max_cut=0.9):
    """
    Initialize the text summarizer.
    Words that have a term frequency lower than min_cut
    or higher than max_cut will be ignored.
    """
    factory = StopWordRemoverFactory()
    self._min_cut = min_cut
    self._max_cut = max_cut
    # Stopwords plus punctuation characters are excluded from counting.
    self._stopwords = set(factory.get_stop_words() + list(punctuation))
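# A hedged sketch of how such a stopword set is typically consumed when
# scoring words for summarization (the method name and logic below are
# assumed, not taken from the original class):
def _word_frequencies(self, text):
    freq = {}
    for word in text.lower().split():
        if word not in self._stopwords:  # skip stopwords and punctuation
            freq[word] = freq.get(word, 0) + 1
    return freq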