# Requires: pip install Sastrawi
import csv
import re

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory


def clean_data(self, path):
    """Clean a CSV of tweets and write the normalized lines to ujaranbaru.txt."""
    # Build the (expensive) Sastrawi objects once, outside the loop.
    stopword = StopWordRemoverFactory().create_stop_word_remover()
    stemmer = StemmerFactory().create_stemmer()

    with open(str(path), 'r') as read_file, open('ujaranbaru.txt', 'w+') as out:
        lines = list(csv.reader(read_file))
        for i, row in enumerate(lines):
            text = ''.join(row).casefold()
            text = re.sub(r'@[^\s]+', '', text)                    # remove @mentions
            text = re.sub(r'#[^\s]+', '', text)                    # remove hashtags
            text = ''.join(ch for ch in text if not ch.isdigit())  # remove digits
            text = re.sub(r"[(),'.!$]", '', text)                  # remove selected punctuation
            text = re.sub(r'https[^\s]+', '', text)                # remove links
            text = re.sub(r'\\[^\s]+', '', text)                   # remove backslash escape sequences
            text = re.sub(r'\brt\b', '', text)                     # remove "rt" as a whole word only,
                                                                   # not inside words like "kartu"
            text = text.lstrip()
            text = stopword.remove(text)
            stemmed = stemmer.stem(text)
            if len(stemmed.split()) >= 5:                          # keep lines with at least 5 tokens
                out.write(stemmed + '\n')
            self.progressBar_6.setValue(int((i + 1) / len(lines) * 100))
import json
import time

from flask import jsonify, request
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory


def search():
    if request.method == 'GET':
        with open('data/testing_data.json', encoding="utf-8") as f:
            data = json.load(f)
        algo = request.args.get('algo', '1')
        query1 = request.args.get('q1', '')
        query2 = request.args.get('q2', '')
        query3 = request.args.get('q3', '')
        max_response = int(request.args.get("max_resp", 10))  # query args arrive as strings
        start = time.time()
        print("timer start")
        queries = query1 + " " + query2 + " " + query3

        # Remove stopwords. The original discarded the return value, which made
        # the call a no-op; the result must be assigned back.
        factory = StopWordRemoverFactory()
        stopword = factory.create_stop_word_remover()
        queries = stopword.remove(queries)

        # Stemming
        factory = StemmerFactory()
        stemmer = factory.create_stemmer()
        queries = stemmer.stem(queries)

        # tfidf() and lsa() are defined elsewhere in the project.
        if algo == '1':
            response = jsonify(tfidf(data, queries, max_response))
        else:
            response = jsonify(lsa(data, queries, max_response))

        end = time.time()
        print("timer stop. Runtime: ")
        print(end - start)
        return response
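# A minimal usage sketch, assuming search() is registered on a Flask app as a
# /search route; the app object, route name, and query values below are
# hypothetical, and tfidf()/lsa() must exist for the call to succeed:
#
#   from flask import Flask
#   app = Flask(__name__)
#   app.add_url_rule('/search', view_func=search)
#
#   with app.test_client() as client:
#       resp = client.get('/search?q1=berita&q2=teknologi&algo=1&max_resp=5')
#       print(resp.get_json())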
import pickle

import pandas as pd
from flask import render_template, request
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory


def predict():
    '''For rendering results on the HTML GUI.'''
    model = pickle.load(open('model_news.pkl', 'rb'))
    cv_ = pickle.load(open('news_cv.pkl', 'rb'))
    enc_ = pickle.load(open('enc_news.pkl', 'rb'))

    y_for_test = pd.Series(request.form['news_'])

    stopword = StopWordRemoverFactory().create_stop_word_remover()
    stemmer = StemmerFactory().create_stemmer()

    xy = []
    for i in y_for_test.values:
        stops_ = stopword.remove(stemmer.stem(i))
        # Keep only purely alphabetic tokens.
        wordy_ = ''
        for st in stops_.split(' '):
            if st.isalpha():
                wordy_ += st + ' '
        xy.append(wordy_)

    x_t = cv_.transform(xy)
    resu = model.predict(x_t)
    print('prediction:')
    s = [str(i) for i in list(enc_.inverse_transform(resu))]
    res = ", ".join(s)
    return render_template(
        'index.html',
        # "Topiknya adalah {}. Ya kan?" = "The topic is {}. Right?"
        prediction_text='Topiknya adalah {}. Ya kan?'.format(res))
import csv

import nltk
import pandas as pd
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory


def run(self):
    df = pd.read_csv('data.csv', sep=",")
    data = list(df["indonesia"].astype(str).str.lower())

    # Build the Sastrawi objects once instead of once per row.
    stopword = StopWordRemoverFactory().create_stop_word_remover()
    stemmer = StemmerFactory().create_stemmer()

    kd = []
    for i, d in enumerate(data, start=1):
        # Stopword removal, then tokenization.
        stop = nltk.tokenize.word_tokenize(stopword.remove(str(d)))
        # Stem the joined tokens; stemming str(list) would include the
        # brackets and quotes of the list's repr.
        katadasar = stemmer.stem(' '.join(stop))
        kd.append(katadasar)
        self.update_progressbar.emit(i)

    with open('post-preprocessing.csv', 'w', newline='', encoding='utf-8-sig') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["teks"])
        for row in kd:
            writer.writerow([row])
import re

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory


def preprocessing_text(text):
    text = text.encode("ascii", "ignore").decode()        # drop non-ASCII characters
    text = re.sub(r'http\S+', '', text)                   # remove URLs (before digit removal,
                                                          # so URLs vanish whole)
    text = text.lower()                                   # lowercase
    text = ''.join(i for i in text if not i.isdigit())    # remove numbers
    factory = StopWordRemoverFactory()
    stopword = factory.create_stop_word_remover()
    text = stopword.remove(text)
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    text = stemmer.stem(text)
    text = re.sub(r"[^\w\s]", '', text)                   # remove punctuation
    text = re.sub(r'[/(){}\[\]\|@,;#_]', '', text)        # also strip '_' (kept by \w above)
    return text
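# A quick sanity check of preprocessing_text. The expected output is indicative
# only; exact stems and stopwords depend on the installed Sastrawi dictionary:

print(preprocessing_text("Saya membeli 2 buku di https://tokobuku.id!"))
# -> e.g. "beli buku"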
from flask import render_template, request
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory


def blogging():
    # Insert into the Blog DB. Blog and BlogRepository are project classes.
    _title = request.form["title"]
    _content = request.form["content"]
    _link = request.form["link"]
    _date = request.form["date"]

    # Normalizing: escape/strip single quotes before storage.
    _title = _title.replace("'", "\"")
    _content2 = _content.replace("'", "")
    _content = _content.replace("'", "")

    # *** PRE-PROCESSING *** (applied only to the indexed copy, _content2)
    # Stopword removal
    factory = StopWordRemoverFactory()
    stopword = factory.create_stop_word_remover()
    _content2 = stopword.remove(_content2)

    # Stemming
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    _content2 = stemmer.stem(_content2)

    BlogRepository().insert(Blog(_title, _content, _link, _content2, _date))
    # Insert to Tf DB
    return render_template('index.html')
import re

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory


def preprocessing_text(text):
    text = text.encode("ascii", "ignore").decode()        # drop non-ASCII characters
    text = re.sub(r'http\S+', '', text)                   # remove URLs
    text = text.lower()                                   # lowercase
    text = ''.join(i for i in text if not i.isdigit())    # remove numbers

    # Strip punctuation in one pass instead of rescanning the string per character.
    punctuations = '''!()-[]{};:'"\\,<>./?@#$%^&*“”‘’_~+=|\t\n'''
    text = text.translate(str.maketrans('', '', punctuations))

    stopword = StopWordRemoverFactory().create_stop_word_remover()
    text = stopword.remove(text)
    stemmer = StemmerFactory().create_stemmer()
    text = stemmer.stem(text)
    return text
import re

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory


def word_features(self, words):
    # Strip HTML-ish tags (non-greedy, so "<a> x <b>" doesn't collapse into one
    # match) and non-word characters.
    words = re.sub(r'(<.+?>)|\W', ' ', words)
    stopword = StopWordRemoverFactory().create_stop_word_remover()
    stemmer = StemmerFactory().create_stemmer()
    stop = stopword.remove(words)
    words = stemmer.stem(stop)
    # NLTK-style feature dict: each remaining token maps to True.
    return {word.lower(): True for word in words.split()}
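# A minimal usage sketch; word_features never touches self, so None stands in
# for the (hypothetical) classifier instance it belongs to. The expected dict
# is indicative only and depends on the Sastrawi dictionary:

feats = word_features(None, "<p>saya suka membaca buku</p>")
print(feats)
# -> e.g. {'suka': True, 'baca': True, 'buku': True}, ready for nltk.NaiveBayesClassifier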
import re

from tqdm import tqdm
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory


def cleanTweets(Tweets):
    factory = StopWordRemoverFactory()
    stopwords = set(factory.get_stop_words() + ['twitter', 'rt', 'pic', 'com', 'yg', 'ga', 'https'])
    stemmer = StemmerFactory().create_stemmer()
    for i, tweet in enumerate(tqdm(Tweets)):
        txt = tweet['fullTxt']
        # If you want to ignore retweets: if not re.match(r'^RT.*', txt): ...
        txt = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', ' ', txt)  # clean URLs
        txt = txt.lower()  # lowercase
        # Tokenizer and strip_non_ascii are helpers defined elsewhere in this project.
        txt = Tokenizer.tokenize(txt)
        symbols = set(['@'])  # add more if you want
        txt = [strip_non_ascii(t, symbols) for t in txt]  # remove all non-ASCII characters
        txt = ' '.join(t for t in txt if len(t) > 1)
        Tweets[i]['cleanTxt'] = txt  # mutating the input in place is not good Python practice; kept for learning
        txt = stemmer.stem(txt).split()
        Tweets[i]['nlp'] = ' '.join(t for t in txt if t not in stopwords)
    return Tweets
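# A minimal call sketch with hypothetical stand-ins for the two project helpers
# (Tokenizer and strip_non_ascii) that cleanTweets assumes exist; these are
# illustrative substitutes, not the project's real implementations:

from nltk.tokenize import TweetTokenizer

Tokenizer = TweetTokenizer()

def strip_non_ascii(t, symbols):
    # Keep ASCII characters that are not in the excluded symbol set.
    return ''.join(ch for ch in t if ord(ch) < 128 and ch not in symbols)

tweets = [{'fullTxt': 'RT @user Saya suka https://t.co/abc sekali!'}]
print(cleanTweets(tweets)[0]['nlp'])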
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory


def process(text):
    # Normalizing: strip single quotes.
    _query = text.replace("'", "")

    # *** PRE-PROCESSING ***
    # Stopword removal
    factory = StopWordRemoverFactory()
    stopword = factory.create_stop_word_remover()
    _query = stopword.remove(_query)

    # Stemming
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    _query = stemmer.stem(_query)

    return _query
from flask import render_template, request
from textblob import TextBlob as tb
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory


def blogging():
    # Insert into the Blog DB. Faq, FAQRepository, Keyword, and
    # KeywordRepository are project classes.
    _question = request.form["question"]
    _answer = request.form["answer"]
    faq = FAQRepository().insert(Faq(0, _question, _answer))

    # Normalizing
    _question = _question.replace("'", "")
    _answer = _answer.replace("'", "")

    # *** PRE-PROCESSING ***
    # Stopword removal
    factory = StopWordRemoverFactory()
    stopword = factory.create_stop_word_remover()
    _question = stopword.remove(_question)
    _answer = stopword.remove(_answer)

    # Stemming
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    _question = stemmer.stem(_question)
    _answer = stemmer.stem(_answer)

    # Get all unique words from the question.
    blob = tb(_question)
    uniqWord = list(set(blob.words))

    # Count the total occurrences of the unique words.
    sumOfWord = 0
    for word in uniqWord:
        sumOfWord += blob.words.count(word)

    # Average occurrences per unique word. The original divided by len(blob),
    # which is the character length of the text, not the word count.
    average = sumOfWord / len(uniqWord)

    # Store words occurring more often than average as keywords.
    for word in uniqWord:
        n = blob.words.count(word)
        if n > average:
            KeywordRepository().insert(Keyword(faq.id_faq, word, n))

    return render_template('faq.html')
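# A small worked example of the over-average keyword rule, with the DB calls
# omitted; the sample sentence is hypothetical:

from textblob import TextBlob as tb

blob = tb("buku baca buku tulis buku")
uniq = list(set(blob.words))                            # ['buku', 'baca', 'tulis']
counts = {w: blob.words.count(w) for w in uniq}         # {'buku': 3, 'baca': 1, 'tulis': 1}
average = sum(counts.values()) / len(uniq)              # 5 / 3 ≈ 1.67
keywords = [w for w, n in counts.items() if n > average]  # ['buku']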
# import StemmerFactory class
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
# import StopWordRemoverFactory class
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
import re
import csv

factory = StopWordRemoverFactory()
stopword = factory.create_stop_word_remover()
factory = StemmerFactory()
stemmer = factory.create_stemmer()

TRAIN_PATH = 'c:/xampp/htdocs/efasonline/python/training/training_teknologi.csv'
OUT_PATH = 'c:/xampp/htdocs/efasonline/python/training/training_preprocess_teknologi.csv'

with open(TRAIN_PATH) as f:
    num_lines = sum(1 for line in f)
# Preallocated 5-column buffer (not used in the visible part of the script).
file = [[0 for x in range(5)] for y in range(num_lines)]

# Output file
savemyFile = open(OUT_PATH, 'w', newline='')
writer = csv.writer(savemyFile, delimiter=';', lineterminator='\r\n', quoting=csv.QUOTE_ALL)

with open(TRAIN_PATH, newline='') as myFile:
    reader = csv.reader(myFile, delimiter=';', quoting=csv.QUOTE_ALL)
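    # The original fragment ends here. What follows is an assumed completion,
    # not from the source: preprocess the text column of each row and write it
    # back out. The text column index (0) is a guess.
    for row in reader:
        cleaned = stemmer.stem(stopword.remove(row[0].lower()))
        writer.writerow([cleaned] + row[1:])
savemyFile.close()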
import copy
from math import log
from operator import itemgetter

import nltk
import pandas as pd
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory


def run(self):
    df = pd.read_csv('data.csv', sep=",")
    dataAfter = pd.read_csv('post-preprocessing.csv', sep="|")
    data = self.query.split(" ")

    # Reduce each query term to its base form ("kata dasar").
    stopword = StopWordRemoverFactory().create_stop_word_remover()
    stemmer = StemmerFactory().create_stemmer()
    querykd = []
    for d in data:
        stop = nltk.tokenize.word_tokenize(stopword.remove(str(d.lower())))
        # Stem the joined tokens; stemming str(list) would include the
        # brackets and quotes of the list's repr.
        katadasar = stemmer.stem(' '.join(stop))
        querykd.append(katadasar)

    # TF and DF per query term.
    termFrequency = []
    dokumenfrequency = []
    for term in querykd:
        countDokumen = []
        countDokumenFrequency = 0
        for dokumen in dataAfter['teks']:
            count = sum(1 for kata in dokumen.split(' ') if kata == term)
            countDokumen.append(count)
            if count > 0:
                countDokumenFrequency += 1
        termFrequency.append(countDokumen)
        dokumenfrequency.append(countDokumenFrequency)

    # IDF with +1 smoothing in the denominator.
    jumlahDokumen = len(df)
    idfSatu = [log(jumlahDokumen / (dfreq + 1)) for dfreq in dokumenfrequency]

    # TF-IDF weighting (could be swapped for cosine similarity).
    # Deep copy: the original used termFrequency[:], a shallow copy whose inner
    # lists are shared, so writing weight[i][j] also clobbered termFrequency.
    weight = copy.deepcopy(termFrequency)
    for i in range(len(weight)):
        for j in range(len(weight[i])):
            weight[i][j] = termFrequency[i][j] * idfSatu[i]

    # Total weight per document.
    jumlahWeight = [[j, 0] for j in range(len(weight[0]))]
    for i in range(len(weight)):
        for j in range(len(weight[i])):
            jumlahWeight[j][1] += weight[i][j]

    # Sort by weight, highest (= most relevant) first, and print the top 20
    # (guarded in case there are fewer than 20 documents).
    JumlahWeight = sorted(jumlahWeight, key=itemgetter(1), reverse=True)
    for i in range(min(20, len(JumlahWeight))):
        print(str(i + 1) + '. ' + str(df['judul'][JumlahWeight[i][0]]) + " | " + str(JumlahWeight[i][1]))
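# A tiny numeric check of the smoothed IDF used above: with 100 documents and a
# term appearing in 9 of them, log(100 / (9 + 1)) = log(10), using the natural log.

from math import log

print(log(100 / (9 + 1)))   # 2.302585...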
from flask import render_template, request
from scipy import spatial
from textblob import TextBlob as tb
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory


def cosine():
    rank = []
    score = []
    query = request.form["query"]

    # *** PRE-PROCESSING ***
    # Stopword removal
    factory = StopWordRemoverFactory()
    stopword = factory.create_stop_word_remover()
    _query = stopword.remove(query)

    # Stemming
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    _query = stemmer.stem(_query)

    blob = tb(_query)
    uniqWord = list(set(blob.words))

    # Get the titles of blogs whose content contains a query word.
    listTitleBlog = []
    for word in uniqWord:
        try:
            blogList = BlogRepository().getByWord(word)
            for t in blogList:
                listTitleBlog.append(t["title"])
        except Exception:
            print(word, "not available")

    if len(listTitleBlog) == 0:
        return render_template("result.html", rank=rank, query=query)

    listTitleBlog = list(set(listTitleBlog))  # unique blog titles
    blogAll = [BlogRepository().getByTitle(l) for l in listTitleBlog]

    # *** COSINE SIMILARITY ***
    for blog in blogAll:
        # Vocabulary of the query and the document combined. The original
        # concatenated them without a separator, which could fuse two words.
        combined = _query + " " + blog["tf"]
        blob = tb(combined)
        uniqWord = list(set(blob.words))

        # Term-count vectors over the shared vocabulary.
        bQuery = tb(_query)
        bBlog = tb(blog["tf"])
        cQuery = [bQuery.words.count(word) for word in uniqWord]
        cBlog = [bBlog.words.count(word) for word in uniqWord]

        # scipy returns cosine *distance*; 1 - distance is the similarity.
        result = 1 - spatial.distance.cosine(cQuery, cBlog)
        score.append(result)

    # Rank blogs by descending similarity.
    while len(score) > 0:
        bestIndex = score.index(max(score))
        rank.append(blogAll[bestIndex])
        del score[bestIndex]
        del blogAll[bestIndex]

    return render_template("result.html", rank=rank, query=query)
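# A standalone check of the similarity step, independent of the blog data; the
# two count vectors below are made up for illustration:

from scipy import spatial

q = [1, 1, 0]   # query term counts
d = [2, 1, 1]   # document term counts
print(1 - spatial.distance.cosine(q, d))   # ≈ 0.866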
from flask import render_template, request
from textblob import TextBlob as tb
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory


def result_blog():
    score = []
    rank = []
    query = request.form["query"]

    # *** PRE-PROCESSING ***
    # Stopword removal
    factory = StopWordRemoverFactory()
    stopword = factory.create_stop_word_remover()
    _query = stopword.remove(query)

    # Stemming
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    _query = stemmer.stem(_query)

    # TF of the query terms. Tf is a project class.
    listTf = []
    blob = tb(_query)
    uniqWord = list(set(blob.words))
    for word in uniqWord:
        _n = blob.words.count(word)
        listTf.append(Tf("query", word, _n))

    # *** SCORING ***
    # Get the titles of blogs whose content contains a query word.
    listTitleBlog = []
    for word in uniqWord:
        try:
            blogList = BlogRepository().getByWord(word)
            for t in blogList:
                listTitleBlog.append(t["title"])
        except Exception:
            print(word, "not available")

    if len(listTitleBlog) == 0:
        return render_template("result.html", rank=rank, query=query)

    listTitleBlog = list(set(listTitleBlog))  # unique blog titles
    listBlog = [BlogRepository().getByTitle(l) for l in listTitleBlog]

    # IDF over the matched blogs; idf() is a project helper.
    blobList = [tb(blog["content"]) for blog in listBlog]
    idfList = [idf(word, blobList) for word in uniqWord]

    # Score each blog: sum of idf * term count over the query words.
    for title in listTitleBlog:
        result = 0
        for i, word in enumerate(uniqWord):
            try:
                blogData = BlogRepository().getByTitle(title)
                blob = tb(blogData["tf"])
                _n = blob.words.count(word)
                result = result + (idfList[i] * _n)
            except Exception:
                print(word, "not available")
        score.append(result)

    # Rank blogs by descending score.
    while len(score) > 0:
        bestIndex = score.index(max(score))
        rank.append(listBlog[bestIndex])
        del score[bestIndex]
        del listBlog[bestIndex]

    return render_template("result.html", rank=rank, query=query)
import re
import string
import time

import nltk
import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
# label (sentistrength), MultinomialNaiveBayes, and Tokenizer are project modules.


def scrappingData(request, data, jmlDataScrapping):
    search = data
    jumlahData = int(jmlDataScrapping)
    print(data)
    nltk.download('punkt')
    nltk.download('stopwords')

    RANDOM_SEED = 42
    np.random.seed(RANDOM_SEED)

    chrome_path = r"C:\Users\Rifqi Rosidin\Documents\za\chromedriver_win32\chromedriver.exe"
    driver = webdriver.Chrome(chrome_path)
    driver.get('https://play.google.com/store/search?q=' + search + '&c=apps' + '&hl=in')

    # Open the first search result, then its reviews page.
    tes = driver.find_element_by_xpath(
        "//*[@id='fcxH9b']/div[4]/c-wiz/div/div[2]/div/c-wiz/c-wiz[1]/c-wiz/div/div[2]/div[1]/c-wiz/div/div/div[1]/div/div/a")
    tes.click()
    time.sleep(5)
    tes1 = driver.find_element_by_xpath(
        "//*[@id='fcxH9b']/div[4]/c-wiz[2]/div/div[2]/div/div[1]/div/div/div[1]/div[6]/div/span/span")
    tes1.click()
    time.sleep(4)

    # Scroll to load more reviews; the "show more" click on every 5th pass
    # never triggers while i < 5, kept as in the original.
    count = 1
    i = 1
    while i < 5:
        try:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)
            if (i % 5) == 0:
                driver.execute_script('window.scrollTo(1, 2000);')
                time.sleep(2)
                tes2 = driver.find_element_by_xpath(
                    "//*[@id='fcxH9b']/div[4]/c-wiz[3]/div/div[2]/div/div[1]/div/div/div[1]/div[2]/div[2]/div/span/span")
                tes2.click()
            print("scroll #" + str(count))
        except Exception:
            print("skipping scroll")
        i += 1
        count += 1
    print('done scrolling')

    # Collect review texts until jumlahData reviews (or as many errors) are seen.
    a = 'test1'   # sentinel; the loop stops when it becomes 'test'
    b = 1
    c = []
    d = 0
    errorNumber = 0
    driver.execute_script('window.scrollTo(1, 10);')
    while a != 'test':
        d = 2
        try:
            # Expand the "full review" button if present.
            tes3 = driver.find_element_by_xpath(
                "//*[@id='fcxH9b']/div[4]/c-wiz[3]/div/div[2]/div/div[1]/div/div/div[1]/div[2]/div[1]/div["
                + str(b) + "]/div/div[2]/div[2]/span[1]/div/button")
            tes3.click()
        except NoSuchElementException:
            d = 1
        try:
            tes4 = driver.find_element_by_xpath(
                "/html/body/div[1]/div[4]/c-wiz[3]/div/div[2]/div/div[1]/div/div/div[1]/div[2]/div/div["
                + str(b) + "]/div/div[2]/div[2]/span[" + str(d) + "]")
            print("review #" + str(b))
            c.append(tes4.text)
            if int(b) >= jumlahData:
                a = 'test'
            b += 1
            errorNumber += 1
        except Exception:
            print(jumlahData)
            errorNumber += 1
            if int(errorNumber) >= jumlahData:
                a = 'test'
            b += 1
    # --- end of the scraping stage ---
    print(len(c))

    # Drop rows that are empty once the literal word 'emoji' is removed.
    data = pd.DataFrame({"ulasan": c})
    ulasan = []
    for x, i in enumerate(data['ulasan']):
        if i.replace('emoji', '').isspace():
            ulasan.append(x)
    komentar = data.drop(ulasan)
    print("-------- emoji-only rows removed --------")
    print(komentar)

    # Case folding, digit removal, whitespace and punctuation stripping.
    case = []
    for i in komentar['ulasan']:
        lowered = str(i).lower()                    # lowercase
        no_digits = re.sub(r'[0-9]+', '', lowered)  # remove digits
        stripped = no_digits.strip()                # strip whitespace
        cleaned = stripped.translate(str.maketrans("", "", string.punctuation))  # remove punctuation
        case.append(cleaned)
    komentar['ulasan'] = case
    print("\n-------- case folding, digits, whitespace --------")
    print(komentar)

    # Tokenization: split the text into tokens.
    token = []
    for i in komentar['ulasan']:
        token.append(nltk.tokenize.word_tokenize(str(i)))

    def listToString(s):
        # Concatenate the string form of every element.
        str1 = ""
        for i in s:
            str1 += str(i)
        return str1

    kata = listToString(token)
    print("-------- tokenization --------")
    print(kata)

    # Stopword removal. Build the stopword list once, extended with informal
    # abbreviations; the original overwrote `word` with that list inside the loop.
    stop_words = StopWordRemoverFactory().get_stop_words() + ['yg', 'tp']
    stopword = StopWordRemoverFactory().create_stop_word_remover()
    word = []
    for i in komentar['ulasan']:
        stop = stopword.remove(str(i))
        tokens = [t for t in nltk.tokenize.word_tokenize(stop) if t not in stop_words]
        word.append(tokens)

    # Reduce each word to its base form.
    stemmer = StemmerFactory().create_stemmer()
    Hasil = []
    for i in komentar['ulasan']:
        Hasil.append(stemmer.stem(str(i)))
    kata = listToString(Hasil)

    print("------ LABELING STAGE ------")
    config = {
        "negation": True, "booster": True, "ungkapan": True,
        "consecutive": True, "repeated": True, "emoticon": True,
        "question": True, "exclamation": True, "punctuation": True,
    }
    senti = label.sentistrength(config)
    print(len(Hasil))
    dt = pd.DataFrame({"ulasan": Hasil})
    sentim = []
    for i in dt['ulasan']:
        x = senti.main(i)
        sentim.append(x['kelas'])
    dt['label'] = sentim

    # Train and evaluate the project's Multinomial Naive Bayes classifier.
    X = komentar['ulasan'].values
    y = dt['label'].values
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=RANDOM_SEED)
    MNB = MultinomialNaiveBayes(classes=np.unique(y), tokenizer=Tokenizer()).fit(X_train, y_train)

    y_hat = MNB.predict(X_test)
    akurasi = accuracy_score(y_test, y_hat)
    print("accuracy")
    print(akurasi)
    dt['akurasi'] = akurasi
    response = dt.to_dict()
    return response