def clean_data(self, path):
    path = str(path)
    # Build the Sastrawi stopword remover and stemmer once, not once per row
    stop_factory = StopWordRemoverFactory()
    stopword = stop_factory.create_stop_word_remover()
    stem_factory = StemmerFactory()
    stemmer = stem_factory.create_stemmer()
    bye = open('ujaranbaru.txt', 'w+')
    with open(path, 'r') as readFile:
        reader = csv.reader(readFile)
        lines = list(reader)
        for i in range(len(lines)):
            str1 = ''.join(lines[i])
            caseF = str1.casefold()  # lowercase
            Runame = re.sub(r'@[^\s]+', '', caseF)  # remove @mentions
            Rhashtag = re.sub(r'#[^\s]+', '', Runame)  # remove hashtags
            CleanNumber = ''.join([c for c in Rhashtag if not c.isdigit()])  # remove digits
            line = re.sub(r"[(),'.!$]", '', CleanNumber)  # remove punctuation
            link = re.sub(r'https[^\s]+', '', line)  # remove URLs
            garing = re.sub(r'\\[^\s]+', '', link)  # remove backslash escapes
            removeRT = re.sub(r'\brt\b', '', garing)  # remove standalone "rt" markers
            removespace = removeRT.lstrip()
            stopw = stopword.remove(removespace)  # remove Indonesian stopwords
            steam = stemmer.stem(stopw)  # stem to base form
            text = steam.split()
            if len(text) >= 5:
                bye.write(steam + '\n')
            self.progressBar_6.setValue(int((i + 1) / len(lines) * 100))
    bye.close()
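
Creating a Sastrawi stemmer loads its root-word dictionary, so building the remover and stemmer once and reusing them (as above) is noticeably cheaper than re-creating them per record. A minimal module-level sketch of that pattern, with illustrative names:

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

# Built once at import time and shared by every call
_STOPWORD_REMOVER = StopWordRemoverFactory().create_stop_word_remover()
_STEMMER = StemmerFactory().create_stemmer()

def normalize(text):
    """Lowercase an Indonesian string, drop stopwords, and stem it."""
    return _STEMMER.stem(_STOPWORD_REMOVER.remove(text.lower()))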
Example #2
def search():
    if request.method == 'GET':
        data = json.load(open('data/testing_data.json', encoding="utf-8"))
        algo = request.args.get('algo', '1')
        query1 = request.args.get('q1', '')
        query2 = request.args.get('q2', '')
        query3 = request.args.get('q3', '')
        max_response = int(request.args.get("max_resp", 10))
        start = time.time()
        print("timer start")

        queries = query1 + " " + query2 + " " + query3

        # remove stopwords
        factory = StopWordRemoverFactory()
        stopword = factory.create_stop_word_remover()
        queries = stopword.remove(queries)

        # stemming
        factory = StemmerFactory()
        stemmer = factory.create_stemmer()
        queries = stemmer.stem(queries)

        if algo == '1':
            response = jsonify(tfidf(data, queries, max_response))
        else:
            response = jsonify(lsa(data, queries, max_response))
        end = time.time()
        print("timer stop. Runtime:", end - start)
        return response
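
A minimal sketch of how this endpoint might be exercised from a client. The route path /search, the host, and the port are assumptions; the snippet does not show how the view is registered.

import requests

params = {"algo": "1", "q1": "berita", "q2": "teknologi", "q3": "", "max_resp": 5}
resp = requests.get("http://localhost:5000/search", params=params)
print(resp.json())  # ranked results produced by tfidf() or lsa()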
Example #3
File: app.py  Project: deviyantiam/projects
def predict():
    '''
    For rendering results on HTML GUI
    '''
    # Load the trained classifier, CountVectorizer, and label encoder
    with open('model_news.pkl', 'rb') as f:
        model = pickle.load(f)
    with open('news_cv.pkl', 'rb') as f:
        cv_ = pickle.load(f)
    with open('enc_news.pkl', 'rb') as f:
        enc_ = pickle.load(f)
    y_for_test = request.form['news_']
    y_for_test = pd.Series(y_for_test)
    factory = StopWordRemoverFactory()
    stopword = factory.create_stop_word_remover()
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    xy = []
    for i in y_for_test.values:
        stops_ = stopword.remove(stemmer.stem(i))
        wordy_ = ''
        for st in stops_.split(' '):
            if st.isalpha():
                wordy_ += st + ' '
        xy.append(wordy_)
    x_t = cv_.transform(xy)
    resu = model.predict(x_t)
    print('prediction:')
    s = [str(i) for i in list(enc_.inverse_transform(resu))]
    res = ", ".join(s)
    return render_template(
        'index.html',
        prediction_text='Topiknya adalah {}. Ya kan?'.format(res))
Example #4
    def run(self):
        status = True
        while status:
            df = pd.read_csv('data.csv', sep=",")

            data = list(df["indonesia"].astype(str).str.lower())
            kd = []

            # Build the stopword remover and stemmer once, outside the loop
            factory = StopWordRemoverFactory()
            stopword = factory.create_stop_word_remover()
            factory = StemmerFactory()
            stemmer = factory.create_stemmer()

            for i, d in enumerate(data, start=1):
                # Remove stopwords, then tokenize
                stop = nltk.tokenize.word_tokenize(stopword.remove(str(d)))

                # Stem the joined tokens (stemming str(list) would include brackets and quotes)
                katadasar = stemmer.stem(' '.join(stop))

                kd.append(katadasar)
                self.update_progressbar.emit(i + 1)

            with open('post-preprocessing.csv', 'w', newline='', encoding='utf-8-sig') as csvfile:
                spamwriter = csv.writer(csvfile)
                spamwriter.writerow(["teks"])
                for d in kd:
                    spamwriter.writerow([d])

            status = False
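Example #5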
def preprocessing_text(text):

    encoded_string = text.encode("ascii", "ignore")  # drop non-ASCII characters
    text = encoded_string.decode()

    text = text.lower()  # lowercase

    text = ''.join([i for i in text if not i.isdigit()])  # remove numbers

    #text = ''.join([i for i in text if i not in string.punctuation])

    text = re.sub(r'http\S+', '', text)  # remove URLs

    factory = StopWordRemoverFactory()
    stopword = factory.create_stop_word_remover()
    text = stopword.remove(text)  # remove Indonesian stopwords

    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    text = stemmer.stem(text)  # stem to base form

    text = re.sub(r"[^\w\s]", '', text)  # remove punctuation
    text = re.sub(r'[/(){}\[\]\|@,;#_]', '', text)  # remove remaining symbols

    return text
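
A quick usage check; the sample sentence and the indicated output are illustrative only, since the exact stems and stopwords depend on the Sastrawi dictionary version.

sample = "Saya sedang membaca 2 artikel di https://example.com tentang teknologi!"
print(preprocessing_text(sample))
# roughly: "baca artikel teknologi"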
Example #6
def blogging():
    # Insert to Blog DB
    _title = request.form["title"]
    _content = request.form["content"]
    _link = request.form["link"]
    _date = request.form["date"]
    # Normalizing
    _title = _title.replace("'", "\"")
    _content2 = _content.replace("'", "")
    _content = _content.replace("'", "")

    # ***PRE-PROCESSING***
    # Stopword
    factory = StopWordRemoverFactory()
    stopword = factory.create_stop_word_remover()
    _content2 = stopword.remove(_content2)

    # Stemming
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    _content2 = stemmer.stem(_content2)

    BlogRepository().insert(Blog(_title, _content, _link, _content2, _date))

    # Insert to Tf DB
    return render_template('index.html')
Example #7
def preprocessing_text(text):

    encoded_string = text.encode("ascii", "ignore")  # drop non-ASCII characters
    text = encoded_string.decode()

    text = re.sub(r'http\S+', '', text)  # remove URLs

    text = text.lower()  # lowercase

    text = ''.join([i for i in text if not i.isdigit()])  # remove numbers

    #text = ''.join([i for i in text if i not in string.punctuation])
    #text = re.sub(r'[/(){}\[\]\|@,;#_]', '', text)  # remove punctuation

    punctuations = '''!()-[]{};:'"\,<>./?@#$%^&*“”‘’_~+=|\t\n'''

    # Strip every punctuation character listed above
    for char in punctuations:
        text = text.replace(char, "")

    factory = StopWordRemoverFactory()
    stopword = factory.create_stop_word_remover()
    text = stopword.remove(text)  # remove Indonesian stopwords

    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    text = stemmer.stem(text)  # stem to base form

    return text
Example #8
    def word_features(self, words):
        words = re.sub(r'(<.+>)|\W', ' ', words)
        factory = StopWordRemoverFactory()
        stopword = factory.create_stop_word_remover()
        factory = StemmerFactory()
        stemmer = factory.create_stemmer()

        stop = stopword.remove(words)
        words = stemmer.stem(stop)

        return dict([(word.lower(), True) for word in words.split()])
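
A minimal sketch of how a feature dictionary like this is typically consumed, assuming an NLTK NaiveBayesClassifier; `extractor` and the two labeled sentences are illustrative, not from the original project.

import nltk

# `extractor` is assumed to be an instance of the class that defines word_features
train_set = [(extractor.word_features(text), label)
             for text, label in [("produk ini sangat bagus", "pos"),
                                 ("layanan sangat buruk", "neg")]]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(classifier.classify(extractor.word_features("barang bagus sekali")))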
Example #9
def cleanTweets(Tweets):
    factory = StopWordRemoverFactory()
    stopwords = set(factory.get_stop_words() + ['twitter', 'rt', 'pic', 'com', 'yg', 'ga', 'https'])
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    for i, tweet in enumerate(tqdm(Tweets)):
        txt = tweet['fullTxt']  # if you want to ignore retweets ==> if not re.match(r'^RT.*', txt):
        txt = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', ' ', txt)  # clean URLs
        txt = txt.lower()  # lowercase
        txt = Tokenizer.tokenize(txt)
        symbols = set(['@'])  # add more if you want
        txt = [strip_non_ascii(t, symbols) for t in txt]  # remove all non-ASCII characters
        txt = ' '.join([t for t in txt if len(t) > 1])
        Tweets[i]['cleanTxt'] = txt  # this is not good Python practice, only for learning
        txt = stemmer.stem(txt).split()
        Tweets[i]['nlp'] = ' '.join([t for t in txt if t not in stopwords])
    return Tweets
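
This snippet relies on a Tokenizer object and a strip_non_ascii helper that are not shown. A plausible minimal version of the helper is sketched below; it assumes the extra symbols (here '@') should be stripped along with non-ASCII characters, which the original may or may not do.

def strip_non_ascii(token, symbols=frozenset()):
    """Drop non-ASCII characters and any extra symbols (e.g. '@') from a token."""
    return ''.join(ch for ch in token if ord(ch) < 128 and ch not in symbols)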
Example #10
    def process(text):
        # Normalizing
        _query = text.replace("'", "")

        # ***PRE-PROCESSING***
        # Stopword
        factory = StopWordRemoverFactory()
        stopword = factory.create_stop_word_remover()
        _query = stopword.remove(_query)

        # Stemming
        factory = StemmerFactory()
        stemmer = factory.create_stemmer()
        _query = stemmer.stem(_query)
        
        return _query
Example #11
def blogging():
    # Insert to Blog DB
    _question = request.form["question"]
    _answer = request.form["answer"]
    faq = FAQRepository().insert(Faq(0, _question, _answer))

    # Normalizing
    _question = _question.replace("'", "")
    _answer = _answer.replace("'", "")

    # ***PRE-PROCESSING***
    # Stopword
    factory = StopWordRemoverFactory()
    stopword = factory.create_stop_word_remover()
    _question = stopword.remove(_question)
    _answer = stopword.remove(_answer)

    # Stemming
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    _question = stemmer.stem(_question)
    _answer = stemmer.stem(_answer)

    # Get all unique word from question
    blob = tb(_question)
    uniqWord = list(set(blob.words))

    # Count all unique word in question
    sumOfWord = 0
    for word in uniqWord:
        _n = blob.words.count(word)
        sumOfWord += _n

    # Average frequency per unique word
    average = sumOfWord / len(uniqWord)

    # Get Over Average Word
    for word in uniqWord:
        n = blob.words.count(word)
        if (n > average):
            # Insert to Keyword DB
            KeywordRepository().insert(Keyword(faq.id_faq, word, n))

    return render_template('faq.html')
Example #12
# import StemmerFactory class
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
# import StopWordRemoverFactory class
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

import re, csv

factory = StopWordRemoverFactory()
stopword = factory.create_stop_word_remover()

factory = StemmerFactory()
stemmer = factory.create_stemmer()

num_lines = sum(1 for line in open(
    'c:/xampp/htdocs/efasonline/python/training/training_teknologi.csv'))
file = [[0 for x in range(5)] for y in range(num_lines)]

#save file
savemyFile = open(
    'c:/xampp/htdocs/efasonline/python/training/training_preprocess_teknologi.csv',
    'w',
    newline='')
# with savemyFile:
writer = csv.writer(savemyFile,
                    delimiter=';',
                    lineterminator='\r\n',
                    quoting=csv.QUOTE_ALL)

with open('c:/xampp/htdocs/efasonline/python/training/training_teknologi.csv',
          newline='') as myFile:
    reader = csv.reader(myFile, delimiter=';', quoting=csv.QUOTE_ALL)
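Example #13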
    def run(self):
        i = 1
        status = True
        while status:
            df = pd.read_csv('data.csv', sep=",")
            dataAfter = pd.read_csv('post-preprocessing.csv', sep="|")
            data = self.query.split(" ")

            # Reduce the query terms to their base (stemmed) forms
            factory = StopWordRemoverFactory()
            stopword = factory.create_stop_word_remover()
            factory = StemmerFactory()
            stemmer = factory.create_stemmer()

            querykd = []
            for d in data:
                # Remove stopwords, then tokenize
                stop = nltk.tokenize.word_tokenize(stopword.remove(str(d.lower())))

                # Stem the joined tokens (stemming str(list) would include brackets and quotes)
                katadasar = stemmer.stem(' '.join(stop))

                querykd.append(katadasar)
                # print(str(i)+ str(','), end='')
                # print(d)
                # i=i+1

            # TF - DF: term frequency per document and document frequency per term
            termFrequency = []
            dokumenfrequency = []
            for index, term in enumerate(querykd):
                countDokumen = []
                countDokumenFrequency = 0
                for dokumen in dataAfter['teks']:
                    count = 0
                    for kata in dokumen.split(' '):
                        if kata == term:
                            count += 1
                    countDokumen.append(count)

                    if count > 0:
                        countDokumenFrequency += 1
                termFrequency.append(countDokumen)
                dokumenfrequency.append(countDokumenFrequency)

            # IDF with +1 smoothing on the document frequency
            idfSatu = []
            jumlahDokumen = len(df)
            for i in dokumenfrequency:
                idfSatu.append(log(jumlahDokumen / (i + 1)))

            # TF-IDF weighting (cosine similarity could be used instead)
            # Copy row by row so termFrequency is not mutated through aliasing
            weight = [row[:] for row in termFrequency]
            for i in range(len(weight)):
                for j in range(len(weight[i])):
                    weight[i][j] = termFrequency[i][j] * idfSatu[i]

            # Sum the weights per document
            jumlahWeight = []
            for i in range(len(weight[0])):
                jumlahWeight.append([i, 0])

            for i in range(len(weight)):
                for j in range(len(weight[i])):
                    jumlahWeight[j][1] += weight[i][j]

            # Sort by total weight; the highest weight is the most relevant document
            JumlahWeight = sorted(jumlahWeight, key=itemgetter(1), reverse=True)

            # Print the top 20 results (or fewer, if there are not that many documents)
            for i in range(min(20, len(JumlahWeight))):
                print(str(i + 1) + str('. ') + str(df['judul'][JumlahWeight[i][0]]) + " | " + str(JumlahWeight[i][1]))

            status = False
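
For comparison, a compact sketch of the same ranking idea using scikit-learn's TfidfVectorizer and cosine similarity. This is an alternative formulation, not the project's code; the column names 'teks' and 'judul' come from the snippet above, and the query string is a placeholder.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

df = pd.read_csv('data.csv', sep=",")
docs = pd.read_csv('post-preprocessing.csv', sep="|")['teks'].astype(str)

vectorizer = TfidfVectorizer()
doc_matrix = vectorizer.fit_transform(docs)          # documents x terms
query_vec = vectorizer.transform(["contoh kueri"])   # an already-preprocessed query string

scores = cosine_similarity(query_vec, doc_matrix).ravel()
for rank, idx in enumerate(scores.argsort()[::-1][:20], start=1):
    print(rank, df['judul'][idx], "|", scores[idx])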
Example #14
def cosine():
    rank = []
    score = []
    query = request.form["query"]
    # ***PRE PROCESSING***
    # Stopword
    factory = StopWordRemoverFactory()
    stopword = factory.create_stop_word_remover()
    _query = stopword.remove(query)

    # Stemming
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    _query = stemmer.stem(_query)

    # TF Query
    listTf = []
    blob = tb(_query)
    uniqWord = list(set(blob.words))
    # Desc: get every blog title whose content contains a query word
    listTitleBlog = []
    for word in uniqWord:
        try:
            blogList = BlogRepository().getByWord(word)
            for t in blogList:
                listTitleBlog.append(t["title"])
        except:
            print(word, "not available")

    if (len(listTitleBlog) == 0):
        return render_template("result.html", rank=rank, query=query)

    listTitleBlog = list(set(listTitleBlog))  #Unique Blog Title

    blogAll = []
    for l in listTitleBlog:
        blogAll.append(BlogRepository().getByTitle(l))

    # *** COSINE SIMILARITY ***
    # Scoring
    for blog in blogAll:
        # Build the shared vocabulary of the query and the article
        combined = _query + " " + blog["tf"]
        blob = tb(combined)
        uniqWord = list(set(blob.words))
        # Count how often each vocabulary word occurs in the query and in the article
        bQuery = tb(_query)
        bBlog = tb(blog["tf"])
        cQuery = []
        cBlog = []
        for word in uniqWord:
            _nQ = bQuery.words.count(word)
            cQuery.append(_nQ)
            _nB = bBlog.words.count(word)
            cBlog.append(_nB)
        # print(blog["title"])
        # print(cQuery)
        # print(cBlog)
        result = 1 - spatial.distance.cosine(cQuery, cBlog)
        score.append(result)
    lenScore = len(score)
    while (lenScore > 0):
        bestIndex = score.index(max(score))
        rank.append(blogAll[bestIndex])
        del score[bestIndex]
        del blogAll[bestIndex]
        lenScore = len(score)

    return render_template("result.html", rank=rank, query=query)
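
As a quick sanity check on the 1 - spatial.distance.cosine(...) convention used above, with illustrative count vectors:

from scipy import spatial

# Two small count vectors over a shared vocabulary
query_counts = [1, 2, 0, 1]
doc_counts = [1, 1, 1, 0]

similarity = 1 - spatial.distance.cosine(query_counts, doc_counts)
print(round(similarity, 3))  # ~0.707; 1.0 = same direction, 0.0 = orthogonal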
Example #15
def result_blog():
    score = []
    rank = []

    query = request.form["query"]
    # ***PRE PROCESSING***
    # Stopword
    factory = StopWordRemoverFactory()
    stopword = factory.create_stop_word_remover()
    _query = stopword.remove(query)

    # Stemming
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    _query = stemmer.stem(_query)

    # TF Query
    listTf = []
    blob = tb(_query)
    uniqWord = list(set(blob.words))
    for word in uniqWord:
        _n = blob.words.count(word)
        listTf.append(Tf("query", word, _n))

    # *** SCORING PROCESS ***
    # TF
    # Desc: get every blog title whose content contains a query word
    listTitleBlog = []
    for word in uniqWord:
        try:
            blogList = BlogRepository().getByWord(word)
            for t in blogList:
                listTitleBlog.append(t["title"])
        except:
            print(word, "not available")

    if (len(listTitleBlog) == 0):
        return render_template("result.html", rank=rank, query=query)

    listTitleBlog = list(set(listTitleBlog))  #Unique Blog Title

    listBlog = []
    for l in listTitleBlog:
        listBlog.append(BlogRepository().getByTitle(l))

    # IDF
    blobList = []
    for blog in listBlog:
        content = blog["content"]
        blobList.append(tb(content))

    idfList = []
    for word in uniqWord:
        idfList.append(idf(word, blobList))

    # Scoring
    for title in listTitleBlog:
        result = 0
        for i, word in enumerate(uniqWord):
            try:
                # if word available
                # Counting Word
                blogData = BlogRepository().getByTitle(title)
                _content = blogData["tf"]
                blob = tb(_content)
                _n = blob.words.count(word)
                result = result + (idfList[i] * _n)
            except:
                print(word, "not available")
        score.append(result)
    lenScore = len(score)
    while (lenScore > 0):
        bestIndex = score.index(max(score))
        rank.append(listBlog[bestIndex])
        del score[bestIndex]
        del listBlog[bestIndex]
        lenScore = len(score)
    # print(rank)
    return render_template("result.html", rank=rank, query=query)
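
This route calls an idf(word, blobList) helper that is not included in the snippet. A plausible minimal version, following the common TextBlob TF-IDF recipe (an assumption, not necessarily the author's exact code):

import math

def n_containing(word, bloblist):
    # number of documents (TextBlobs) that contain the word
    return sum(1 for blob in bloblist if word in blob.words)

def idf(word, bloblist):
    # inverse document frequency with +1 smoothing to avoid division by zero
    return math.log(len(bloblist) / (1 + n_containing(word, bloblist)))

Example #16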
    def scrappingData(request, data, jmlDataScrapping):

        search = data
        jumlahData = int(jmlDataScrapping)
        print(data)
        nltk.download('punkt')
        nltk.download('stopwords')

        # %matplotlib inline
        RANDOM_SEED = 42
        np.random.seed(RANDOM_SEED)

        chrome_path = r"C:\Users\Rifqi Rosidin\Documents\za\chromedriver_win32\chromedriver.exe"
        driver = webdriver.Chrome(chrome_path)

        driver.get('https://play.google.com/store/search?q=' + search +
                   '&c=apps' + '&hl=in')
        tes = driver.find_element_by_xpath(
            "//*[@id='fcxH9b']/div[4]/c-wiz/div/div[2]/div/c-wiz/c-wiz[1]/c-wiz/div/div[2]/div[1]/c-wiz/div/div/div[1]/div/div/a"
        )
        tes.click()
        time.sleep(5)
        tes1 = driver.find_element_by_xpath(
            "//*[@id='fcxH9b']/div[4]/c-wiz[2]/div/div[2]/div/div[1]/div/div/div[1]/div[6]/div/span/span"
        )
        tes1.click()

        time.sleep(4)

        count = 1
        i = 1
        while i < 5:
            try:
                driver.execute_script(
                    "window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(2)
                if ((i % 5) == 0):
                    driver.execute_script('window.scrollTo(1, 2000);')
                    time.sleep(2)
                    tes2 = driver.find_element_by_xpath(
                        "//*[@id='fcxH9b']/div[4]/c-wiz[3]/div/div[2]/div/div[1]/div/div/div[1]/div[2]/div[2]/div/span/span"
                    )
                    tes2.click()
                print("scroll ke -" + str(count))
                i += 1
                count += 1
            except:
                print("skip scrol")
                i += 1
                count += 1
        print('finished scrolling')
        # Loop controls: `a` is a sentinel, `b` the review index, `c` the collected
        # review texts, and `errorNumber` counts failed lookups
        a = 'test1'
        b = 1
        c = []
        d = 0
        errorNumber = 0
        driver.execute_script('window.scrollTo(1, 10);')

        while a != 'test':
            d = 2
            try:
                tes3 = driver.find_element_by_xpath(
                    "//*[@id='fcxH9b']/div[4]/c-wiz[3]/div/div[2]/div/div[1]/div/div/div[1]/div[2]/div[1]/div["
                    + str(b) + "]/div/div[2]/div[2]/span[1]/div/button")
                tes3.click()
            except NoSuchElementException:
                d = 1

            try:
                tes4 = driver.find_element_by_xpath(
                    "/html/body/div[1]/div[4]/c-wiz[3]/div/div[2]/div/div[1]/div/div/div[1]/div[2]/div/div["
                    + str(b) + "]/div/div[2]/div[2]/span[" + str(d) + "]")

                # print(str(b) + tes4.text)
                print("review ke - " + str(b))
                c.append(tes4.text)

                if (int(b) >= jumlahData):
                    a = 'test'
                b += 1
                errorNumber += 1
            except:
                print(jumlahData)
                errorNumber += 1
                if (int(errorNumber) >= jumlahData):
                    a = 'test'
                b += 1

    # end of the data-scraping stage ------------------------------------

        print(len(c))

        # Drop reviews that are blank after removing the placeholder word 'emoji'
        data = pd.DataFrame({"ulasan": c})
        df = data['ulasan']
        ulasan = []
        x = 0
        y = ''
        for i in df:
            emoji = i.replace('emoji', '')
            if emoji.isspace():
                ulasan.append(x)  # row index of an empty / emoji-only review
            else:
                y = 'tess'
            x += 1
        komentar = data.drop(ulasan)

        print("-------- reviews after dropping emoji-only entries --------")
        print(komentar)

        # Case folding, digit removal, punctuation removal, and whitespace trimming
        case = []
        for i in komentar['ulasan']:
            b = str(i)
            a = b.lower()  # lowercase
            c = re.sub(r'[0-9]+', '', a)  # remove digits
            d = c.strip()  # strip surrounding whitespace
            e = d.translate(str.maketrans(
                "", "", string.punctuation))  # remove punctuation characters
            case.append(e)

        komentar['ulasan'] = case
        print("\n")
        print("------- case folding, digit removal, and whitespace ---------")
        print(komentar)

        # Tokenization: split each review into word tokens
        token = []

        for i in komentar['ulasan']:
            tokens = nltk.tokenize.word_tokenize(str(i))
            token.append(tokens)

        # End of the tokenization stage
        def listToString(s):

            # initialize an empty string
            str1 = ""

            # concatenate the string form of every element
            for i in s:
                str1 += str(i)

            # return string
            return str1

        kata = listToString(token)

        print("------- tokenization ---------")
        print(kata)

        # Extend the stopword list with custom entries
        stop_factory = StopWordRemoverFactory().get_stop_words()
        more_stopword = ['yg', 'tp']  # extra stopwords
        stop_list = stop_factory + more_stopword
        print(stop_list)

        factory = StopWordRemoverFactory()
        stopword = factory.create_stop_word_remover()

        word = []
        for i in komentar['ulasan']:
            stop = stopword.remove(str(i))
            tokens = nltk.tokenize.word_tokenize(stop)
            # keep only the tokens that are not in the extended stopword list
            word.append([t for t in tokens if t not in stop_list])
        # word

        # end of the stopword stage

        # Reduce every word to its base form (stemming)

        factory = StemmerFactory()
        stemmer = factory.create_stemmer()

        Hasil = []

        for i in komentar['ulasan']:
            hasil = stemmer.stem(str(i))
            Hasil.append(hasil)

        kata = listToString(Hasil)

        print("------PROSES LABELING--------")
        config = dict()
        config["negation"] = True
        config["booster"] = True
        config["ungkapan"] = True
        config["consecutive"] = True
        config["repeated"] = True
        config["emoticon"] = True
        config["question"] = True
        config["exclamation"] = True
        config["punctuation"] = True

        senti = label.sentistrength(config)

        print(len(Hasil))
        dt = pd.DataFrame({"ulasan": Hasil})
        dt.head(len(Hasil))

        sentim = []
        for i in dt['ulasan']:
            x = senti.main(i)
            sentim.append(x['kelas'])

        dt['label'] = sentim

        X = komentar['ulasan'].values
        y = dt['label'].values

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=RANDOM_SEED)

        MNB = MultinomialNaiveBayes(classes=np.unique(y),
                                    tokenizer=Tokenizer()).fit(
                                        X_train, y_train)

        # Algorithm accuracy
        y_hat = MNB.predict(X_test)
        akurasi = accuracy_score(y_test, y_hat)
        print("accuracy")
        print(akurasi)
        dt['akurasi'] = akurasi
        response = dt.to_dict()
        return response