import re, os, numpy as np
from collections import Counter
from string import punctuation as punct
from nltk.tokenize import word_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

# Fetch the Sastrawi stop-word list first so that `factory` is left bound to
# a StemmerFactory once this setup finishes, as it was before.
factory = StopWordRemoverFactory()
stop_words = factory.get_stop_words()

# Build the stemmer exactly once. The previous code constructed an identical
# second stemmer right after the stop-word list, repeating the same setup
# work without changing the result.
factory = StemmerFactory()
stemmer = factory.create_stemmer()


def punct_except(p):
    """Return the ASCII punctuation characters with those in ``p`` removed.

    Args:
        p: Iterable of characters (typically a string) to leave out.

    Returns:
        A string holding every character of ``string.punctuation`` that is
        not in ``p``, in the same order as ``string.punctuation``.
    """
    excluded = set(p)
    # Filter the ordered constant instead of joining a set difference: set
    # iteration order for strings is not stable between interpreter runs, so
    # the old result could come back in a different order each time.
    return ''.join(ch for ch in punct if ch not in excluded)


def remove_punc(match):
    """Strip the outer characters of a match and blank out '-' and '/'.

    Takes a regex match object (so it can serve as an ``re.sub`` replacement
    callback), drops the first and last character of the full matched text,
    turns every '-' and '/' in the remainder into a space, and returns the
    result padded with one space on each side.
    """
    inner = match.group(0)[1:-1]
    # One translate pass replaces both separator characters with spaces.
    cleaned = inner.translate(str.maketrans('-/', '  '))
    return ' {} '.format(cleaned)


def remove_dash(match):
    """Return the full matched text without its first character.

    The first character is dropped unconditionally; judging by the name it
    is presumably a leading dash, but the code does not check that.
    """
    return match.group()[1:]
# --- Example no. 2 ("Esempio n. 2") ---
# 0
# tokenization
from nltk.tokenize import sent_tokenize, word_tokenize

# stemming
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Module-level stemmer instance created through Sastrawi's StemmerFactory.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# stopword removal
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

# `factory` is rebound here to a StopWordRemoverFactory. It supplies both the
# remover object (`stopword`) and whatever get_stop_words() returns
# (`getStopWord`, presumably the list of stop words; confirm against Sastrawi).
factory = StopWordRemoverFactory()
stopword = factory.create_stop_word_remover()
getStopWord = factory.get_stop_words()

#import custom lexicon
from lexiconCustom import lexCustom


# -------------import excel dataset-------------
def importExcelDataSet(selectedSheet):
    hasil = []
    labelManual = []
    for i in range(2, 105):
        if (selectedSheet.cell(row=i, column=7).value == None):
            break
        else:
            hasil.append(selectedSheet.cell(row=i, column=7).value)
            labelManual.append(selectedSheet.cell(row=i, column=11).value)
        document = re.sub(r'\W', ' ', text)
        document1 = re.sub(r'\s+[a-zA-Z]\s+', '', document)
        document2 = re.sub(r'\^[a-zA-Z]\s+', '', document1)
        document3 = re.sub(r"\d+", "", document2)
        document4 = re.sub(r'\s+', ' ', document3, flags=re.I)
        document5 = re.sub(r'^b\s+', '', document4)
        document5 = remove_all_extra_spaces(document5)
        document6 = document5.lower()
        document7 = document6.split()
        document8 = stemmer.stem(document6)
        document9 = ''.join(document8)
        documents.append(document9)
    Wtd = documents

    factory = StopWordRemoverFactory()
    stopword1 = factory.get_stop_words()
    more_stopwords = [
        'mohon', 'yang', 'berdasarkan', 'ada', 'kepala', 'timur', 'kantor',
        'pt', ' pt', 'pihak', 'bidang', 'ata', 'ii', 'iii', 'tertanggal',
        'melalui', 'jo', 'menjadi', 'terletak', 'tidak', 'ganti', 'di', 'pn',
        'sdr', 'res', 'tgl', 'mengenai', 'tahun', 'su', 'ri', 'ix', 'atas',
        'melalui', 'tanggapan', 'tentang', 'diduga', 'kec', 'adanya', 'ada',
        'tengah', 'pernyataan', 'tembusan', 'sesuai', 'ii', 'iii', 'iiiin',
        'iv', 'ix', 'vi', 'vii', 'viii', 'xii', 'xiii', 'selua', 'sh', 'bapak',
        '', 'hgu', 'ma', 'su', 'ham', 'perihal', 'milik', 'satu', 'tidak',
        ' narada', ' di', 'narada ', 'nomor', 'atas', 'pk', 'okt', 'agustus',
        'juli', 'april', 'terhadap', 'kedua', 'jaya', 'untuk', 'bin', 'upaya',
        'melalui', 'tentang', 'februari', 'dilakukan', 'pusat', 'selatan',
        'atas', 'data', 'lp', 'dalam', 'juni', 'adanya', 'mengenai', 'jkt',
        'atau', 'jawaban', 'tinggi', 'telah', 'maret', 'bapak', 'oktober',
        'januari', 'juli', 'mei', 'september', 'xi', 'agung', 'ada', 'dengan',