Example #1
File: utils.py  Project: yighu/ML_Project
def stop_stem_remover(kalimat):
    """
    membersihkan stop word dan melakukan seemming

    input : 
    kalimat : kalimat di dalam corpus

    return
    kalimat
    """
    # drop words that carry little meaning (stop words)
    #factory = StopWordRemoverFactory()
    stop_factory = StopWordRemoverFactory().get_stop_words()
    add_stop_word = ['dkk', 'et', 'al', 'all'] # manually added stop words
    stop = stop_factory + add_stop_word
    dicts = ArrayDictionary(stop)

    all_stop = StopWordRemover(dicts)
    kalimat = all_stop.remove(kalimat)

    # stemming (reduce words to their base form)
    stemmerFactory = StemmerFactory()
    stemmer = stemmerFactory.create_stemmer()

    kalimat = stemmer.stem(kalimat)
    return kalimat
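Assuming the usual Sastrawi imports (shown in later examples on this page), a minimal usage sketch for the function above; the sample sentence is made up for illustration:

from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory, StopWordRemover, ArrayDictionary
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Hypothetical call on an invented Indonesian sentence
hasil = stop_stem_remover("Penelitian ini dilakukan oleh Budi dkk dengan beberapa percobaan")
print(hasil)  # stop words (including the manual ones such as 'dkk') are removed, the rest is stemmed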
Example #2
def removeStopWord(query):
    factory = StopWordRemoverFactory().get_stop_words()
    more_stopword = ['!', '.', ',', '?']
    data = factory + more_stopword
    dic = ArrayDictionary(data)
    stopword = StopWordRemover(dic)
    return stopword.remove(query)
Example #3
 def __init__(self):
     with open('./stopwords.txt') as f:
         more_stopword=f.read().split('\n')
     
     SWfactory = StopWordRemoverFactory()
     stopword_data = ArrayDictionary(more_stopword+SWfactory.get_stop_words())
     self.stopword = StopWordRemover(stopword_data)
Example #4
    def _processTweet(self, tweet):
        punctuations = '''!()-![]{};:+'"\,<>./?@#$%^&*_~'''
        tweet = tweet.lower()  # convert text to lower-case
        tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', '',
                       tweet)  # remove URLs
        tweet = re.sub('@[^\s]+', '', tweet)  # remove usernames
        tweet = re.sub(r'#([^\s]+)', r'\1', tweet)  # remove the # in #hashtag
        tweet = "".join(
            (char for char in tweet if char not in string.punctuation))
        tweet = re.sub('\s+', ' ', tweet).strip()
        tweet = re.sub(r"\d", "", tweet)
        # Load Sastrawi's built-in stop words
        stop_factory = StopWordRemoverFactory().get_stop_words()
        more_stopword = open("stopword.txt", "r").read().split()
        # Merge the stop word lists
        data = stop_factory + more_stopword
        dictionary = ArrayDictionary(data)
        stop_remover = StopWordRemover(dictionary)  # avoid shadowing the builtin str

        factory1 = StemmerFactory()  # stemming factory
        stemmer = factory1.create_stemmer()  # create the stemmer

        tweet = stop_remover.remove(tweet)
        # tweet = stemmer.stem(tweet)  # stemming tweet
        tweet = word_tokenize(tweet)  # tokenize into words
        # return [word for word in tweet if word not in self._stopwords]
        return tweet
Example #5
 def Stopword(doc):
     stop_factory = StopWordRemoverFactory().get_stop_words()
     more_stopword = ['ini', 'itu', 'the']
     data = stop_factory + more_stopword
     dictionary = ArrayDictionary(data)
     data_str = StopWordRemover(dictionary)
     dokumen = data_str.remove(doc)
     return dokumen
Example #6
 def stopword(self):
     stop_factory = StopWordRemoverFactory().get_stop_words()
     more_stopword = ['diatur', 'perjodohan', 'dengan', 'ia', 'bahwa', 'oleh', 'nya']
     data = stop_factory + more_stopword
      
     stop_factory = StopWordRemoverFactory()
     dictionary = ArrayDictionary(data)
     self.stopword = StopWordRemover(dictionary)
Example #7
    def __stopward_removal(self, tokens):
        stop_factory = StopWordRemoverFactory().get_stop_words()

        more_stopword = ['dong', 'atuh', 'plis']

        data = stop_factory + more_stopword

        dictionary = ArrayDictionary(data)

        str_remove = StopWordRemover(dictionary)

        tokens = word_tokenize(str_remove.remove(' '.join(tokens)))

        return tokens
Example #8
 def __filtering_sastrawi(self, documents):
     stop_factory = StopWordRemoverFactory().get_stop_words()
     list_stop = stop_factory + self.stop_more
     dictionary = ArrayDictionary(list_stop)
     stopwords = StopWordRemover(dictionary)
     stop = stopwords.remove(documents)
     return stop
Example #9
class Stemmer:
    def __init__(self):
        self.stemmer()
        self.stopword()

    def stemmer(self):
        self.factory = StemmerFactory()
        self.stemmer = self.factory.create_stemmer()

    def stopword(self):
        stop_factory = StopWordRemoverFactory().get_stop_words()
        more_stopword = ['diatur', 'perjodohan', 'dengan', 'ia', 'bahwa', 'oleh', 'nya']
        data = stop_factory + more_stopword
         
        stop_factory = StopWordRemoverFactory()
        dictionary = ArrayDictionary(data)
        self.stopword = StopWordRemover(dictionary)

    def stem(self, sentence = None):       
        sentence = self.stemmer.stem(sentence)

        return sentence

    def remove(self, sentence = None):
        sentence = self.stopword.remove(sentence)

        return sentence
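A brief usage sketch for the Stemmer wrapper above, assuming the same Sastrawi imports used throughout these examples; the sentence is illustrative only:

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory, StopWordRemover, ArrayDictionary

s = Stemmer()
text = "Perjodohan itu diatur oleh keluarganya"  # made-up input
text = s.remove(text)  # drop default + custom stop words
text = s.stem(text)    # reduce the remaining words to their base form
print(text)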
Example #10
def stopword(text):
    # Load Sastrawi's built-in stop words
    stop_factory = StopWordRemoverFactory().get_stop_words()
    print(stop_factory)
    more_stopword = ['diatur', 'perjodohan']

    # Merge the stop word lists
    data = stop_factory + more_stopword

    dictionary = ArrayDictionary(data)
    stop_remover = StopWordRemover(dictionary)  # avoid shadowing the builtin str

    hasil = stop_remover.remove(text)
    # print(hasil)

    return hasil
Example #11
def pre_processing(text):
    stopwords = pd.read_csv('stopwordbahasa.csv', names=['stopword'])['stopword'].tolist()

    stem = StemmerFactory() 
    stemmer = stem.create_stemmer()
    factory = StopWordRemoverFactory()
    stopword = StopWordRemover(ArrayDictionary(factory.get_stop_words() + stopwords))

    clean_str = text.lower() # lowercase
    clean_str = re.sub(r"(?:\@|#|https?\://)\S+", " ", clean_str) # eliminate username, url, hashtags
    clean_str = re.sub(r'&amp;', '', clean_str) # remove &amp; as it equals &
    clean_str = re.sub(r'[^\w\s]',' ', clean_str) # remove punctuation
    clean_str = re.sub('[\s\n\t\r]+', ' ', clean_str) # remove extra space
    clean_str = clean_str.strip() # trim
    clean_str = " ".join([stemmer.stem(word) for word in clean_str.split()]) # stem
    clean_str = stopword.remove(clean_str) # remove stopwords
    return clean_str
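A hedged usage sketch for pre_processing; it assumes pandas, re and the Sastrawi imports, plus a local stopwordbahasa.csv (one stop word per line) that is not part of this page:

import re
import pandas as pd
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory, StopWordRemover, ArrayDictionary

# Illustrative tweet; stopwordbahasa.csv must exist in the working directory
print(pre_processing("RT @user Belajar #NLP di https://contoh.com sangat menyenangkan!!!"))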
Example #12
 def preprocess_sentence(self, q=""):
     #tokenize, lower, stopword,stem
     default_stopwords = StopWordRemoverFactory().get_stop_words()
     additional_stopwords = [
         "(", ")", "senin", "selasa", "rabu", "kamis", "jumat", "sabtu",
         "minggu"
     ]
     dictionary = ArrayDictionary(default_stopwords + additional_stopwords)
     stopword = StopWordRemover(dictionary)
     factory = StemmerFactory()
     stemmer = factory.create_stemmer()
     tokenizer = RegexpTokenizer(r'\w+')
     res = " ".join(tokenizer.tokenize(q))
     res = res.lower()
     res = stopword.remove(res)
     res = stemmer.stem(res)
     return res
Example #13
def generateStopWords(pat, txt):
    # Load Sastrawi's built-in stop words
    stop_factory = StopWordRemoverFactory().get_stop_words()
    more_stopwords = [' ?', '?', ' .', '.', ' ,', ',']
    # Merge the stop word lists
    data = stop_factory + more_stopwords

    dictionary = ArrayDictionary(data)
    stop_remover = StopWordRemover(dictionary)  # avoid shadowing the builtin str

    temppat = stop_remover.remove(pat)
    if (temppat == '' or temppat is None):
        temppat = pat

    temptxt = stop_remover.remove(txt)
    if (temptxt == '' or temptxt is None):
        temptxt = txt

    return temppat, temptxt
Example #14
def remove_stopwords_id(kalimat):
    # load Sastrawi's built-in stop words
    stop_factory = StopWordRemoverFactory().get_stop_words()
    more_stopword = ['daring', 'online', 'nih']

    # merge the stop word lists
    data = stop_factory + more_stopword

    dictionary = ArrayDictionary(data)
    string = StopWordRemover(dictionary)
    tokens = nltk.tokenize.word_tokenize(string.remove(kalimat))
    return (" ".join(tokens))
Example #15
 def remove_stopwords(self,
                      csv_src="",
                      csv_dest="",
                      cols_to_clean=["KOMPETENSI"],
                      sep=";"):
     #factory = StopWordRemoverFactory()
     default_stopwords = StopWordRemoverFactory().get_stop_words()
     additional_stopwords = [
         "(", ")", "senin", "selasa", "rabu", "kamis", "jumat", "sabtu",
         "minggu"
     ]
     dictionary = ArrayDictionary(default_stopwords + additional_stopwords)
     stopword = StopWordRemover(
         dictionary
     )  #factory.create_stop_word_remover(dictionary = dictionary)
     tokenizer = RegexpTokenizer(r'\w+')
     df = pd.read_csv(csv_src, sep=sep)
     for c in cols_to_clean:
         df[c] = df[c].map(lambda x: " ".join(tokenizer.tokenize(x))
                           )  #get only words without symbols
         df[c] = df[c].map(lambda x: stopword.remove(x))  #remove stop words
     df.to_csv(csv_dest, sep=sep, index=None)
     print("lower %d rows" % len(df))
Example #16
 def __init__(self, tweet):
     self.tweet = tweet
     stop_factory = StopWordRemoverFactory().get_stop_words()
     stop_factory = stop_factory + self.additional_stopwords
     dictionary = ArrayDictionary(stop_factory)
     self.strword = StopWordRemover(dictionary)
Example #17
class TweetProcessing():
    user_pattern = r'(?<=^|(?<=[^a-zA-Z0-9-_\.]))@([A-Za-z]+[A-Za-z0-9-_]+)'
    url_pattern = r'(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9]\.[^\s]{2,})'
    url_pattern2 = r'https://t.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9]\.[^\s]{2,}'
    #url_pattern = '^(http:\/\/www\.|https:\/\/www\.|http:\/\/|https:\/\/)?[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(:[0-9]{1,5})?(\/.*)?$'
    digit_pattern = r'^\d+\s|\s\d+\s|\s\d+$'
    rt_pattern = r'RT\s:*'
    additional_stopwords = [
        'cc ', 'cc:', 'cc.', 'a', 'd', 'g', 'e', 'y', 'ga', 'gmn', 'tdk',
        'nah', 'sih', 'blm', 'ni', 'di', 'sy', 'sya', 'rt', 'jl', 'jl.', 'jln',
        'jln.', 'no', 'no.', 'dlm', 'tx', 'thx', 'he', 'd', 'k', 'sm'
    ]

    def __init__(self, tweet):
        self.tweet = tweet
        stop_factory = StopWordRemoverFactory().get_stop_words()
        stop_factory = stop_factory + self.additional_stopwords
        dictionary = ArrayDictionary(stop_factory)
        self.strword = StopWordRemover(dictionary)

    def set_tweet(self, tweet):
        self.tweet = tweet

    def get_tweet(self, tweet):
        return self.tweet

    def clean_up_tweet_usernames(self):
        return re.sub(self.user_pattern, '', self.tweet)

    def clean_up_tweet_url(self):
        self.tweet = re.sub(self.url_pattern, '', self.tweet)
        self.tweet = self.tweet.replace("https://t.?", '')
        self.tweet = self.tweet.replace("https://t?", '')
        self.tweet = self.tweet.replace("https://?", '')
        return re.sub(self.url_pattern2, '', self.tweet)

    def clean_up_tweet_rt(self):
        return re.sub(self.rt_pattern, '', self.tweet)

    def clean_up_tweet_digits(self):
        self.tweet = ''.join([i for i in str(self.tweet) if not i.isdigit()])
        return self.tweet
        #return re.sub(self.digit_pattern,'', self.tweet)

    def remove_stop_words(self):
        self.tweet = self.strword.remove(self.tweet)
        return self.tweet

    def stemming_tweet(self):
        factory = StemmerFactory()
        stemmer = factory.create_stemmer()
        self.tweet = stemmer.stem(self.tweet)
        return self.tweet

    def clean_up_tweet(self):
        self.tweet = self.tweet.lower()
        #self.tweet = self.clean_up_tweet_usernames()
        self.tweet = self.clean_up_tweet_url()
        self.tweet = self.clean_up_tweet_rt()
        self.tweet = self.clean_up_tweet_digits()
        #self.tweet = self.tweet.replace('.',' ')
        self.tweet = self.tweet.replace(',', ' ')
        self.tweet = self.tweet.replace('?', '')
        self.tweet = self.tweet.replace('  ', ' ')
        self.tweet = self.stemming_tweet()
        self.tweet = self.remove_stop_words()
        self.tweet = self.tweet.translate(str.maketrans('', '', string.punctuation))  # strip punctuation
        if self.tweet.startswith('"') and self.tweet.endswith('"'):
            self.tweet = self.tweet[1:-1]

        return self.tweet
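A small usage sketch for TweetProcessing under the imports the class relies on (re, string, Sastrawi); the tweet text is invented:

import re
import string
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory, StopWordRemover, ArrayDictionary

tp = TweetProcessing("@dishub jalan di jl. sudirman macet bgt nih 123 https://t.co/abc")
print(tp.clean_up_tweet())  # lowercased, URLs and digits stripped, stemmed, stop words removed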
Example #18
import nltk
from bs4 import BeautifulSoup
from nltk.tokenize import WordPunctTokenizer
tok = WordPunctTokenizer()
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory, StopWordRemover, ArrayDictionary
stop_word = StopWordRemoverFactory().get_stop_words()

more_stopword = [
    'yg', 'ajah', 'iya', 'mba', 'mas', 'kak', 'pak', 'pahi', 'mah', 'muehehe',
    'men', 'kehfine', 'alhamdulilah', 'alhamdulillah', 'nih', 'om', 'selamat',
    'sama', 'sabar', 'gak', 'yak', 'semoga',
    'bu', 'adik', 'omen', 'tumben', 'tp', 'sy', 'kmu', 'jg', 'kyk', 'dll'
]
d_sword = stop_word + more_stopword
dictionary = ArrayDictionary(d_sword)
swr = StopWordRemover(dictionary)

pat1 = r'@[A-Za-z0-9]+'
pat2 = r'https?://[A-Za-z0-9./]+'
pat3 = '(RT)'
combined_pat = r'|'.join((pat1, pat2, pat3))
df_t = df['text']


def tweet_cleaner(text):
    soup = BeautifulSoup(text, 'lxml')
    souped = soup.get_text()
    stripped = re.sub(combined_pat, '', souped)
    try:
        clean = stripped.decode("utf-8").replace(u"\ufffd", "?")
    except:
        clean = stripped
Example #19
                     'cari', 'bisa', 'banyak', 'bagaimana', 'beberapa', 'bagi',
                     'bertanya', 'baru', 'berapa', 'baik', 'bagian', 'banyaknya',
                     'bukan', 'buka', 'buaya', 'boleh', 'bernama', 'berapakah', 
                     'berada', 'bilang', 'begitu', 'bahkan', 'nya', 'nyata', 
                     'nikita', 'noka', 'nanya', 'ngurah', 'ngasih', 'namum',
                     'namanya', 'masih', 'melakukan', 'mereka', 'merokok',
                     'memang', 'mana', 'malam', 'misal', 'melporkan', 'mdr',
                     'mas', 'mangga', 'mungkin', 'mundu', 'menurut',
                    ]
data = stop_factory + more_stopword
dictionary = ArrayDictionary(data)

# factory = StopWordRemoverFactory()
# data = factory.get_stop_words()+more_stopword
# stopword = factory.create_stop_word_remover()
stopword = StopWordRemover(dictionary)

tokenizer = Tokenizer()
nama_file = "D:\\resa\\D\\KULIAH\\S2\\Semester 1\\python\\mlNN_1\\datasetSMS\\data-set-50-persen.csv"
# df = pd.read_csv(nama_file, encoding = "ISO-8859-1").values
# data = df[:, 0]
# label = df[:, 1]

data = []
label = []
with open(nama_file, 'r') as csvfile:
    reader = csv.reader(csvfile, delimiter=',', quotechar='"')
    next(reader)  # skip header
    for row in reader:
        data.append(row[0])
        label.append(row[1])
Example #20
    'melporkan',
    'mdr',
    'mas',
    'mangga',
    'mungkin',
    'mundu',
    'menurut',
]
data = stop_factory + more_stopword
# print(data)
dictionary = ArrayDictionary(data)

# factory = StopWordRemoverFactory()
# data = factory.get_stop_words()+more_stopword
# stopword = factory.create_stop_word_remover()
stopword = StopWordRemover(dictionary)

tokenizer = Tokenizer()
texts = [
    "Kepada, Yth. Pemerintah Provinsi DKI Jakarta, PT.Metaliska yang telah mendirikan bangunan di atas tanah bantaran kali (melanggar GSK) 300 m dari terminal bus Pulogadung arah ke Timur, mohon ditindak Terima kasih"
]
# kalimat = "Kepada, Yth. Pemerintah Provinsi DKI Jakarta, PT. Metaliska yang telah mendirikan bangunan di atas tanah bantaran kali (melanggar GSK) 300 m dari terminal bus Pulogadung arah ke Timur, mohon ditindak Terima kasih"
kalimat = input("Masukkan Kalimat : ")
print('==================================')
print('Kalimat awal/asli : ')
print(str(kalimat) + '\n')
print("1. hasil proses case folding :")
print(kalimat.lower())
print('\n')
print('2. hasil proses menghilangkan tanda baca/filtering :')
hasil = kalimat.lower().translate(str.maketrans("", "", string.punctuation))
Example #21
def api_echo():
    if request.method == 'POST':

        # create stemmer
        factory = StemmerFactory()
        stemmer = factory.create_stemmer()
        factory = StopWordRemoverFactory()

        more_stopword = []
        # add stopword
        with open('dataset/stopword.csv') as csvfile:
            readCSV = csv.reader(csvfile, delimiter=',')
            for row in readCSV:
                more_stopword.append(row[0])

        dictionary = ArrayDictionary(more_stopword)
        stop_remover = StopWordRemover(dictionary)  # avoid shadowing the builtin str

        newsTrainer = Trainer(tokenizer)

        kesehatan = []
        konsultasi = []
        marketing = []

        with open("dataset/kesehatan.txt", "r") as ins:
            for line in ins:
                kesehatan.append({
                    'text': line.rstrip(),
                    'category': 'kesehatan'
                })

        with open("dataset/konsultasi.txt", "r") as ins:
            for line in ins:
                konsultasi.append({
                    'text': line.rstrip(),
                    'category': 'konsultasi'
                })

        with open("dataset/marketing.txt", "r") as ins:
            for line in ins:
                marketing.append({
                    'text': line.rstrip(),
                    'category': 'marketing'
                })

        # You need to train the system passing each text one by one to the trainer module.
        newsSet = kesehatan + konsultasi + marketing

        for news in newsSet:
            newsTrainer.train(news['text'], news['category'])

        # When you have sufficient trained data, you are almost done and can start to use
        # a classifier.
        newsClassifier = Classifier(newsTrainer.data, tokenizer)

        query = request.form['query']  # keep it as str; the Sastrawi stemmer expects text, not bytes
        #query = "Apa saja level bonus yang didapat bagi seorang agen?"

        # stem the query and remove its stop words
        out = stemmer.stem(query)
        out = stop_remover.remove(out)
        classification = newsClassifier.classify(out)

        # the classification variable holds the detected categories sorted
        #return classification[0][0]
        return jsonify(classification)
Example #22
def createStopword(more_stopword=[]):
    stop_factory = StopWordRemoverFactory().get_stop_words()
    new_stop_word = stop_factory + more_stopword
    dictionary = ArrayDictionary(new_stop_word)
    stopword = StopWordRemover(dictionary)
    return stopword
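For completeness, a short sketch of calling createStopword; the extra stop words passed in are arbitrary examples:

from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory, StopWordRemover, ArrayDictionary

remover = createStopword(more_stopword=['nih', 'dong'])  # illustrative extras
print(remover.remove("ini contoh kalimat nih"))          # default and extra stop words are stripped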
Example #23
tool = victorinox()
population1_dict = {}
population2_dict = {}
population_root_path = r"corpus/population"
population_files = glob(os.path.join(population_root_path, "**/*.txt"),
                        recursive=True)
tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
factory = StemmerFactory()
stemmer = factory.create_stemmer()
default_stopwords = StopWordRemoverFactory().get_stop_words()
additional_stopwords = [
    "(", ")", "senin", "selasa", "rabu", "kamis", "jumat", "sabtu", "minggu"
]
dictionary = ArrayDictionary(default_stopwords + additional_stopwords)
id_stopword = StopWordRemover(dictionary)
en_stopword = set(stopwords.words('english'))
en_stemmer = PorterStemmer()


def remove_numbers(text):
    words = tokenizer.tokenize(text)
    return " ".join(words)


def remove_punctuation(text):
    words = text.split()
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in words]
    return " ".join(stripped)
Example #24
class TextSummarization():
    def __init__(self):
        with open('./stopwords.txt') as f:
            more_stopword=f.read().split('\n')
        
        SWfactory = StopWordRemoverFactory()
        stopword_data = ArrayDictionary(more_stopword+SWfactory.get_stop_words())
        self.stopword = StopWordRemover(stopword_data)

    def Preprocessing(self, text):        
        clean = re.sub("#[^\W]+|@[^\W]+|http[^*\s]+|<[^>]*>|[0-9]", '', text)  # cleansing: drop hashtags, mentions, URLs, HTML tags and digits
        emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', clean)  # collect emoticons so they survive the cleanup
        text = (re.sub('[\W]+', ' ', clean.lower()) +  # case folding
                ' '.join(emoticons).replace('-', ''))
        result=''
        for kata in text.split(): 
            stop = self.stopword.remove(kata) #Stopword 
            result += f"{stop} " if stop else ''
        return result

    def Summary(self, doc, preprocess=False):
        doc_tokenizer = PunktSentenceTokenizer()
        sentences_list = doc_tokenizer.tokenize(doc)

        clean_sentences_list=[] 
        for sentence in sentences_list:
            clean_sentences_list.append(self.Preprocessing(sentence)) 

        cv = CountVectorizer()
        cv_matrix = cv.fit_transform(clean_sentences_list if preprocess else sentences_list)
        normal_matrix = TfidfTransformer().fit_transform(cv_matrix)

        tfidf=normal_matrix.toarray()
        res_graph = normal_matrix * normal_matrix.T  # similarity / adjacency matrix

        nx_graph= from_scipy_sparse_matrix(res_graph)
        pageranks = pagerank(nx_graph)

        sentence_array = sorted(((pageranks[i], s) for i, s in enumerate(sentences_list)), reverse=True)
        sentence_array = np.asarray(sentence_array)

        rank_max = float(sentence_array[0][0])
        rank_min = float(sentence_array[len(sentence_array) - 1][0])

        temp_array = []

        # If all sentences have the same rank,
        # taking any sentence will give the summary, say the first sentence
        flag = 0
        if rank_max - rank_min == 0:
            temp_array.append(0)
            flag = 1

        # If the sentence has different ranks
        if flag != 1:
            for i in range(0, len(sentence_array)):
                temp_array.append((float(sentence_array[i][0]) - rank_min) / (rank_max - rank_min))
        
        # Calculation of the threshold:
        # we take the mean of the normalized scores; with the optional "+ 0.2" offset,
        # only sentences scoring 0.2 above the mean would be kept
        threshold = (sum(temp_array) / len(temp_array))  # + 0.2

        # Separate out the sentences that satisfy the criterion of having a score above the threshold
        sentence_list = []
        if len(temp_array) > 1:
            for i in range(0, len(temp_array)):
                if temp_array[i] > threshold:
                        sentence_list.append(sentence_array[i][1])
        else:
            sentence_list.append(sentence_array[0][1])

        summary = " ".join(str(x) for x in sentence_list)

        return summary
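A usage sketch for TextSummarization, assuming the imports the class relies on (re, numpy, NLTK's PunktSentenceTokenizer, scikit-learn's CountVectorizer/TfidfTransformer, networkx's pagerank and from_scipy_sparse_matrix, and Sastrawi); note that recent networkx releases renamed from_scipy_sparse_matrix to from_scipy_sparse_array. The document text is invented:

ts = TextSummarization()
doc = ("Pemerintah mengumumkan kebijakan baru. "
       "Kebijakan itu mengatur pembelajaran daring di sekolah. "
       "Banyak sekolah menyambut baik aturan tersebut.")  # made-up three-sentence document
print(ts.Summary(doc, preprocess=True))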
Example #25
    sentimen_count = df['sentiment'].value_counts()
    sentimen_count

    words_positif = ' '.join(df_positif['tweet_bersih'])
    words_negatif = ' '.join(df_negatif['tweet_bersih'])
    words_netral = ' '.join(df_netral['tweet_bersih'])

    # MORE STOPWORDS
    stop_factory = StopWordRemoverFactory().get_stop_words()
    more_stopword = StopwordsID.more_stopword

    # Merge stopword
    data = stop_factory + more_stopword

    dictionary = ArrayDictionary(data)
    StopWordRemover(dictionary)
    stopwords = data

    mask = np.array(Image.open("shape.png"))

########################################################################################################

    while True:

        choice = displayMenu(menuItems)

        if choice == 1:
            print(df)

        elif choice == 2:
            objects = sentimen_count.index
Example #26
import pandas
import pickle as pickle
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from dateutil.parser import parse
import numpy as np
import re
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory , ArrayDictionary , StopWordRemover

factory = StopWordRemoverFactory()
a = list(factory.get_stop_words())
if "di" in a: a.remove("di")
if "adalah" in a: a.remove("adalah")    
dictionary = ArrayDictionary(a)
stopwordId = StopWordRemover(dictionary)

sf= StemmerFactory()
stemmerId = sf.create_stemmer() 

def date_detection(doc,fuzzy=True):
    try: 
        parse(doc, fuzzy=fuzzy)
        return True

    except ValueError:
        return False
    except :
        return False
    
def all_caps_detection(doc):
Example #27
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory, StopWordRemover, ArrayDictionary
from string_matching_algorithm import *
import re as regex
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

# factory = StopWordRemoverFactory()
newStopFactory = StopWordRemoverFactory().get_stop_words()
newStopFactory.remove("sampai")
newStopFactory.remove("dan")
newStopFactory.append("deadline")
newStopFactory.append("mengenai")
newStopFactory.append("tanggal")
stopword = StopWordRemover(ArrayDictionary(newStopFactory))

# Regexes for month names
JANUARI_REGEX = '[Jj]an(?:uari)?'
FEBRUARI_REGEX = '[Ff]eb(?:ruari)?'
MARET_REGEX = '[Mm]ar(?:et)?'
APRIL_REGEX = '[Aa]pr(?:il)?'
MEI_REGEX = '[Mm]ei'
JUNI_REGEX = '[Jj]uni?'
JULI_REGEX = '[Jj]uli?'
AGUSTUS_REGEX = '[Aa]gu(?:stus)?'
SEPTEMBER_REGEX = '[Ss]ep(?:tember)?'
OKTOBER_REGEX = '[Oo]kt(?:ober)?'
NOVEMBER_REGEX = '[Nn]ov(?:ember)?'
DESEMBER_REGEX = '[Dd]es(?:ember)?'

# Regexes for the date as a whole
ANYTHING = '.*'
DAY_REGEX = '(0[1-9]|[1-2][0-9]|3[0-1])'