def _processTweet(self, tweet):
        tweet = tweet.lower()  # convert text to lower-case
        tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', '',
                       tweet)  # remove URLs
        tweet = re.sub(r'@[^\s]+', '', tweet)  # remove usernames
        tweet = re.sub(r'#([^\s]+)', r'\1', tweet)  # remove the # in #hashtag
        tweet = "".join(
            (char for char in tweet if char not in string.punctuation))  # remove punctuation
        tweet = re.sub(r'\s+', ' ', tweet).strip()  # collapse whitespace
        tweet = re.sub(r"\d", "", tweet)  # remove digits
        # Load the default Sastrawi stopwords
        stop_factory = StopWordRemoverFactory().get_stop_words()
        with open("stopword.txt", "r") as f:
            more_stopword = f.read().split()
        # Merge the stopword lists
        data = stop_factory + more_stopword
        dictionary = ArrayDictionary(data)
        stopword_remover = StopWordRemover(dictionary)

        factory1 = StemmerFactory()  # stemming factory
        stemmer = factory1.create_stemmer()  # create the stemmer

        tweet = stopword_remover.remove(tweet)
        # tweet = stemmer.stem(tweet)  # stem the tweet
        tweet = word_tokenize(tweet)  # tokenize into a list of words
        # return [word for word in tweet if word not in self._stopwords]
        return tweet
Example #2
def __filtering_sastrawi(self, documents):
    stop_factory = StopWordRemoverFactory().get_stop_words()
    list_stop = stop_factory + self.stop_more
    dictionary = ArrayDictionary(list_stop)
    stopwords = StopWordRemover(dictionary)
    stop = stopwords.remove(documents)
    return stop
Example #3
def __init__(self):
    with open('./stopwords.txt') as f:
        more_stopword = f.read().split('\n')

    SWfactory = StopWordRemoverFactory()
    stopword_data = ArrayDictionary(more_stopword + SWfactory.get_stop_words())
    self.stopword = StopWordRemover(stopword_data)
Example #4
def stop_stem_remover(kalimat):
    """
    Remove stop words and apply stemming.

    input:
    kalimat : a sentence from the corpus

    return:
    kalimat : the cleaned sentence
    """
    # drop words that carry little meaning
    # factory = StopWordRemoverFactory()
    stop_factory = StopWordRemoverFactory().get_stop_words()
    add_stop_word = ['dkk', 'et', 'al', 'all']  # manually added stopwords
    stop = stop_factory + add_stop_word
    dicts = ArrayDictionary(stop)

    all_stop = StopWordRemover(dicts)
    kalimat = all_stop.remove(kalimat)

    # stemming (reduce words to their base form)
    stemmerFactory = StemmerFactory()
    stemmer = stemmerFactory.create_stemmer()

    kalimat = stemmer.stem(kalimat)
    return kalimat
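A minimal usage sketch for the function above (the sample sentence is illustrative; exact output depends on the installed Sastrawi dictionaries):

kalimat = "Penelitian ini dilakukan oleh Budi dkk pada tahun lalu"  # illustrative input
print(stop_stem_remover(kalimat))
# default stopwords such as 'ini' and 'oleh', plus the added 'dkk', are removed,
# and the remaining words are reduced to their base forms by the stemmer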
Example #5
def removeStopWord(query):
    factory = StopWordRemoverFactory().get_stop_words()
    more_stopword = ['!', '.', ',', '?']
    data = factory + more_stopword
    dic = ArrayDictionary(data)
    stopword = StopWordRemover(dic)
    return stopword.remove(query)
Example #6
def Stopword(doc):
    stop_factory = StopWordRemoverFactory().get_stop_words()
    more_stopword = ['ini', 'itu', 'the']
    data = stop_factory + more_stopword
    dictionary = ArrayDictionary(data)
    data_str = StopWordRemover(dictionary)
    dokumen = data_str.remove(doc)
    return dokumen
Example #7
def stopword(self):
    stop_factory = StopWordRemoverFactory().get_stop_words()
    more_stopword = ['diatur', 'perjodohan', 'dengan', 'ia', 'bahwa', 'oleh', 'nya']
    data = stop_factory + more_stopword

    dictionary = ArrayDictionary(data)
    self.stopword = StopWordRemover(dictionary)
Example #8
def remove_stopwords_id(kalimat):
    # load the default Sastrawi stopwords
    stop_factory = StopWordRemoverFactory().get_stop_words()
    more_stopword = ['daring', 'online', 'nih']

    # merge the stopword lists
    data = stop_factory + more_stopword

    dictionary = ArrayDictionary(data)
    remover = StopWordRemover(dictionary)
    tokens = nltk.tokenize.word_tokenize(remover.remove(kalimat))
    return " ".join(tokens)
Example #9
    def __stopward_removal(self, tokens):
        stop_factory = StopWordRemoverFactory().get_stop_words()

        more_stopword = ['dong', 'atuh', 'plis']

        data = stop_factory + more_stopword

        dictionary = ArrayDictionary(data)

        str_remove = StopWordRemover(dictionary)

        tokens = word_tokenize(str_remove.remove(' '.join(tokens)))

        return tokens
Example #10
def stopword(text):
    # Load the default Sastrawi stopwords
    stop_factory = StopWordRemoverFactory().get_stop_words()
    print(stop_factory)
    more_stopword = ['diatur', 'perjodohan']

    # Merge the stopword lists
    data = stop_factory + more_stopword

    dictionary = ArrayDictionary(data)
    stopword_remover = StopWordRemover(dictionary)

    hasil = stopword_remover.remove(text)
    # print(hasil)

    return hasil
Example #11
def pre_processing(text):
    stopwords = pd.read_csv('stopwordbahasa.csv', names=['stopword'])['stopword'].tolist()

    stem = StemmerFactory() 
    stemmer = stem.create_stemmer()
    factory = StopWordRemoverFactory()
    stopword = StopWordRemover(ArrayDictionary(factory.get_stop_words() + stopwords))

    clean_str = text.lower() # lowercase
    clean_str = re.sub(r"(?:\@|#|https?\://)\S+", " ", clean_str) # eliminate username, url, hashtags
    clean_str = re.sub(r'&amp;', '', clean_str) # remove &amp; as it equals &
    clean_str = re.sub(r'[^\w\s]',' ', clean_str) # remove punctuation
    clean_str = re.sub(r'[\s\n\t\r]+', ' ', clean_str) # remove extra whitespace
    clean_str = clean_str.strip() # trim
    clean_str = " ".join([stemmer.stem(word) for word in clean_str.split()]) # stem
    clean_str = stopword.remove(clean_str) # remove stopwords
    return clean_str
Example #12
def preprocess_sentence(self, q=""):
    # tokenize, lowercase, remove stopwords, stem
    default_stopwords = StopWordRemoverFactory().get_stop_words()
    additional_stopwords = [
        "(", ")", "senin", "selasa", "rabu", "kamis", "jumat", "sabtu",
        "minggu"
    ]
    dictionary = ArrayDictionary(default_stopwords + additional_stopwords)
    stopword = StopWordRemover(dictionary)
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    tokenizer = RegexpTokenizer(r'\w+')
    res = " ".join(tokenizer.tokenize(q))
    res = res.lower()
    res = stopword.remove(res)
    res = stemmer.stem(res)
    return res
Example #13
def generateStopWords(pat, txt):
    # Load the default Sastrawi stopwords
    stop_factory = StopWordRemoverFactory().get_stop_words()
    more_stopwords = [' ?', '?', ' .', '.', ' ,', ',']
    # Merge the stopword lists
    data = stop_factory + more_stopwords

    dictionary = ArrayDictionary(data)
    remover = StopWordRemover(dictionary)

    temppat = remover.remove(pat)
    if temppat == '' or temppat is None:
        temppat = pat

    temptxt = remover.remove(txt)
    if temptxt == '' or temptxt is None:
        temptxt = txt

    return temppat, temptxt
Example #14
def remove_stopwords(self,
                     csv_src="",
                     csv_dest="",
                     cols_to_clean=["KOMPETENSI"],
                     sep=";"):
    # factory = StopWordRemoverFactory()
    default_stopwords = StopWordRemoverFactory().get_stop_words()
    additional_stopwords = [
        "(", ")", "senin", "selasa", "rabu", "kamis", "jumat", "sabtu",
        "minggu"
    ]
    dictionary = ArrayDictionary(default_stopwords + additional_stopwords)
    stopword = StopWordRemover(
        dictionary
    )  # factory.create_stop_word_remover(dictionary=dictionary)
    tokenizer = RegexpTokenizer(r'\w+')
    df = pd.read_csv(csv_src, sep=sep)
    for c in cols_to_clean:
        df[c] = df[c].map(lambda x: " ".join(tokenizer.tokenize(x)))  # keep only words, drop symbols
        df[c] = df[c].map(lambda x: stopword.remove(x))  # remove stop words
    df.to_csv(csv_dest, sep=sep, index=None)
    print("cleaned %d rows" % len(df))
Example #15
    exit()

start = time.time()
os.system('cls')

# create stemmer
factory = StemmerFactory()
stemmer = factory.create_stemmer()
# create stopword
stop_factory = StopWordRemoverFactory().get_stop_words()
more_stopword = ['halaman', 'kompas', 'com', 'all', '-']

# Merge stopword
stop_factory += more_stopword

dictionary = ArrayDictionary(stop_factory)
stopword = StopWordRemover(dictionary)

for i in range(total_documents):
    print(i, "/", total_documents, "documents cleaned")
    print("Cleaning...")
    try:
        with open("download/" + site + "/scrapped/" + site + "-" + str(i + 1) +
                  "-bersih.html",
                  'r',
                  encoding="utf8") as f:
            soup = bs(f, 'html.parser')

        url = soup.url.text
        title = soup.title.text
        top = soup.top.text
Example #16
    sentimen_count = df['sentiment'].value_counts()
    sentimen_count

    words_positif = ' '.join(df_positif['tweet_bersih'])
    words_negatif = ' '.join(df_negatif['tweet_bersih'])
    words_netral = ' '.join(df_netral['tweet_bersih'])

    # MORE STOPWORDS
    stop_factory = StopWordRemoverFactory().get_stop_words()
    more_stopword = StopwordsID.more_stopword

    # Merge stopword
    data = stop_factory + more_stopword

    dictionary = ArrayDictionary(data)
    StopWordRemover(dictionary)  # note: the remover is constructed here but never bound or used in this fragment
    stopwords = data

    mask = np.array(Image.open("shape.png"))

########################################################################################################

    while True:

        choice = displayMenu(menuItems)

        if choice == 1:
            print(df)

        elif choice == 2:
Example #17
def __init__(self, tweet):
    self.tweet = tweet
    stop_factory = StopWordRemoverFactory().get_stop_words()
    stop_factory = stop_factory + self.additional_stopwords
    dictionary = ArrayDictionary(stop_factory)
    self.strword = StopWordRemover(dictionary)
Example #18
def createStopword(more_stopword=[]):
    stop_factory = StopWordRemoverFactory().get_stop_words()
    new_stop_word = stop_factory + more_stopword
    dictionary = ArrayDictionary(new_stop_word)
    stopword = StopWordRemover(dictionary)
    return stopword
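Because createStopword returns a reusable StopWordRemover, it is cheaper to build it once and call remove() on many documents than to rebuild the ArrayDictionary for every call; a small sketch (the extra stopwords and documents are illustrative):

remover = createStopword(['nih', 'dong'])
dokumen = ["ini contoh dokumen nih", "ayo belajar dong bersama"]
bersih = [remover.remove(d) for d in dokumen]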
Example #19
import Sastrawi as sts
import nltk
from bs4 import BeautifulSoup
from nltk.tokenize import WordPunctTokenizer
tok = WordPunctTokenizer()
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory, StopWordRemover, ArrayDictionary
stop_word = StopWordRemoverFactory().get_stop_words()

more_stopword = [
    'yg', 'ajah', 'iya', 'mba', 'mas', 'kak', 'pak', 'pahi', 'mah', 'muehehe',
    'men', 'kehfine', 'alhamdulilah', 'alhamdulillah', 'nih', 'om', 'selamat',
    'sama', 'sabar', 'gak', 'yak', 'semoga',
    'bu', 'adik', 'omen', 'tumben', 'tp', 'sy', 'kmu', 'jg', 'kyk', 'dll'
]
d_sword = stop_word + more_stopword
dictionary = ArrayDictionary(d_sword)
swr = StopWordRemover(dictionary)

pat1 = r'@[A-Za-z0-9]+'
pat2 = r'https?://[A-Za-z0-9./]+'
pat3 = '(RT)'
combined_pat = r'|'.join((pat1, pat2, pat3))
df_t = df['text']


def tweet_cleaner(text):
    soup = BeautifulSoup(text, 'lxml')
    souped = soup.get_text()
    stripped = re.sub(combined_pat, '', souped)
    try:
        clean = stripped.decode("utf-8").replace(u"\ufffd", "?")  # note: .decode() assumes Python 2 bytes; on Python 3 `stripped` is already str
Example #20
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory, StopWordRemover, ArrayDictionary
from string_matching_algorithm import *
import re as regex

# factory = StopWordRemoverFactory()
newStopFactory = StopWordRemoverFactory().get_stop_words()
newStopFactory.remove("sampai")
newStopFactory.remove("dan")
newStopFactory.append("deadline")
newStopFactory.append("mengenai")
newStopFactory.append("tanggal")
stopword = StopWordRemover(ArrayDictionary(newStopFactory))

# Regex untuk bulan
JANUARI_REGEX = '[Jj]an(?:uari)?'
FEBRUARI_REGEX = '[Ff]eb(?:ruari)?'
MARET_REGEX = '[Mm]ar(?:et)?'
APRIL_REGEX = '[Aa]pr(?:il)?'
MEI_REGEX = '[Mm]ei'
JUNI_REGEX = '[Jj]uni?'
JULI_REGEX = '[Jj]uli?'
AGUSTUS_REGEX = '[Aa]gu(?:stus)?'
SEPTEMBER_REGEX = '[Ss]ep(?:tember)?'
OKTOBER_REGEX = '[Oo]kt(?:ober)?'
NOVEMBER_REGEX = '[Nn]ov(?:ember)?'
DESEMBER_REGEX = '[Dd]es(?:ember)?'

# Regex untuk keutuhan tanggal
ANYTHING = '.*'
DAY_REGEX = '(0[1-9]|[1-2][0-9]|3[0-1])'
Example #21
ct.set_model_file('all_indo_man_tag_corpus_model.crf.tagger')

tool = victorinox()
population1_dict = {}
population2_dict = {}
population_root_path = r"corpus/population"
population_files = glob(os.path.join(population_root_path, "**/*.txt"),
                        recursive=True)
tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
factory = StemmerFactory()
stemmer = factory.create_stemmer()
default_stopwords = StopWordRemoverFactory().get_stop_words()
additional_stopwords = [
    "(", ")", "senin", "selasa", "rabu", "kamis", "jumat", "sabtu", "minggu"
]
dictionary = ArrayDictionary(default_stopwords + additional_stopwords)
id_stopword = StopWordRemover(dictionary)
en_stopword = set(stopwords.words('english'))
en_stemmer = PorterStemmer()


def remove_numbers(text):
    words = tokenizer.tokenize(text)
    return " ".join(words)


def remove_punctuation(text):
    words = text.split()
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in words]
    return " ".join(stripped)
Example #22
        'melalui', 'tentang', 'februari', 'dilakukan', 'pusat', 'selatan',
        'atas', 'data', 'lp', 'dalam', 'juni', 'adanya', 'mengenai', 'jkt',
        'atau', 'jawaban', 'tinggi', 'telah', 'maret', 'bapak', 'oktober',
        'januari', 'juli', 'mei', 'september', 'xi', 'agung', 'ada', 'dengan',
        'kedua', 'di', 'selatan', 'nama', 'ada', 'terkait', 'tentang', 'yang',
        'nomor', 'tidak', 'dengan', 'terhadap', 'sept', 'november', 'nov',
        'dalam', 'atau', 'bapak', 'nama', 'kami', 'ada', 'melalui',
        'assalamualaikum', 'wr', 'wb', 'jp', 'lp', 'md', 'mh', 'melakukuan',
        'sbg', 'selasa',
        'oleh', 'segera', 'tahun', 'melakukan', 'oleh', 'agustus', 'atau',
        'dki', 'kab', 'belum', 'untuk', 'adanya', 'kecamatan', 'yang', 'yg',
        'memberikan', 'mengenai', 'ayat', 'tanggal', 'dan', 'bukan', 'dab',
        'dan', 'ke', 'qq'
    ]
    sw = stopword1 + more_stopwords
    dictionary = ArrayDictionary(sw)
    strw = StopWordRemover(dictionary)
    removestop = []
    for line in Wtd:
        word_token = nltk.word_tokenize(line)
        word_token = [word for word in word_token if not word in sw]
        removestop.append(" ".join(word_token))
    doc_clean = removestop

    kata1 = {
        "adatno": "adat",
        "admnistrasi": "administrasi",
        "ahali": "ahli",
        "agutus": "agustus",
        "asset": "aset",
        "bantenh": "banten",
Example #23
import pandas
import pickle
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from dateutil.parser import parse
import numpy as np
import re
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory, ArrayDictionary, StopWordRemover

factory = StopWordRemoverFactory()
a = list(factory.get_stop_words())
if "di" in a: a.remove("di")
if "adalah" in a: a.remove("adalah")    
dictionary = ArrayDictionary(a)
stopwordId = StopWordRemover(dictionary)

sf = StemmerFactory()
stemmerId = sf.create_stemmer()

def date_detection(doc, fuzzy=True):
    try: 
        parse(doc, fuzzy=fuzzy)
        return True

    except ValueError:
        return False
    except Exception:
        return False
    
def all_caps_detection(doc):
Example #24
def api_echo():
    if request.method == 'POST':

        # create stemmer
        factory = StemmerFactory()
        stemmer = factory.create_stemmer()
        factory = StopWordRemoverFactory()

        more_stopword = []
        # add stopword
        with open('dataset/stopword.csv') as csvfile:
            readCSV = csv.reader(csvfile, delimiter=',')
            for row in readCSV:
                more_stopword.append(row[0])

        dictionary = ArrayDictionary(more_stopword)
        stopword_remover = StopWordRemover(dictionary)

        newsTrainer = Trainer(tokenizer)

        kesehatan = []
        konsultasi = []
        marketing = []

        with open("dataset/kesehatan.txt", "r") as ins:
            for line in ins:
                kesehatan.append({
                    'text': line.rstrip(),
                    'category': 'kesehatan'
                })

        with open("dataset/konsultasi.txt", "r") as ins:
            for line in ins:
                konsultasi.append({
                    'text': line.rstrip(),
                    'category': 'konsultasi'
                })

        with open("dataset/marketing.txt", "r") as ins:
            for line in ins:
                marketing.append({
                    'text': line.rstrip(),
                    'category': 'marketing'
                })

        # You need to train the system passing each text one by one to the trainer module.
        newsSet = kesehatan + konsultasi + marketing

        for news in newsSet:
            newsTrainer.train(news['text'], news['category'])

        # When you have sufficient trained data, you are almost done and can start to use
        # a classifier.
        newsClassifier = Classifier(newsTrainer.data, tokenizer)

        query = request.form['query']
        #query = "Apa saja level bonus yang didapat bagi seorang agen?"

        # stemming and remove stop word on Query
        out = stemmer.stem(query)
        out = stopword_remover.remove(out)
        classification = newsClassifier.classify(out)

        # the classification variable holds the detected categories sorted
        #return classification[0][0]
        return jsonify(classification)