Code Example #1
# PySastrawi imports required by this snippet
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory, StopWordRemover, ArrayDictionary


class Stemmer:
    def __init__(self):
        self._build_stemmer()
        self._build_stopword_remover()

    def _build_stemmer(self):
        # Indonesian stemmer from PySastrawi
        self.factory = StemmerFactory()
        self.stemmer = self.factory.create_stemmer()

    def _build_stopword_remover(self):
        # default stopword list plus a few custom entries
        stop_factory = StopWordRemoverFactory().get_stop_words()
        more_stopword = ['diatur', 'perjodohan', 'dengan', 'ia', 'bahwa', 'oleh', 'nya']
        data = stop_factory + more_stopword

        dictionary = ArrayDictionary(data)
        self.stopword = StopWordRemover(dictionary)

    def stem(self, sentence=None):
        return self.stemmer.stem(sentence)

    def remove(self, sentence=None):
        return self.stopword.remove(sentence)
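A minimal usage sketch for the wrapper above (not part of the original snippet); it assumes the PySastrawi imports shown in the class, and the expected stemmer behaviour follows the Sastrawi README example.

# Hypothetical driver code for the Stemmer wrapper above.
if __name__ == "__main__":
    pre = Stemmer()
    kalimat = "Perekonomian Indonesia sedang dalam pertumbuhan yang membanggakan"
    print(pre.stem(kalimat))    # inflected words reduced to base forms, e.g. 'perekonomian' -> 'ekonomi'
    print(pre.remove(kalimat))  # tokens found in the stopword dictionary (e.g. 'yang') are dropped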
Code Example #2
def removeStopWord(query):
    factory = StopWordRemoverFactory().get_stop_words()
    more_stopword = ['!', '.', ',', '?']
    data = factory + more_stopword
    dic = ArrayDictionary(data)
    stopword = StopWordRemover(dic)
    return stopword.remove(query)
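A hedged usage note for removeStopWord above (the call below is illustrative, not from the original project): StopWordRemover matches whitespace-delimited tokens, so the added punctuation entries only disappear when they stand alone.

# Illustrative call. 'yang' is in the default Sastrawi stopword list and the standalone '?'
# matches a custom entry; punctuation attached to a word (e.g. 'bagus?') would not be
# removed, since matching is done per whitespace-separated token.
print(removeStopWord("buku yang baru itu bagus ?"))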
Code Example #3
File: utils.py  Project: yighu/ML_Project
def stop_stem_remover(kalimat):
    """
    Remove stop words and perform stemming.

    input:
    kalimat : a sentence from the corpus

    return:
    kalimat
    """
    # drop words that carry little meaning
    #factory = StopWordRemoverFactory()
    stop_factory = StopWordRemoverFactory().get_stop_words()
    add_stop_word = ['dkk', 'et', 'al', 'all']  # manually added stopwords
    stop = stop_factory + add_stop_word
    dicts = ArrayDictionary(stop)

    all_stop = StopWordRemover(dicts)
    kalimat = all_stop.remove(kalimat)

    # stemming (reduce words to their base form)
    stemmerFactory = StemmerFactory()
    stemmer = stemmerFactory.create_stemmer()

    kalimat = stemmer.stem(kalimat)
    return kalimat
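An illustrative call for stop_stem_remover (not from the original project); it assumes the same Sastrawi imports as the snippets above, and the comment describes the expected effect based on Sastrawi's documented stemming behaviour.

teks = "Perekonomian Indonesia sedang dalam pertumbuhan yang membanggakan dkk"
print(stop_stem_remover(teks))
# stopwords such as 'yang' and the custom entry 'dkk' are dropped first, then the remaining
# words are stemmed, e.g. 'perekonomian' -> 'ekonomi', 'pertumbuhan' -> 'tumbuh'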
Code Example #4
    def _processTweet(self, tweet):
        tweet = tweet.lower()  # convert text to lower-case
        tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', '',
                       tweet)  # remove URLs
        tweet = re.sub(r'@[^\s]+', '', tweet)  # remove usernames
        tweet = re.sub(r'#([^\s]+)', r'\1', tweet)  # remove the # in #hashtag
        tweet = "".join(
            (char for char in tweet if char not in string.punctuation))  # strip punctuation
        tweet = re.sub(r'\s+', ' ', tweet).strip()  # collapse whitespace
        tweet = re.sub(r"\d", "", tweet)  # drop digits
        # load the default Sastrawi stopword list
        stop_factory = StopWordRemoverFactory().get_stop_words()
        with open("stopword.txt", "r") as f:
            more_stopword = f.read().split()
        # merge the stopword lists
        data = stop_factory + more_stopword
        dictionary = ArrayDictionary(data)
        stop_remover = StopWordRemover(dictionary)  # avoid shadowing the built-in `str`

        factory1 = StemmerFactory()  # stemming factory
        stemmer = factory1.create_stemmer()  # build the stemmer (only used by the commented line below)
        #
        tweet = stop_remover.remove(tweet)
        # tweet = stemmer.stem(tweet)  # stem the tweet
        tweet = word_tokenize(tweet)  # tokenize the cleaned tweet into a list of words
        # return [word for word in tweet if word not in self._stopwords]
        return tweet
Code Example #5
def Stopword(doc):
    stop_factory = StopWordRemoverFactory().get_stop_words()
    more_stopword = ['ini', 'itu', 'the']
    data = stop_factory + more_stopword
    dictionary = ArrayDictionary(data)
    data_str = StopWordRemover(dictionary)
    dokumen = data_str.remove(doc)
    return dokumen
Code Example #6
File: BM.py  Project: pandyakaa/Simple-ChatBot
def generateStopWords(pat, txt):
    # load the default Sastrawi stopword list
    stop_factory = StopWordRemoverFactory().get_stop_words()
    more_stopwords = [' ?', '?', ' .', '.', ' ,', ',']
    # merge the stopword lists
    data = stop_factory + more_stopwords

    dictionary = ArrayDictionary(data)
    remover = StopWordRemover(dictionary)  # avoid shadowing the built-in `str`

    # fall back to the original strings if removal leaves nothing behind
    temppat = remover.remove(pat)
    if not temppat:
        temppat = pat

    temptxt = remover.remove(txt)
    if not temptxt:
        temptxt = txt

    return temppat, temptxt
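An illustrative call for generateStopWords (the pattern/text pair below is made up); it shows the fallback to the original strings when stopword removal would empty them.

pat, txt = generateStopWords("yang mana ?", "jadwal kuliah ada di papan pengumuman .")
print(pat)  # if every token were removed, the original pattern would be returned unchanged
print(txt)  # otherwise the cleaned text (standalone '.'/'?' and known stopwords dropped) is returned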
Code Example #7
    def __stopward_removal(self, tokens):
        # default Sastrawi stopwords plus a few informal filler words
        stop_factory = StopWordRemoverFactory().get_stop_words()
        more_stopword = ['dong', 'atuh', 'plis']
        data = stop_factory + more_stopword

        dictionary = ArrayDictionary(data)
        str_remove = StopWordRemover(dictionary)

        # join the tokens, remove stopwords, then re-tokenize
        tokens = word_tokenize(str_remove.remove(' '.join(tokens)))
        return tokens
Code Example #8
def stopword(text):
    # load the default Sastrawi stopword list
    stop_factory = StopWordRemoverFactory().get_stop_words()
    # print(stop_factory)  # debug: inspect the default stopword list
    more_stopword = ['diatur', 'perjodohan']

    # merge the stopword lists
    data = stop_factory + more_stopword

    dictionary = ArrayDictionary(data)
    remover = StopWordRemover(dictionary)  # avoid shadowing the built-in `str`

    hasil = remover.remove(text)
    # print(hasil)

    return hasil
Code Example #9
    def preprocess_sentence(self, q=""):
        # tokenize, lower-case, remove stopwords, stem
        default_stopwords = StopWordRemoverFactory().get_stop_words()
        additional_stopwords = [
            "(", ")", "senin", "selasa", "rabu", "kamis", "jumat", "sabtu",
            "minggu"
        ]
        dictionary = ArrayDictionary(default_stopwords + additional_stopwords)
        stopword = StopWordRemover(dictionary)
        factory = StemmerFactory()
        stemmer = factory.create_stemmer()
        tokenizer = RegexpTokenizer(r'\w+')
        res = " ".join(tokenizer.tokenize(q))
        res = res.lower()
        res = stopword.remove(res)
        res = stemmer.stem(res)
        return res
Code Example #10
def pre_processing(text):
    stopwords = pd.read_csv('stopwordbahasa.csv', names=['stopword'])['stopword'].tolist()

    stem = StemmerFactory() 
    stemmer = stem.create_stemmer()
    factory = StopWordRemoverFactory()
    stopword = StopWordRemover(ArrayDictionary(factory.get_stop_words() + stopwords))

    clean_str = text.lower() # lowercase
    clean_str = re.sub(r"(?:\@|#|https?\://)\S+", " ", clean_str) # eliminate username, url, hashtags
    clean_str = re.sub(r'&amp;', '', clean_str) # remove &amp; as it equals &
    clean_str = re.sub(r'[^\w\s]',' ', clean_str) # remove punctuation
    clean_str = re.sub(r'[\s\n\t\r]+', ' ', clean_str) # remove extra space
    clean_str = clean_str.strip() # trim
    clean_str = " ".join([stemmer.stem(word) for word in clean_str.split()]) # stem
    clean_str = stopword.remove(clean_str) # remove stopwords
    return clean_str
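An illustrative call for pre_processing (the tweet below is made up); like the original function, it expects 'stopwordbahasa.csv' next to the script.

contoh = "@user Perekonomian Indonesia sedang bertumbuh!!! lihat https://contoh.example #ekonomi"
print(pre_processing(contoh))
# mentions, URLs and hashtags are stripped along with punctuation, each remaining word is
# stemmed, and stopwords from both lists are removed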
Code Example #11
    def remove_stopwords(self,
                         csv_src="",
                         csv_dest="",
                         cols_to_clean=["KOMPETENSI"],
                         sep=";"):
        #factory = StopWordRemoverFactory()
        default_stopwords = StopWordRemoverFactory().get_stop_words()
        additional_stopwords = [
            "(", ")", "senin", "selasa", "rabu", "kamis", "jumat", "sabtu",
            "minggu"
        ]
        dictionary = ArrayDictionary(default_stopwords + additional_stopwords)
        stopword = StopWordRemover(
            dictionary
        )  #factory.create_stop_word_remover(dictionary = dictionary)
        tokenizer = RegexpTokenizer(r'\w+')
        df = pd.read_csv(csv_src, sep=sep)
        for c in cols_to_clean:
            df[c] = df[c].map(lambda x: " ".join(tokenizer.tokenize(x))
                              )  # keep only word tokens, dropping symbols
            df[c] = df[c].map(lambda x: stopword.remove(x))  # remove stop words
        df.to_csv(csv_dest, sep=sep, index=False)
        print("cleaned %d rows" % len(df))
Code Example #12
def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2 * ((precision * recall) / (precision + recall + K.epsilon()))

# fragment of the original snippet's Keras metrics list: ", f1_m, precision_m, recall_m"
label = to_categorical(label)
print(label)
print(label.shape)
data2 = []
for x in data:
    # case folding
    x2 = x.lower()
    # remove punctuation
    bersih = x2.translate(str.maketrans("", "", string.punctuation))
    # stopword removal with PySastrawi
    stop = stopword.remove(bersih)
    # stemming with PySastrawi
    katadasar = stemmer.stem(stop)
    # append to the new list data2
    data2.append(katadasar)
    # ready for the next processing step

X_train, X_test, y_train, y_test = train_test_split(
    data2,
    label,
    test_size=0.2,
    random_state=1)

# fit the tokenizer on the training data only
tokenizer.fit_on_texts(X_train)
# convert the training texts
Code Example #13
    except:
        clean = stripped
    letters_only = re.sub("[^a-zA-Z]", " ", clean)
    lower_case = letters_only.lower()
    words = tok.tokenize(lower_case)
    #filtered_words = [stop_word.remove(words)]
    return (" ".join(words)).strip()


clean_tweet_texts = []
for i in range(len(df_t)):
    clean_tweet_texts.append(tweet_cleaner(df_t[i]))

clean_tweet = []
for i in range(len(clean_tweet_texts)):
    clean_tweet.append(swr.remove(clean_tweet_texts[i]))

#load vocabulary
df['text'] = clean_tweet
v_pkl = 'df_text.pkl'
v_open = open(v_pkl, 'rb')
vocab = pickle.load(v_open)

# Import the TF-IDF vectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
Tfidf = TfidfVectorizer()

# vectorize the vocabulary
vector = Tfidf.fit_transform(vocab)

# prediction vector
Code Example #14
File: tes.py  Project: resaUndefined/machine_learning
    "Kepada, Yth. Pemerintah Provinsi DKI Jakarta, PT.Metaliska yang telah mendirikan bangunan di atas tanah bantaran kali (melanggar GSK) 300 m dari terminal bus Pulogadung arah ke Timur, mohon ditindak Terima kasih"
]
# kalimat = "Kepada, Yth. Pemerintah Provinsi DKI Jakarta, PT. Metaliska yang telah mendirikan bangunan di atas tanah bantaran kali (melanggar GSK) 300 m dari terminal bus Pulogadung arah ke Timur, mohon ditindak Terima kasih"
kalimat = input("Masukkan Kalimat : ")
print('==================================')
print('Kalimat awal/asli : ')
print(str(kalimat) + '\n')
print("1. hasil proses case folding :")
print(kalimat.lower())
print('\n')
print('2. hasil proses menghilangkan tanda baca/filtering :')
hasil = kalimat.lower().translate(str.maketrans("", "", string.punctuation))
print(hasil)
print('\n')
print("3. hasil proses stopping word : ")
stop = stopword.remove(hasil)
print(stop)
print('\n')
print('4. hasil proses stemming : ')
katadasar = stemmer.stem(stop)
print(katadasar)
print('\n')
print('5. hasil proses tokenisasi : ')
token = [katadasar]
tokenizer.fit_on_texts(token)
seq = tokenizer.texts_to_sequences(token)

print(tokenizer.word_index)
print(seq)

# print(kalimat.lower())
Code Example #15
class TweetProcessing():
    user_pattern = r'(?<=^|(?<=[^a-zA-Z0-9-_\.]))@([A-Za-z]+[A-Za-z0-9-_]+)'
    url_pattern = r'(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9]\.[^\s]{2,})'
    url_pattern2 = r'https://t.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9]\.[^\s]{2,}'
    #url_pattern = '^(http:\/\/www\.|https:\/\/www\.|http:\/\/|https:\/\/)?[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(:[0-9]{1,5})?(\/.*)?$'
    digit_pattern = r'^\d+\s|\s\d+\s|\s\d+$'
    rt_pattern = r'RT\s:*'
    additional_stopwords = [
        'cc ', 'cc:', 'cc.', 'a', 'd', 'g', 'e', 'y', 'ga', 'gmn', 'tdk',
        'nah', 'sih', 'blm', 'ni', 'di', 'sy', 'sya', 'rt', 'jl', 'jl.', 'jln',
        'jln.', 'no', 'no.', 'dlm', 'tx', 'thx', 'he', 'd', 'k', 'sm'
    ]

    def __init__(self, tweet):
        self.tweet = tweet
        stop_factory = StopWordRemoverFactory().get_stop_words()
        stop_factory = stop_factory + self.additional_stopwords
        dictionary = ArrayDictionary(stop_factory)
        self.strword = StopWordRemover(dictionary)

    def set_tweet(self, tweet):
        self.tweet = tweet

    def get_tweet(self):
        return self.tweet

    def clean_up_tweet_usernames(self):
        return re.sub(self.user_pattern, '', self.tweet)

    def clean_up_tweet_url(self):
        self.tweet = re.sub(self.url_pattern, '', self.tweet)
        self.tweet = self.tweet.replace("https://t.?", '')
        self.tweet = self.tweet.replace("https://t?", '')
        self.tweet = self.tweet.replace("https://?", '')
        return re.sub(self.url_pattern2, '', self.tweet)

    def clean_up_tweet_rt(self):
        return re.sub(self.rt_pattern, '', self.tweet)

    def clean_up_tweet_digits(self):
        self.tweet = ''.join([i for i in str(self.tweet) if not i.isdigit()])
        return self.tweet
        #return re.sub(self.digit_pattern,'', self.tweet)

    def remove_stop_words(self):
        self.tweet = self.strword.remove(self.tweet)
        return self.tweet

    def stemming_tweet(self):
        factory = StemmerFactory()
        stemmer = factory.create_stemmer()
        self.tweet = stemmer.stem(self.tweet)
        return self.tweet

    def clean_up_tweet(self):
        self.tweet = self.tweet.lower()
        #self.tweet = self.clean_up_tweet_usernames()
        self.tweet = self.clean_up_tweet_url()
        self.tweet = self.clean_up_tweet_rt()
        self.tweet = self.clean_up_tweet_digits()
        #self.tweet = self.tweet.replace('.',' ')
        self.tweet = self.tweet.replace(',', ' ')
        self.tweet = self.tweet.replace('?', '')
        self.tweet = self.tweet.replace('  ', ' ')
        self.tweet = self.stemming_tweet()
        self.tweet = self.remove_stop_words()
        self.tweet = self.tweet.translate(str.maketrans('', '', string.punctuation))  # strip remaining punctuation
        if self.tweet.startswith('"') and self.tweet.endswith('"'):
            self.tweet = self.tweet[1:-1]

        return self.tweet
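A usage sketch for the TweetProcessing class above (the tweet is made up, not from the original project); it assumes the re, string and PySastrawi imports the class relies on.

tp = TweetProcessing("RT : Jalan Sudirman macet 123 banget !! https://t.co/abc kok bisa?")
print(tp.clean_up_tweet())
# lower-cased, RT marker / URLs / digits removed, then stemmed and stopword-filtered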
Code Example #16
class TextSummarization():
    def __init__(self):
        with open('./stopwords.txt') as f:
            more_stopword=f.read().split('\n')
        
        SWfactory = StopWordRemoverFactory()
        stopword_data = ArrayDictionary(more_stopword+SWfactory.get_stop_words())
        self.stopword = StopWordRemover(stopword_data)

    def Preprocessing(self, text):        
        clean = re.sub(r"#[^\W]+|@[^\W]+|http[^*\s]+|<[^>]*>|[0-9]", '', text)  # remove hashtags, mentions, URLs, HTML tags and digits
        emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', clean)  # keep emoticons
        text = (re.sub(r'[\W]+', ' ', clean.lower()) +  # case folding
                ' '.join(emoticons).replace('-', ''))
        result=''
        for kata in text.split(): 
            stop = self.stopword.remove(kata) #Stopword 
            result += f"{stop} " if stop else ''
        return result

    def Summary(self, doc, preprocess=False):
        doc_tokenizer = PunktSentenceTokenizer()
        sentences_list = doc_tokenizer.tokenize(doc)

        clean_sentences_list=[] 
        for sentence in sentences_list:
            clean_sentences_list.append(self.Preprocessing(sentence)) 

        cv = CountVectorizer()
        cv_matrix = cv.fit_transform(clean_sentences_list if preprocess else sentences_list)
        normal_matrix = TfidfTransformer().fit_transform(cv_matrix)

        tfidf=normal_matrix.toarray()
        res_graph = normal_matrix * normal_matrix.T  # similarity / adjacency matrix

        nx_graph= from_scipy_sparse_matrix(res_graph)
        pageranks = pagerank(nx_graph)

        sentence_array = sorted(((pageranks[i], s) for i, s in enumerate(sentences_list)), reverse=True)
        sentence_array = np.asarray(sentence_array)

        rank_max = float(sentence_array[0][0])
        rank_min = float(sentence_array[len(sentence_array) - 1][0])

        temp_array = []

        # If all sentences have the same rank,
        # taking any sentence will give the summary, say the first sentence
        flag = 0
        if rank_max - rank_min == 0:
            temp_array.append(0)
            flag = 1

        # If the sentence has different ranks
        if flag != 1:
            for i in range(0, len(sentence_array)):
                temp_array.append((float(sentence_array[i][0]) - rank_min) / (rank_max - rank_min))
        
        # Threshold calculation:
        # we take the mean of the normalized scores; any sentence scoring above the
        # threshold is kept for the summary (the original +0.2 offset is left disabled)
        threshold = (sum(temp_array) / len(temp_array))  # + 0.2

        # Keep the sentences that satisfy the criterion of scoring above the threshold
        sentence_list = []
        if len(temp_array) > 1:
            for i in range(0, len(temp_array)):
                if temp_array[i] > threshold:
                    sentence_list.append(sentence_array[i][1])
        else:
            sentence_list.append(sentence_array[0][1])

        summary = " ".join(str(x) for x in sentence_list)

        return summary
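A usage sketch for TextSummarization (the document below is made up); it assumes './stopwords.txt' exists and that from_scipy_sparse_matrix and pagerank come from networkx 2.x, as the original Summary() method expects.

ts = TextSummarization()
doc = ("Pemerintah mengumumkan kebijakan ekonomi baru. "
       "Kebijakan itu menuai banyak tanggapan dari masyarakat. "
       "Sebagian pengamat menilai kebijakan tersebut akan mendorong pertumbuhan.")
print(ts.Summary(doc, preprocess=True))  # prints the sentences ranked above the threshold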
Code Example #17
def api_echo():
    if request.method == 'POST':

        # create the stemmer
        factory = StemmerFactory()
        stemmer = factory.create_stemmer()

        # build the custom stopword list
        more_stopword = []
        with open('dataset/stopword.csv') as csvfile:
            readCSV = csv.reader(csvfile, delimiter=',')
            for row in readCSV:
                more_stopword.append(row[0])

        dictionary = ArrayDictionary(more_stopword)
        stop_word_remover = StopWordRemover(dictionary)  # avoid shadowing the built-in `str`

        newsTrainer = Trainer(tokenizer)

        kesehatan = []
        konsultasi = []
        marketing = []

        with open("dataset/kesehatan.txt", "r") as ins:
            for line in ins:
                kesehatan.append({
                    'text': line.rstrip(),
                    'category': 'kesehatan'
                })

        with open("dataset/konsultasi.txt", "r") as ins:
            for line in ins:
                konsultasi.append({
                    'text': line.rstrip(),
                    'category': 'konsultasi'
                })

        with open("dataset/marketing.txt", "r") as ins:
            for line in ins:
                marketing.append({
                    'text': line.rstrip(),
                    'category': 'marketing'
                })

        # You need to train the system passing each text one by one to the trainer module.
        newsSet = kesehatan + konsultasi + marketing

        for news in newsSet:
            newsTrainer.train(news['text'], news['category'])

        # When you have sufficient trained data, you are almost done and can start to use
        # a classifier.
        newsClassifier = Classifier(newsTrainer.data, tokenizer)

        query = request.form['query']
        #query = "Apa saja level bonus yang didapat bagi seorang agen?"

        # stem the query and remove stop words
        out = stemmer.stem(query)
        out = stop_word_remover.remove(out)
        classification = newsClassifier.classify(out)

        # the classification variable holds the detected categories sorted
        #return classification[0][0]
        return jsonify(classification)