Example #1
def pre_processing(doc):
	datas = {}

	# Sastrawi stemmer
	factory = StemmerFactory()
	stemmer = factory.create_stemmer()

	# stopWords = nltk.corpus.stopwords.words('english') + ['yang','dengan']
	# load the list of words to be removed from the stopwords.txt file
	stopwords = open('stopwords.txt', 'r').read().split()

	# stopword removal, stemming, and tokenization
	for index, kalimat in enumerate(doc):
		data = []
		# split the sentence into tokens using NLTK
		tokenisasi = nltk.word_tokenize(kalimat)
		for word in tokenisasi:
			# if the word is not in the stopwords.txt corpus, keep its stemmed form;
			# otherwise it is dropped
			if word not in stopwords:
				data.append(stemmer.stem(word))
		datas[index] = " ".join(data)
		dataku = " ".join(data)
		# append the cleaned comment to komentar_bersih.txt
		with open("komentar_bersih.txt", "a") as file:
			file.write("%s\n" % dataku)
	# file for storing the cleaned comment data
	# file = open("komentar_bersih.json", "w")
	# file.write("%s\n" %datas)
	# file.close()
	return datas
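A minimal call sketch for pre_processing above, assuming nltk (with its punkt data), PySastrawi, and a stopwords.txt file in the working directory; the sample comments are invented:

import nltk
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

komentar = ["Pengirimannya sangat cepat sekali", "Barang yang diterima tidak sesuai pesanan"]
hasil = pre_processing(komentar)
print(hasil)  # {0: '...', 1: '...'} maps each comment index to its cleaned text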
    def test_fungsional(self):
        factory = StemmerFactory()
        stemmer = factory.create_stemmer()

        sentence = 'malaikat-malaikat-Nya'
        expected = 'malaikat'
        output = stemmer.stem(sentence)

        if output != expected:
            raise AssertionError(str.format('output is {} instead of {}', output, expected))
Example #3
    def post(self):
        data = json.loads(self.request.body)
        text = data['text'].encode('utf8')

        # create stemmer
        factory = StemmerFactory()
        stemmer = factory.create_stemmer()

        # stemming process
        output   = stemmer.stem(text)

        self.response.out.write(json.dumps({'output': output}))
Example #4
def input(sentence):
    # parse the whole sentence
    # put the variables on the intent
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()

    # the instruction is stemmed first
    sentence = stemmer.stem(sentence)

    # then it is parsed to find the entities
    reply = get_entity(sentence)

    if reply == "{'name':'None','followup':'None','prompt':'oke'}":

        process()

    if debug: print("langsung dari input:", sentence)
    if debug: print("Reply:", reply)

    # once the entities are complete, process right away

    return reply
 def cleansingData(self):
     NewsData = self.Data.drop(columns=['sumber', 'link', 'created_at'])
     NewsData['content'] = NewsData['content'].str.lower()
     # remove anything other than plain text
     NewsData['content'] = NewsData['content'].str.replace(
         r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|([0-9])", "", regex=True)
     # tokenization step
     NewsData['content'] = NewsData['content'].apply(nltk.word_tokenize)
     # remove stopwords
     data = pd.read_csv(
         "C:\\Users\\eBdesk\\Documents\\Untitled Folder\\indonesian_stopword.txt"
     )
     NewsData['content'] = NewsData['content'].apply(
         lambda x: [y for y in x if y not in data['\'\''].tolist()])
     NewsData['content'] = NewsData['content'].str.join(" ")
     # TF-IDF without stemming
     if (self.token is None):
         vectorizer2 = TfidfVectorizer(stop_words=None, tokenizer=None)
         tfidf_wm = vectorizer2.fit_transform(NewsData['content'])
         word_features2 = vectorizer2.get_feature_names()
         return pd.DataFrame(tfidf_wm.toarray(),
                             columns=vectorizer2.get_feature_names())
     elif (self.token == "true"):
         list1 = []
         factory = StemmerFactory()
         stemmer = factory.create_stemmer()
         stm_tfidf = NewsData
         for index, row in NewsData.iterrows():
             res = stemmer.stem(row['content'])
             list1.append(res)
         stm_tfidf['content'] = list1
         vectorizer2 = TfidfVectorizer(stop_words=None, tokenizer=None)
         tfidf_wm = vectorizer2.fit_transform(stm_tfidf['content'])
         word_features2 = vectorizer2.get_feature_names()
         return pd.DataFrame(tfidf_wm.toarray(),
                             columns=vectorizer2.get_feature_names())
     else:
         return None
Example #6
def Preprocessing(data):
    cleanData = []
    tokenizer = RegexpTokenizer(r'\w+')
    factory_stopwords = StopWordRemoverFactory()
    stopwords = factory_stopwords.get_stop_words()
    factory_stemmer = StemmerFactory()
    stemmer = factory_stemmer.create_stemmer()
    count = 0
    for i in range(len(data)):
        lowerText = data[i].lower()#Case folding
        tokenizedText = tokenizer.tokenize(lowerText)#Punctual removal and tokenization
        swRemovedText = []#Stopwords removal
        for j in range(len(tokenizedText)):
            if tokenizedText[j] not in stopwords:
                swRemovedText.append(tokenizedText[j])
        stemmedText = []
        for k in range(len(swRemovedText)):#Stemming
            stemmedText.append(stemmer.stem(swRemovedText[k]))
        cleanData.append(stemmedText)
        count += 1
        print(count, "data cleaned")
    return cleanData
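A minimal call sketch for the function above, assuming the RegexpTokenizer and Sastrawi factories it uses are already imported; the sample documents are invented:

dokumen = ["Saya sedang membaca buku-buku pelajaran.", "Mereka bermain bola di lapangan!"]
bersih = Preprocessing(dokumen)
print(bersih)  # one list of stemmed, stopword-free tokens per input document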
Example #7
def input(sentence):
    global awake
    # parse the whole sentence
    # put the variables on the intent
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()

    # the instruction is stemmed first
    sentence = stemmer.stem(sentence)

    # then it is parsed to find the entities

    print("THE SENTENCE IS:", sentence)
    if not cek_awake(sentence, wakeupword=wakeupword):
        process(sentence)
        reply = {
            'name': 'informasi',
            'followup': 'awake',
            'method': 'ask',
            'type': 'string',
            'required': True,
            'value': 'None',
            'prompt': 'ss'
        }
    else:
        reply = {
            'name': 'None',
            'followup': 'None',
            'prompt': 'Saya sudah bangun'
        }
        awake = True

    if debug: print("langsung dari input:", sentence)
    if debug: print("Reply:", reply)

    # once the entities are complete, process right away

    return reply
    def __init__(self):
        # init NLP
        self.nlp = Indonesian()

        # init flash text
        self.keyword_processor_slang_word = KeywordProcessor()
        self.keyword_processor_emoticon = KeywordProcessor()
        self.keyword_processor_meaning_text = KeywordProcessor()

        # init stemmer
        self.stemmer = StemmerFactory().create_stemmer()

        self.__init_flash_text_corpus()
        self.__init_custom_stop_word()
Example #9
 def __init__(self, input, file_location):
     data = self.dataFromFile(file_location)
     stopword = StopWordRemoverFactory().create_stop_word_remover()
     stemmer = StemmerFactory().create_stemmer()
     input = stopword.remove(input.lower())
     input = stemmer.stem(input)
     valid = 0
     for i in range(len(data)):
         kal = stopword.remove(data[i][0].lower())
         kal = stemmer.stem(kal)
         if (self.bm(input.lower(), kal.lower()) != -1):
             if (valid == 0):
                 percent = len(input) * 100 / len(kal)
                 # print("Confidence1 : " + str(percent))
                 if (percent > 80):
                     self.answere = data[i][1]
                 valid = 1
         else:
             if valid == 0:
                 if (self.bm2(input.lower(), kal.lower()) >= 80):
                     # print("Confidence2 : " + str(bm2(input.lower(), kal.lower())))
                     self.answere = data[i][1]
                     valid = 1
Example #10
def stemming(doc):
    """
    fungsi ini digunakan untuk mencari kata dasar berdasarkan gejala
    :param doc: inputan hasil filtering
    :return: list stem berisi kata dasar dari hasil filtering
    """

    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    stem = []

    len_array = len(doc)
    for i in range(len_array):
        temp = doc[i]
        if doc[i] == 'menelan':
            result_stem = 'nelan'
        elif doc[i] == 'perasaan':
            result_stem = 'rasa'
        else:
            result_stem = stemmer.stem(temp)
        stem.append(result_stem)

    return stem
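A quick call sketch for the function above; the symptom list is an invented example and PySastrawi is assumed to be installed:

gejala = ['menelan', 'perasaan', 'berkeringat']
print(stemming(gejala))  # the first two hit the hard-coded cases, the rest go through Sastrawi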
def text_preprocessing(document):
    # case folding
    caseFolding = str(document).lower()

    # tokenization; a regex is used here to strip punctuation that would
    # otherwise get in the way
    tokenization = re.findall(r"[\w']+", caseFolding)

    # stopword removal starts here
    file = open('stopword_tala.txt', 'r')  # open the Tala stopword document
    stopWordsList = file.read().split()  # read it and keep the word list in stopWordsList
    hasilStopwords = []  # empty list that will hold the stopword-removal result

    for w in tokenization:  # loop over the tokenization result
        if w not in stopWordsList:  # keep w only if it is not in stopWordsList
            hasilStopwords.append(w)  # the surviving tokens go into hasilStopwords
    # Stopword Removal
    # removeDuplicate = list(
    #     dict.fromkeys(hasilStopwords))  # remove duplicate words from hasilStopwords

    # join the remaining tokens back into a single string
    stopwordListToString = " ".join(hasilStopwords)

    # create the stemmer from the PySastrawi library
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()

    # stemming step
    hasilStemming = stemmer.stem(stopwordListToString)

    return hasilStemming
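A short call sketch for text_preprocessing, assuming re and PySastrawi are importable and a stopword_tala.txt file sits next to the script; the sentence is invented:

contoh = "Para siswa sedang mengerjakan tugas-tugas sekolahnya."
print(text_preprocessing(contoh))  # roughly "siswa kerja tugas tugas sekolah", depending on the stopword list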
Example #12
 def pengecekanKBBI(self, daftar_kata):
     for hasil in daftar_kata:
         kata = hasil[0].lower()
         hasil_kata = self.cek_KBBI(kata)
         if hasil_kata is not None:
             if hasil_kata['phrase_type'] is None:
                 print(hasil_kata['phrase'], "digunakan ", hasil[1],
                       "kali, ")
                 print("Tidak baku, harusnya ", hasil_kata['actual_phrase'])
             else:
                 print(hasil_kata['phrase'], "digunakan ", hasil[1],
                       "kali, ")
                 print("Ok")
         else:
             # print(kata, "bukan kata yang benar ")
             # Check Typo
             factory = StemmerFactory()
             stemmer = factory.create_stemmer()
             # Stem the word
             kata_stem = stemmer.stem(kata)
             print('Hasil stem : ', kata_stem)
             # Check it again
             hasil_kata = self.cek_KBBI(kata_stem)
             if hasil_kata is not None:
                 if hasil_kata['phrase_type'] is None:
                     print(hasil_kata['phrase'], "digunakan ", kata,
                           "kali, ")
                     print("Tidak ini baku, harusnya ",
                           hasil_kata['actual_phrase'])
                 else:
                     print(hasil_kata['phrase'], "adalah kata yang benar, ",
                           kata, "digunakan ", hasil[1], "kali, ")
                     print("Ok")
             else:
                 print(kata, "bukan kata yang benar, kata ini digunakan ",
                       hasil[1], "kali, ")
         print("\n")
Example #13
def cleaning_data(data_test):  
    ##Lower_case
    lower_case = data_test.str.lower()

    ##Number removal
    num_removal = lower_case.str.replace(r'\d+', '', regex=True)

    ##symbol removal
    sym_removal = num_removal.str.replace(r'[^\w\s]', '', regex=True)

    ##whitespace removal
    white_removal = sym_removal.str.strip()

    ##Stemming
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()

    stem = [stemmer.stem(basic_word) for basic_word in white_removal]

    ##tokenization
    token = [word_tokenize(text) for text in stem]

    ##stopword
    liststopword = set(stopwords.words('indonesian'))

    kl = []
    for text_stop in token:
      new = []
      for x in text_stop:
        if x not in liststopword: 
          new.append(x)

      kl.append(str(new))

    cleaned_data = pd.DataFrame(kl)

    return cleaned_data
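A minimal call sketch for cleaning_data, assuming pandas, NLTK (punkt plus its Indonesian stopword list), and PySastrawi are installed; the sample series is invented:

import pandas as pd

sampel = pd.Series(["Harga BBM naik 10% hari ini!", "Pemerintah menaikkan tarif listrik."])
print(cleaning_data(sampel))  # a one-column DataFrame holding the cleaned tokens of each row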
Example #14
def normalisasi2(pos_texts, neg_texts, kamus_hasil):
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    stopwords = get_stopwords()

    pos_texts_normalized = []

    for text in pos_texts:
        pos_text_normalized = []

        for word in text.split():
            # normalization
            word = kamus_hasil[word]
            if word not in stopwords:
                word = stemmer.stem(word)
                if word not in stopwords:
                    pos_text_normalized.append(word)
        
        pos_texts_normalized.append(' '.join(pos_text_normalized))
    

    neg_texts_normalized = []

    for text in neg_texts:
        neg_text_normalized = []

        for word in text.split():
            # normalization
            word = kamus_hasil[word]
            if word not in stopwords:
                word = stemmer.stem(word)
                if word not in stopwords:
                    neg_text_normalized.append(word)
        
        neg_texts_normalized.append(' '.join(neg_text_normalized))

    return pos_texts_normalized, neg_texts_normalized
def predict_news_title():
    title_args = request.args.get('q')
    sw_remover = StopWordRemoverFactory().create_stop_word_remover()
    stemmer = StemmerFactory().create_stemmer()
    vectorizer = pickle.load(open("vectorizer.pickle", "rb"))
    model = pickle.load(open("final_model.pickle", "rb"))
    title_preprocessed = preprocess(title_args, sw_remover, stemmer)
    title = vectorizer.transform([title_preprocessed])
    predicted_label = model.predict(title)[0]
    result = {
        'title': title_args,
        'title_cleaned': title_preprocessed,
        'predicted_label': predicted_label
    }
    return make_response(jsonify(result), 200)
Example #16
def preprocessing(input_path=None): #, stopword=stopword, stemmer=stemmer):
    factori = StemmerFactory()
    stemmer = factori.create_stemmer()

    factory = StopWordRemoverFactory()
    stopword = factory.create_stop_word_remover()

    tokenizer = RegexpTokenizer(r'\w+')

    arr_praproses = list()
    with open(input_path, 'r', encoding='ISO-8859-1') as input:
        reader = input.read().split("\n")               # read the data line by line
        for indeks in range(len(reader)):
            lowcase_word = reader[indeks].lower()       # case folding: lowercase the line
            stopw = stopword.remove(lowcase_word)       # drop this line to skip stopword removal
            stemming = stemmer.stem(stopw)              # drop this line to skip stemming
            tokens = tokenizer.tokenize(stemming)       # tokenize the sentence; the input depends on the last step applied (stemming, stopword removal, or just case folding)
            output = list()
            for kata in tokens:
                output.append(kata)                     # collect each token of the sentence
            sentence = " ".join(output)
            arr_praproses.append(sentence)              # store the processed sentence in arr_praproses

    return arr_praproses
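A minimal call sketch for the function above; 'berita.txt' is a hypothetical one-sentence-per-line text file, and PySastrawi plus NLTK are assumed to be installed:

hasil = preprocessing('berita.txt')  # 'berita.txt' is a made-up example path
for kalimat in hasil[:3]:
    print(kalimat)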
def term_in_documents_frequency(text_sentences, dict, queries, is_indonesian):
    if(is_indonesian):
        factory = StemmerFactory()
        stemmer = factory.create_stemmer()
    else:
        stemmer = PorterStemmer()
    frequency_matrix = {}
    for docnum, sentences in text_sentences.items():
        freq_table = {}
        for sent in sentences:
            words = word_tokenize(sent)
            for word in words:
                word = word.lower()
                word = stemmer.stem(word)
                if word in dict:
                    continue
                if word in queries:
                    if word in freq_table:
                        freq_table[word] += 1
                    else:
                        freq_table[word] = 1
        frequency_matrix[docnum] = freq_table

    return frequency_matrix
Example #18
def stemming_words(words):
    # Import Library
    from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
    factory = StopWordRemoverFactory()
    stopword = factory.create_stop_word_remover()

    # Clean the text, then do the stemming
    hasil1 = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '',
                    words)
    hasil2 = hasil1.encode('ascii', 'ignore').decode('ascii')
    hasil3 = ' '.join(word for word in hasil2.split(' ')
                      if not word.startswith('#'))
    hasil4 = ' '.join(word for word in hasil3.split(' ')
                      if not word.startswith('@'))
    katadasar = stemmer.stem(str(hasil4))
    # remove stopwords from the stemmed text using the Sastrawi remover
    stop = stopword.remove(katadasar)
    hasil5 = (" ".join(stop.split()))

    # Return the stemming result
    return hasil5
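A quick call sketch for stemming_words; the tweet text is invented, and the snippet assumes re is imported and PySastrawi is installed:

contoh = "Pemerintah menaikkan harga #BBM https://contoh.id @warga"
print(stemming_words(contoh))  # URLs, hashtags and mentions are stripped, then the rest is stemmed and stopword-filtered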
Example #19
def cleanTweets(Tweets):
    factory = StopWordRemoverFactory(); stopwords = set(factory.get_stop_words()+['twitter','rt','pic','com','yg','ga','https'])
    factory = StemmerFactory(); stemmer = factory.create_stemmer()
    for i,tweet in enumerate(tqdm(Tweets)):
        txt = tweet['fullTxt'] # if you want to ignore retweets  ==> if not re.match(r'^RT.*', txt):
        txt = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',' ',txt)# clean urls
        txt = txt.lower() # Lowercase
        txt = Tokenizer.tokenize(txt)
        symbols = set(['@']) # Add more if you want
        txt = [strip_non_ascii(t,symbols) for t in txt] #remove all non ASCII characters
        txt = ' '.join([t for t in txt if len(t)>1])
        Tweets[i]['cleanTxt'] = txt # this is not a good Python practice, only for learning.
        txt = stemmer.stem(txt).split()
        Tweets[i]['nlp'] = ' '.join([t for t in txt if t not in stopwords])
    return Tweets
Example #20
def sastrawi():
    """
    Load a stemming model using Sastrawi; this also includes lemmatization.

    Returns
    -------
    result: malaya.stem.Sastrawi class
    """
    try:
        from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
    except BaseException:
        raise ModuleNotFoundError(
            'PySastrawi not installed. Please install it by `pip install PySastrawi` and try again.'
        )
    return Sastrawi(StemmerFactory())
class PreprocessUtil:
    """
        collection of preprocessing utilities
    """
    __remover = StopWordRemoverFactory().create_stop_word_remover()
    __stemmer = StemmerFactory().create_stemmer()
    
    @staticmethod
    def symbol_remover(text: str) -> str:
        """
            remove symbol from text
            :parameter text: str
            :return: str
            
            example:
                >>> PreprocessUtil.symbol_remover("naufal, afif")
                naufal afif
        """
        
        return text.translate(str.maketrans('','',string.punctuation)).lower()
    
    @classmethod
    def stopword_remover(cls, text: str) -> str:
        """
            remove stopword from text
            :parameter text: str
            :return: str
            
            example:
                >>> PreprocessUtil.stopword_remover("naufal dan afif")
                naufal afif
        """
        
        return cls.__remover.remove(text)
    
    @classmethod
    def stemmer(cls, text: str) -> str:
        """
            replace each word with its root
            :parameter text: str
            :return: str
            
            example:
                >>> PreprocessUtil.stemmer("naufal berlari")
                naufal lari
        """

        return cls.__stemmer.stem(text)
class Preprocessor():
    def __init__(self):
        self.stopwords = StopWordRemoverFactory().get_stop_words()
        self.stemmer = StemmerFactory().create_stemmer()

    def stemming(self, words):
        return self.stemmer.stem(words)

    def tokenizing(self, str, delimiter=" "):
        return str.split(delimiter)

    def preprocess(self, words):
        return [
            token for token in self.tokenizing(self.stemming(words))
            if token not in self.stopwords
        ]
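A short usage sketch for the Preprocessor class above; the sentence is invented and PySastrawi is assumed to be installed:

pre = Preprocessor()
print(pre.preprocess("Mereka sedang mempelajari pemrograman"))  # roughly ['ajar', 'program'] once stopwords and affixes are stripped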
Example #23
    def process(text):
        # Normalizing
        _query = text.replace("'", "")

        # ***PRE-PROCESSING***
        # Stopword
        factory = StopWordRemoverFactory()
        stopword = factory.create_stop_word_remover()
        _query = stopword.remove(_query)

        # Stemming
        factory = StemmerFactory()
        stemmer = factory.create_stemmer()
        _query = stemmer.stem(_query)
        
        return _query
class PreProcessTweets:
    factory = StopWordRemoverFactory()
    get_stop_words = factory.get_stop_words()
    factory1 = StemmerFactory()
    stemmer = factory1.create_stemmer()

    def __init__(self):
        self._stopwords = StopWordRemoverFactory.get_stop_words(self)

    #  self._stopwords = set(stopwords.words('indonesian') + list(punctuation) + ['AT_USER', 'URL'])
    def processTweets(self, list_of_tweets):
        processedTweets = []
        for tweet in list_of_tweets:
            processedTweets.append(
                (self._processTweet(tweet["text"]), tweet["label"]))
        return processedTweets

    def _processTweet(self, tweet):
        punctuations = '''!()-![]{};:+'"\,<>./?@#$%^&*_~'''
        tweet = tweet.lower()  # convert text to lower-case
        tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', '',
                       tweet)  # remove URLs
        tweet = re.sub('@[^\s]+', '', tweet)  # remove usernames
        tweet = re.sub(r'#([^\s]+)', r'\1', tweet)  # remove the # in #hashtag
        tweet = "".join(
            (char for char in tweet if char not in string.punctuation))
        tweet = re.sub('\s+', ' ', tweet).strip()
        tweet = re.sub(r"\d", "", tweet)
        # Get the built-in stopword list
        stop_factory = StopWordRemoverFactory().get_stop_words()
        more_stopword = open("stopword.txt", "r").read().split()
        # Merge the stopword lists
        data = stop_factory + more_stopword
        dictionary = ArrayDictionary(data)
        sw_remover = StopWordRemover(dictionary)

        factory1 = StemmerFactory()  # stemming factory
        stemmer = factory1.create_stemmer()  # create the stemmer
        #
        tweet = sw_remover.remove(tweet)
        # tweet = stemmer.stem(tweet)  # stem the tweet
        tweet = word_tokenize(tweet)  # tokenize the tweet
        # return [word for word in tweet if word not in self._stopwords]
        return tweet
Example #25
def blogging():
    # Insert to Blog DB
    _question = request.form["question"]
    _answer = request.form["answer"]
    faq = FAQRepository().insert(Faq(0, _question, _answer))

    # Normalizing
    _question = _question.replace("'", "")
    _answer = _answer.replace("'", "")

    # ***PRE-PROCESSING***
    # Stopword
    factory = StopWordRemoverFactory()
    stopword = factory.create_stop_word_remover()
    _question = stopword.remove(_question)
    _answer = stopword.remove(_answer)

    # Stemming
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    _question = stemmer.stem(_question)
    _answer = stemmer.stem(_answer)

    # Get all unique word from question
    blob = tb(_question)
    uniqWord = list(set(blob.words))

    # Count all unique word in question
    sumOfWord = 0
    for word in uniqWord:
        _n = blob.words.count(word)
        sumOfWord += _n

    # Get Average
    average = sumOfWord / len(blob)

    # Get Over Average Word
    for word in uniqWord:
        n = blob.words.count(word)
        if (n > average):
            # Insert to Keyword DB
            KeywordRepository().insert(Keyword(faq.id_faq, word, n))

    return render_template('faq.html')
Example #26
class Preprocess:
    def __init__(self):
        self.stemmer = StemmerFactory().create_stemmer()
        self.remover = StopWordRemoverFactory().create_stop_word_remover()

    def preprocess(self, text):
        # # 1 stemming
        text_stem = self.stemmer.stem(text)
        #
        # # 2 hapus stop words
        text_clean = self.remover.remove(text_stem)
        #
        # # 3 tokenization
        # # 3.1 lowercase
        lowercase = text_clean.lower()
        preprocessed_text = lowercase.translate(
            str.maketrans('', '', string.punctuation)).split()

        return preprocessed_text
Example #27
class Preprocessing():
    stop_words = stopwords.words('indonesian')
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()

    def initial_clean(self, text):
        """
        Function to clean text of websites, email addresses and any punctuation
        We also lower case the text
        """
        text = re.sub(
            r"((\S+)?(http(s)?)(\S+))|((\S+)?(www)(\S+))|((\S+)?(\@)(\S+)?)",
            " ", text)
        text = re.sub("[^a-zA-Z ]", "", text)
        text = text.lower()  # lower case the text
        text = nltk.word_tokenize(text)
        return text

    def remove_stop_words(self, text):
        """
        Function that removes all stopwords from text
        """
        return [word for word in text if word not in self.stop_words]

    def stem_words(self, text):
        """
        Function to stem words, so plural and singular are treated the same
        """
        try:
            text = [self.stemmer.stem(word) for word in text]
            text = [word for word in text if len(word) > 1]
        except IndexError:  # the word "oed" broke this, so needed try except
            pass
        return text

    def preprocess(self, data):
        """
        This function applies all the functions above into one
        """
        # if data == "":
        #     data = self.text
        return self.stem_words(self.remove_stop_words(
            self.initial_clean(data)))
class Preprocessing:
    def __init__(self):
        print("Initializing preprocessing...")
        self.factory = StemmerFactory()
        self.stemmer = self.factory.create_stemmer()
        pass

    def processtext(self, text):
        text = text.lower()
        text = re.sub(r'\&\w*;', '', text)
        text = re.sub('@[^\s]+','',text)
        text = re.sub(r'\$\w*', '', text)
        text = text.lower()
        text = re.sub(r'https?:\/\/.*\/\w*', '', text)
        text = re.sub(r'#\w*', '', text)
        text = re.sub(r'[' + punctuation.replace('@', '') + ']+', ' ', text)
        text = re.sub(r'\b\w{1,2}\b', '', text)
        text = re.sub(r'\s\s+', ' ', text)
        text = text.lstrip(' ')
        text = ''.join(c for c in text if c <= '\uFFFF')
        return text

    def stem(self, text):
        text = self.stemmer.stem(text)
        return text

    def remove_stopwords(self, param):
        f = "id_stopwords.txt"
        with open(f, 'r') as my_stopwords:
            stopwords_list = my_stopwords.read().split()
            words = param.split()
            kept = [w for w in words if w not in stopwords_list]
            return " ".join(kept)
Example #29
class Stemmer:
    factory = None
    stemmer = None

    def __init__(self):
        self.factory = StemmerFactory()
        self.stemmer = self.factory.create_stemmer()

    def stem(self, sentence, map_emoticon, map_senti):
        new_sentence = ""
        for word in sentence.split():
            # If it is emoticon
            if word in map_emoticon:
                new_sentence = new_sentence + word + " "
            # If it is a sentiment word
            elif word in map_senti:
                new_sentence = new_sentence + word + " "
            else:
                # Only get alphabet, remove emoji
                if (word.isalpha()):
                    new_sentence = new_sentence + self.stemmer.stem(word) + " "
        return new_sentence
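A brief usage sketch for the Stemmer wrapper above; the emoticon and sentiment maps are made-up placeholders and PySastrawi is assumed to be installed:

map_emoticon = {':)': 1}   # hypothetical emoticon lexicon
map_senti = {'bagus': 1}   # hypothetical sentiment lexicon
s = Stemmer()
print(s.stem("filmnya bagus banget :)", map_emoticon, map_senti))  # emoticons and sentiment words pass through unstemmed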
Example #30
class SimpleIndonesianPreprocessor(BaseEstimator, TransformerMixin):
    """
    Simple Indonesian text preprocessor
    """
    def __init__(self, stem=True, stopwords=True, verbose=True):
        self.stemmer = StemmerFactory().create_stemmer() if stem else None
        self.stopwords = []
        if stopwords:
            with open(STOPWORDS_FILE, 'r') as f:
                self.stopwords = f.read().splitlines()
        self.verbose = verbose

    def fit(self, X, y=None):
        return self

    def inverse_transform(self, X):
        return [" ".join(doc) for doc in X]

    def transform(self, X):
        results = []
        if self.verbose:
            print('Preprocessing..')
            bar = progressbar.ProgressBar()
            for doc in bar(X):
                results.append(list(self.tokenize(doc)))
            return results
        else:
            return [list(self.tokenize(doc)) for doc in X]

    def tokenize(self, document):
        if self.stemmer:
            # stem and split by whitespaces
            for token in self.stemmer.stem(document).split():
                if token not in self.stopwords:
                    yield token
        else:
            for token in document.lower().split():
                if token not in self.stopwords:
                    yield token
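A sketch of plugging the transformer above into a scikit-learn pipeline; the vectorizer settings and sample texts are illustrative assumptions, and stopwords=False is passed so the sketch does not need STOPWORDS_FILE:

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

pipe = Pipeline([
    ('preprocess', SimpleIndonesianPreprocessor(stopwords=False, verbose=False)),
    ('tfidf', TfidfVectorizer(preprocessor=lambda doc: doc, tokenizer=lambda doc: doc, lowercase=False)),
])
X = pipe.fit_transform(["Saya membeli buku baru", "Dia menjual sepeda lama"])
print(X.shape)  # (2, number_of_unique_stems)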
Example #31
class TrainingData:

    nama = []
    factory = StemmerFactory()
    factory_remove_word = StopWordRemoverFactory()
    stemmer = factory.create_stemmer()
    stopword = factory_remove_word.create_stop_word_remover()

    def read_file_data(url_args):
        # file = textract.process(Uri,method="tesseract")
        file = open(url_args, 'rb')

        return file


    def clean_words(words_args,stemmer=stemmer, stopword_args = stopword):

        clean_words = re.sub("[(){}<>\",\-*0-9;']", " ", words_args)
        stemmed_word = stemmer.stem(clean_words)
        output = stopword_args.remove(stemmed_word)

        return output
class FeatureAnnotator:
    def __init__(self):
        self.nlp = stanza.Pipeline("id", use_gpu=False)
        self.stemmer = StemmerFactory().create_stemmer()
        self.ner = get_entities
        # Set POS Tagger
        self.pos_tagger = nltk.tag.CRFTagger()
        self.pos_tagger.set_model_file(
            'pretrained/pos_tagger/all_indo_man_tag_corpus_model.crf.tagger')

    def annotate(self, sentence):
        annotation = defaultdict(list)
        sentence = sentence.translate(str.maketrans('', '',
                                                    string.punctuation))
        doc = self.nlp(sentence)

        annotation['ner_tags'] = self.ner(sentence)

        word_dict = defaultdict(int)

        for sent in doc.sentences:
            for idx, word in enumerate(sent.words):
                annotation['tokens'].append(word.text)
                stemmed_word = self.stemmer.stem(word.text)
                if (annotation['ner_tags'][idx] in ['PER', 'ORG']):
                    stemmed_word = word.text.lower()
                annotation['lemmas'].append(
                    stemmed_word + '_{}'.format(word_dict[stemmed_word]))
                annotation['dependency'].append(
                    dict(relation=word.deprel, head=word.head))

        annotation['pos_tags'] = [
            tag[1] for tag in self.pos_tagger.tag(annotation['tokens'])
        ]

        return annotation
class Test_StemmerFactoryTest(unittest.TestCase):
    def setUp(self):
        self.factory = StemmerFactory()
        return super(Test_StemmerFactoryTest, self).setUp()

    def test_createStemmerReturnStemmer(self):
        stemmer = self.factory.create_stemmer()
        self.assertIsNotNone(stemmer)
        #self.assertIsInstance(stemmer, Stemmer)

    def test_fungsional(self):
        factory = StemmerFactory()
        stemmer = factory.create_stemmer()

        sentence = 'malaikat-malaikat-Nya'
        expected = 'malaikat'
        output = stemmer.stem(sentence)

        if output != expected:
            raise AssertionError(str.format('output is {} instead of {}', output, expected))

    def test_getWordsFromFile(self):
        factory = StemmerFactory()
        factory.get_words_from_file()
 def setUp(self):
     self.factory = StemmerFactory()
     return super(Test_StemmerFactoryTest, self).setUp()
def rmStem(pars):
	factory = StemmerFactory()
	stripped= strip_tags(pars)
	stemmer = factory.create_stemmer()
	clean   = stemmer.stem(str(stripped)) #Stemming
	return clean
Example #36
 def setUp(self):
     stemmerFactory = StemmerFactory()
     self.stemmer = stemmerFactory.create_stemmer()
     return super(Test_StemmerTest, self).setUp()
Example #37
File: main.py Project: irs37/nlp
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from collections import Counter

akun = ['548904824', '255409050', '480224156', '63433517', '82552414', '61379637', '79994423', '47251716',
        '260043508']  # ['@IndosatCare','@Telkomsel','@myXLCare','@triindonesia','@myXL','@IM3Ooredoo','@AXISgsm','@ask_AXIS','@simPATI']
kata_kunci = ['lambat', 'lelet', 'lola', 'lemot', 'koneksi', 'gsm', '3g', '4g', 'hsdpa', 'edge', 'jaring', 'ganggu']

cred = credentials.Certificate('kunci2.json')
firebase_admin.initialize_app(cred)

db = firestore.client()
tweet_ref = db.collection('Tweet')
kata_ref = db.collection("kata_kunci")
last_ref = db.collection("lasttweet")

factory = StemmerFactory()
stemmer = factory.create_stemmer()


def tweetstruct(user, text, t):
    data = {
        'username': user,
        'text': text,
        'time': t,
    }
    return data


def storetweet(id, input):
    try:
        ref = tweet_ref.document(id)
 def test_getWordsFromFile(self):
     factory = StemmerFactory()
     factory.get_words_from_file()