def pre_processing(doc):
    kata = ""
    datas = {}
    # Sastrawi stemmer
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    # stopword removal and tokenization
    for index, kalimat in enumerate(doc):
        data = []
        dataku = []
        # split the sentence into tokens using NLTK
        tokenisasi = nltk.word_tokenize(kalimat)
        # stopWords = nltk.corpus.stopwords.words('english') + ['yang','dengan']
        # load the list of words to remove from stopwords.txt
        stopwords = open('stopwords.txt', 'r').read().split()
        for idx, word in enumerate(tokenisasi):
            # keep the word only if it is not in stopwords.txt
            if word not in stopwords:
                kata = " " + word
                data.append(stemmer.stem(kata))
                datas[index] = " ".join(data)
                dataku = " ".join(data)
            # words found in stopwords.txt are skipped (blanked)
            kata = ""
        # append the cleaned comment to komentar_bersih.txt
        file = open("komentar_bersih.txt", "a")
        file.write("%s\n" % dataku)
        file.close()
        # write the cleaned comments to a JSON file as well
        # file = open("komentar_bersih.json", "w")
        # file.write("%s\n" % datas)
        # file.close()
    return datas
def post(self):
    data = json.loads(self.request.body)
    # json.loads already yields str, and Sastrawi's stem() expects str,
    # so no .encode('utf8') is needed on Python 3
    text = data['text']
    # create stemmer
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    # stemming process
    output = stemmer.stem(text)
    self.response.out.write(json.dumps({'output': output}))
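For reference, a minimal self-contained sketch of the Sastrawi calls the snippets in this section rely on; the example sentence and its stemmed form come from PySastrawi's README, so treat the exact output as indicative:

# pip install PySastrawi
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

factory = StemmerFactory()
stemmer = factory.create_stemmer()

# stem() maps each affixed Indonesian word in the string to its root form
sentence = 'Perekonomian Indonesia sedang dalam pertumbuhan yang membanggakan'
print(stemmer.stem(sentence))
# -> 'ekonomi indonesia sedang dalam tumbuh yang bangga'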
def input(sentence):
    # parse the whole sentence and fill in the intent's variables
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    # stem the instruction first
    sentence = stemmer.stem(sentence)
    # then parse it to look for entities
    reply = get_entity(sentence)
    if reply == "{'name':'None','followup':'None','prompt':'oke'}":
        process()
    if debug:
        print("langsung dari input:", sentence)
    if debug:
        print("Reply:", reply)
    # once the entities are complete, process immediately
    return reply
def cleansingData(self):
    NewsData = self.Data.drop(columns=['sumber', 'link', 'created_at'])
    NewsData['content'] = NewsData['content'].str.lower()
    # remove everything except plain text (mentions, URLs, digits, symbols)
    NewsData['content'] = NewsData['content'].str.replace(
        r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|([0-9])", "",
        regex=True)
    # tokenize
    NewsData['content'] = NewsData['content'].apply(nltk.word_tokenize)
    # remove stopwords; the file's first line ('') acts as the column header
    data = pd.read_csv(
        "C:\\Users\\eBdesk\\Documents\\Untitled Folder\\indonesian_stopword.txt"
    )
    NewsData['content'] = NewsData['content'].apply(
        lambda x: [y for y in x if y not in data['\'\''].tolist()])
    NewsData['content'] = NewsData['content'].str.join(" ")
    # TF-IDF without stemming
    if self.token is None:
        vectorizer2 = TfidfVectorizer(stop_words=None, tokenizer=None)
        tfidf_wm = vectorizer2.fit_transform(NewsData['content'])
        return pd.DataFrame(tfidf_wm.toarray(),
                            columns=vectorizer2.get_feature_names())
    # TF-IDF with Sastrawi stemming
    elif self.token == "true":
        list1 = []
        factory = StemmerFactory()
        stemmer = factory.create_stemmer()
        stm_tfidf = NewsData
        for index, row in NewsData.iterrows():
            res = stemmer.stem(row['content'])
            list1.append(res)
        stm_tfidf['content'] = list1
        vectorizer2 = TfidfVectorizer(stop_words=None, tokenizer=None)
        tfidf_wm = vectorizer2.fit_transform(stm_tfidf['content'])
        return pd.DataFrame(tfidf_wm.toarray(),
                            columns=vectorizer2.get_feature_names())
    else:
        return None
def Preprocessing(data):
    cleanData = []
    tokenizer = RegexpTokenizer(r'\w+')
    factory_stopwords = StopWordRemoverFactory()
    stopwords = factory_stopwords.get_stop_words()
    factory_stemmer = StemmerFactory()
    stemmer = factory_stemmer.create_stemmer()
    count = 0
    for i in range(len(data)):
        lowerText = data[i].lower()  # case folding
        tokenizedText = tokenizer.tokenize(lowerText)  # punctuation removal and tokenization
        swRemovedText = []  # stopword removal
        for j in range(len(tokenizedText)):
            if tokenizedText[j] not in stopwords:
                swRemovedText.append(tokenizedText[j])
        stemmedText = []
        for k in range(len(swRemovedText)):  # stemming
            stemmedText.append(stemmer.stem(swRemovedText[k]))
        cleanData.append(stemmedText)
        count += 1
        print(count, "data cleaned")
    return cleanData
def input(sentence):
    global awake
    # parse the whole sentence and fill in the intent's variables
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    # stem the instruction first
    sentence = stemmer.stem(sentence)
    # then parse it to look for entities
    print("THE SENTENCE IS:", sentence)
    if not cek_awake(sentence, wakeupword=wakeupword):
        process(sentence)
        reply = {
            'name': 'informasi',
            'followup': 'awake',
            'method': 'ask',
            'type': 'string',
            'required': True,
            'value': 'None',
            'prompt': 'ss'
        }
    else:
        reply = {
            'name': 'None',
            'followup': 'None',
            'prompt': 'Saya sudah bangun'
        }
        awake = True
    if debug:
        print("langsung dari input:", sentence)
    if debug:
        print("Reply:", reply)
    # once the entities are complete, process immediately
    return reply
def __init__(self):
    # init NLP
    self.nlp = Indonesian()
    # init flash text
    self.keyword_processor_slang_word = KeywordProcessor()
    self.keyword_processor_emoticon = KeywordProcessor()
    self.keyword_processor_meaning_text = KeywordProcessor()
    # init stemmer
    self.stemmer = StemmerFactory().create_stemmer()
    self.__init_flash_text_corpus()
    self.__init_custom_stop_word()
def __init__(self, input, file_location):
    data = self.dataFromFile(file_location)
    stopword = StopWordRemoverFactory().create_stop_word_remover()
    stemmer = StemmerFactory().create_stemmer()
    input = stopword.remove(input.lower())
    input = stemmer.stem(input)
    valid = 0
    for i in range(len(data)):
        kal = stopword.remove(data[i][0].lower())
        kal = stemmer.stem(kal)
        if self.bm(input.lower(), kal.lower()) != -1:
            if valid == 0:
                percent = len(input) * 100 / len(kal)
                # print("Confidence1 : " + str(percent))
                if percent > 80:
                    self.answere = data[i][1]
                    valid = 1
        else:
            if valid == 0:
                if self.bm2(input.lower(), kal.lower()) >= 80:
                    # print("Confidence2 : " + str(bm2(input.lower(), kal.lower())))
                    self.answere = data[i][1]
                    valid = 1
def stemming(doc):
    """
    Find the root form of each symptom word.
    :param doc: the output of the filtering step
    :return: list of root words produced from the filtered input
    """
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    stem = []
    for word in doc:
        # manual overrides for words Sastrawi stems differently
        if word == 'menelan':
            result_stem = 'nelan'
        elif word == 'perasaan':
            result_stem = 'rasa'
        else:
            result_stem = stemmer.stem(word)
        stem.append(result_stem)
    return stem
def text_preprocessing(document):
    # case folding
    caseFolding = str(document).lower()
    # tokenization: the regex also strips punctuation that would get in the way
    tokenization = re.findall(r"[\w']+", caseFolding)
    # stopword removal: load the Tala stopword list; split() is needed so
    # membership tests match whole words rather than substrings of the raw file
    file = open('stopword_tala.txt', 'r')
    stopWordsList = file.read().split()
    hasilStopwords = []
    for w in tokenization:
        if w not in stopWordsList:
            hasilStopwords.append(w)
    # removeDuplicate = list(dict.fromkeys(hasilStopwords))  # drop duplicate words
    # join the remaining tokens back into a single string
    stopwordListToString = " ".join(hasilStopwords)
    # build a stemmer from the pySastrawi library and stem the result
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    hasilStemming = stemmer.stem(stopwordListToString)
    return hasilStemming
def pengecekanKBBI(self, daftar_kata):
    for hasil in daftar_kata:
        kata = hasil[0].lower()
        hasil_kata = self.cek_KBBI(kata)
        if hasil_kata is not None:
            if hasil_kata['phrase_type'] is None:
                print(hasil_kata['phrase'], "digunakan ", hasil[1], "kali, ")
                print("Tidak baku, harusnya ", hasil_kata['actual_phrase'])
            else:
                print(hasil_kata['phrase'], "digunakan ", hasil[1], "kali, ")
                print("Ok")
        else:
            # the word was not found: check for a typo by stemming it
            factory = StemmerFactory()
            stemmer = factory.create_stemmer()
            kata_stem = stemmer.stem(kata)
            print('Hasil stem : ', kata_stem)
            # look the stemmed form up again
            hasil_kata = self.cek_KBBI(kata_stem)
            if hasil_kata is not None:
                if hasil_kata['phrase_type'] is None:
                    print(hasil_kata['phrase'], "digunakan ", kata, "kali, ")
                    print("Tidak ini baku, harusnya ", hasil_kata['actual_phrase'])
                else:
                    print(hasil_kata['phrase'], "adalah kata yang benar, ",
                          kata, "digunakan ", hasil[1], "kali, ")
                    print("Ok")
            else:
                print(kata, "bukan kata yang benar, kata ini digunakan ",
                      hasil[1], "kali, ")
    print("\n")
def cleaning_data(data_test):
    # lower case
    lower_case = data_test.str.lower()
    # number removal
    num_removal = lower_case.str.replace(r'\d+', '', regex=True)
    # symbol removal
    sym_removal = num_removal.str.replace(r'[^\w\s]', '', regex=True)
    # whitespace removal
    white_removal = sym_removal.str.strip()
    # stemming
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    stem = [stemmer.stem(basic_word) for basic_word in white_removal]
    # tokenization
    token = [word_tokenize(text) for text in stem]
    # stopword removal
    liststopword = set(stopwords.words('indonesian'))
    kl = []
    for text_stop in token:
        new = [word for word in text_stop if word not in liststopword]
        # join the kept tokens back into a string;
        # str(new) would leave list brackets and quotes in the text
        kl.append(" ".join(new))
    cleaned_data = pd.DataFrame(kl)
    return cleaned_data
def normalisasi2(pos_texts, neg_texts, kamus_hasil):
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    stopwords = get_stopwords()

    pos_texts_normalized = []
    for text in pos_texts:
        pos_text_normalized = []
        for word in text.split():
            # normalization via the slang dictionary
            word = kamus_hasil[word]
            if word not in stopwords:
                word = stemmer.stem(word)
                if word not in stopwords:
                    pos_text_normalized.append(word)
        pos_texts_normalized.append(' '.join(pos_text_normalized))

    neg_texts_normalized = []
    for text in neg_texts:
        neg_text_normalized = []
        for word in text.split():
            # normalization via the slang dictionary
            word = kamus_hasil[word]
            if word not in stopwords:
                word = stemmer.stem(word)
                if word not in stopwords:
                    neg_text_normalized.append(word)
        neg_texts_normalized.append(' '.join(neg_text_normalized))

    return pos_texts_normalized, neg_texts_normalized
def predict_news_title():
    title_args = request.args.get('q')
    sw_remover = StopWordRemoverFactory().create_stop_word_remover()
    stemmer = StemmerFactory().create_stemmer()
    vectorizer = pickle.load(open("vectorizer.pickle", "rb"))
    model = pickle.load(open("final_model.pickle", "rb"))
    title_preprocessed = preprocess(title_args, sw_remover, stemmer)
    title = vectorizer.transform([title_preprocessed])
    predicted_label = model.predict(title)[0]
    result = {
        'title': title_args,
        'title_cleaned': title_preprocessed,
        'predicted_label': predicted_label
    }
    return make_response(jsonify(result), 200)
def preprocessing(input_path=None):
    factori = StemmerFactory()
    stemmer = factori.create_stemmer()
    factory = StopWordRemoverFactory()
    stopword = factory.create_stop_word_remover()
    tokenizer = RegexpTokenizer(r'\w+')
    arr_praproses = list()
    with open(input_path, 'r', encoding='ISO-8859-1') as infile:
        reader = infile.read().split("\n")
        # process the data line by line
        for indeks in range(len(reader)):
            lowcase_word = reader[indeks].lower()  # case folding per line
            stopw = stopword.remove(lowcase_word)  # comment out to skip stopword removal
            stemming = stemmer.stem(stopw)  # comment out to skip stemming
            # tokenize the sentence produced by the last step applied
            # (stemming, stopword removal, or just case folding)
            tokens = tokenizer.tokenize(stemming)
            sentence = " ".join(tokens)
            arr_praproses.append(sentence)  # collect the preprocessed sentence
    return arr_praproses
def term_in_documents_frequency(text_sentences, dict, queries, is_indonesian):
    if is_indonesian:
        factory = StemmerFactory()
        stemmer = factory.create_stemmer()
    else:
        stemmer = PorterStemmer()
    frequency_matrix = {}
    for docnum, sentences in text_sentences.items():
        freq_table = {}
        for sent in sentences:
            words = word_tokenize(sent)
            for word in words:
                word = word.lower()
                word = stemmer.stem(word)
                if word in dict:
                    continue
                if word in queries:
                    if word in freq_table:
                        freq_table[word] += 1
                    else:
                        freq_table[word] = 1
        frequency_matrix[docnum] = freq_table
    return frequency_matrix
def stemming_words(words):
    # import the Sastrawi stemmer and stopword remover
    from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
    factory = StopWordRemoverFactory()
    stopword = factory.create_stop_word_remover()
    # strip URLs, non-ASCII characters, hashtags and mentions, then stem
    hasil1 = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', words)
    hasil2 = hasil1.encode('ascii', 'ignore').decode('ascii')
    hasil3 = ' '.join(word for word in hasil2.split(' ') if not word.startswith('#'))
    hasil4 = ' '.join(word for word in hasil3.split(' ') if not word.startswith('@'))
    katadasar = stemmer.stem(str(hasil4))
    # remove stopwords from the stemmed text using Sastrawi
    stop = stopword.remove(katadasar)
    hasil5 = " ".join(stop.split())
    # return the stemmed result
    return hasil5
def cleanTweets(Tweets):
    factory = StopWordRemoverFactory()
    stopwords = set(factory.get_stop_words() +
                    ['twitter', 'rt', 'pic', 'com', 'yg', 'ga', 'https'])
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    for i, tweet in enumerate(tqdm(Tweets)):
        txt = tweet['fullTxt']
        # if you want to ignore retweets ==> if not re.match(r'^RT.*', txt):
        txt = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
                     ' ', txt)  # clean URLs
        txt = txt.lower()  # lowercase
        txt = Tokenizer.tokenize(txt)
        symbols = set(['@'])  # add more if you want
        txt = [strip_non_ascii(t, symbols) for t in txt]  # remove all non-ASCII characters
        txt = ' '.join([t for t in txt if len(t) > 1])
        Tweets[i]['cleanTxt'] = txt  # this is not a good Python practice, only for learning.
        txt = stemmer.stem(txt).split()
        Tweets[i]['nlp'] = ' '.join([t for t in txt if t not in stopwords])
    return Tweets
def sastrawi():
    """
    Load the stemming model using Sastrawi; this also includes lemmatization.

    Returns
    -------
    result: malaya.stem.Sastrawi class
    """
    try:
        from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
    except BaseException:
        raise ModuleNotFoundError(
            'PySastrawi not installed. Please install it by `pip install PySastrawi` and try again.'
        )
    return Sastrawi(StemmerFactory())
class PreprocessUtil:
    """collection of preprocessing utilities"""
    __remover = StopWordRemoverFactory().create_stop_word_remover()
    __stemmer = StemmerFactory().create_stemmer()

    @staticmethod
    def symbol_remover(text: str) -> str:
        """
        remove symbols from text
        :parameter text: str
        :return: str

        example:
        >>> PreprocessUtil.symbol_remover("naufal, afif")
        naufal afif
        """
        return text.translate(str.maketrans('', '', string.punctuation)).lower()

    @classmethod
    def stopword_remover(cls, text: str) -> str:
        """
        remove stopwords from text
        :parameter text: str
        :return: str

        example:
        >>> PreprocessUtil.stopword_remover("naufal dan afif")
        naufal afif
        """
        return cls.__remover.remove(text)

    @classmethod
    def stemmer(cls, text: str) -> str:
        """
        replace each word with its root
        :parameter text: str
        :return: str

        example:
        >>> PreprocessUtil.stemmer("naufal berlari")
        naufal lari
        """
        return cls.__stemmer.stem(text)
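A short usage sketch chaining the three helpers above; the input sentence is made up, and the exact stems and stopwords depend on Sastrawi's defaults:

text = "Naufal berlari, dan Afif mengejar!"
text = PreprocessUtil.symbol_remover(text)    # lowercases and strips punctuation
text = PreprocessUtil.stopword_remover(text)  # "dan" is in Sastrawi's default stopword list
text = PreprocessUtil.stemmer(text)           # e.g. "berlari" -> "lari"
print(text)  # roughly: "naufal lari afif kejar"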
class Preprocessor():
    def __init__(self):
        self.stopwords = StopWordRemoverFactory().get_stop_words()
        self.stemmer = StemmerFactory().create_stemmer()

    def stemming(self, words):
        return self.stemmer.stem(words)

    def tokenizing(self, text, delimiter=" "):
        return text.split(delimiter)

    def preprocess(self, words):
        return [
            token for token in self.tokenizing(self.stemming(words))
            if token not in self.stopwords
        ]
def process(text):
    # normalizing
    _query = text.replace("'", "")
    # *** PRE-PROCESSING ***
    # stopword removal
    factory = StopWordRemoverFactory()
    stopword = factory.create_stop_word_remover()
    _query = stopword.remove(_query)
    # stemming
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    _query = stemmer.stem(_query)
    return _query
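A hypothetical call to process(); the input string is an assumption and the exact output depends on Sastrawi's stopword list and stemmer:

query = process("informasi yang dibutuhkan pengguna")
# "yang" is removed as a stopword, "dibutuhkan" stems to "butuh" and
# "pengguna" to "guna", so the result is roughly "informasi butuh guna"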
class PreProcessTweets:
    factory = StopWordRemoverFactory()
    get_stop_words = factory.get_stop_words()
    factory1 = StemmerFactory()
    stemmer = factory1.create_stemmer()

    def __init__(self):
        self._stopwords = StopWordRemoverFactory().get_stop_words()
        # self._stopwords = set(stopwords.words('indonesian') + list(punctuation) + ['AT_USER', 'URL'])

    def processTweets(self, list_of_tweets):
        processedTweets = []
        for tweet in list_of_tweets:
            processedTweets.append(
                (self._processTweet(tweet["text"]), tweet["label"]))
        return processedTweets

    def _processTweet(self, tweet):
        tweet = tweet.lower()  # convert text to lower-case
        tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', '', tweet)  # remove URLs
        tweet = re.sub(r'@[^\s]+', '', tweet)  # remove usernames
        tweet = re.sub(r'#([^\s]+)', r'\1', tweet)  # remove the # in #hashtag
        tweet = "".join(char for char in tweet if char not in string.punctuation)
        tweet = re.sub(r'\s+', ' ', tweet).strip()
        tweet = re.sub(r"\d", "", tweet)  # remove digits
        # load the default Sastrawi stopwords and merge in a custom list
        stop_factory = StopWordRemoverFactory().get_stop_words()
        more_stopword = open("stopword.txt", "r").read().split()
        data = stop_factory + more_stopword
        dictionary = ArrayDictionary(data)
        stopword_remover = StopWordRemover(dictionary)
        factory1 = StemmerFactory()  # stemming factory
        stemmer = factory1.create_stemmer()
        # tweet = stopword_remover.remove(tweet)
        # tweet = stemmer.stem(tweet)  # stem the tweet
        tweet = word_tokenize(tweet)  # tokenize
        # return [word for word in tweet if word not in self._stopwords]
        return tweet
def blogging():
    # insert into the Blog DB
    _question = request.form["question"]
    _answer = request.form["answer"]
    faq = FAQRepository().insert(Faq(0, _question, _answer))
    # normalizing
    _question = _question.replace("'", "")
    _answer = _answer.replace("'", "")
    # *** PRE-PROCESSING ***
    # stopword removal
    factory = StopWordRemoverFactory()
    stopword = factory.create_stop_word_remover()
    _question = stopword.remove(_question)
    _answer = stopword.remove(_answer)
    # stemming
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    _question = stemmer.stem(_question)
    _answer = stemmer.stem(_answer)
    # collect the unique words of the question
    blob = tb(_question)
    uniqWord = list(set(blob.words))
    # count every unique word in the question
    sumOfWord = 0
    for word in uniqWord:
        _n = blob.words.count(word)
        sumOfWord += _n
    # average frequency
    average = sumOfWord / len(blob)
    # store words whose frequency is above average as keywords
    for word in uniqWord:
        n = blob.words.count(word)
        if n > average:
            # insert into the Keyword DB
            KeywordRepository().insert(Keyword(faq.id_faq, word, n))
    return render_template('faq.html')
class Preprocess:
    def __init__(self):
        self.stemmer = StemmerFactory().create_stemmer()
        self.remover = StopWordRemoverFactory().create_stop_word_remover()

    def preprocess(self, text):
        # 1. stemming
        text_stem = self.stemmer.stem(text)
        # 2. remove stop words
        text_clean = self.remover.remove(text_stem)
        # 3. tokenization
        # 3.1 lowercase
        lowercase = text_clean.lower()
        # str.translate(None, ...) is Python 2 only; use a translation table
        preprocessed_text = lowercase.translate(
            str.maketrans('', '', string.punctuation)).split()
        return preprocessed_text
class Preprocessing():
    stop_words = stopwords.words('indonesian')
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()

    def initial_clean(self, text):
        """
        Clean text of websites, email addresses and any punctuation.
        We also lower-case the text.
        """
        text = re.sub(
            r"((\S+)?(http(s)?)(\S+))|((\S+)?(www)(\S+))|((\S+)?(\@)(\S+)?)",
            " ", text)
        text = re.sub("[^a-zA-Z ]", "", text)
        text = text.lower()  # lower case the text
        text = nltk.word_tokenize(text)
        return text

    def remove_stop_words(self, text):
        """Remove all stopwords from text."""
        return [word for word in text if word not in self.stop_words]

    def stem_words(self, text):
        """Stem words, so plural and singular are treated the same."""
        try:
            text = [self.stemmer.stem(word) for word in text]
            text = [word for word in text if len(word) > 1]
        except IndexError:  # the word "oed" broke this, so needed try/except
            pass
        return text

    def preprocess(self, data):
        """Apply all the functions above in sequence."""
        # if data == "":
        #     data = self.text
        return self.stem_words(self.remove_stop_words(self.initial_clean(data)))
class Preprocessing:
    def __init__(self):
        print("Initializing preprocessing...")
        self.factory = StemmerFactory()
        self.stemmer = self.factory.create_stemmer()

    def processtext(self, text):
        text = text.lower()
        text = re.sub(r'\&\w*;', '', text)              # HTML entities
        text = re.sub(r'@[^\s]+', '', text)             # mentions
        text = re.sub(r'\$\w*', '', text)               # tickers
        text = re.sub(r'https?:\/\/.*\/\w*', '', text)  # URLs
        text = re.sub(r'#\w*', '', text)                # hashtags
        text = re.sub(r'[' + punctuation.replace('@', '') + ']+', ' ', text)
        text = re.sub(r'\b\w{1,2}\b', '', text)         # words of 1-2 characters
        text = re.sub(r'\s\s+', ' ', text)              # repeated whitespace
        text = text.lstrip(' ')
        text = ''.join(c for c in text if c <= '\uFFFF')  # strip astral-plane chars (emoji)
        return text

    def stem(self, text):
        return self.stemmer.stem(text)

    def remove_stopwords(self, param):
        f = "id_stopwords.txt"
        with open(f, 'r') as my_stopwords:
            # split into a word list; testing against the raw file string
            # would do substring matching instead of whole-word matching
            stopwords_list = my_stopwords.read().split()
        tokens = param.split()
        kept = ""
        for token in tokens:
            if token not in stopwords_list:
                kept += token + " "
        return kept.strip()
class Stemmer:
    factory = None
    stemmer = None

    def __init__(self):
        self.factory = StemmerFactory()
        self.stemmer = self.factory.create_stemmer()

    def stem(self, sentence, map_emoticon, map_senti):
        new_sentence = ""
        for word in sentence.split():
            # if it is an emoticon, keep it verbatim
            if word in map_emoticon:
                new_sentence = new_sentence + word + " "
            # if it is a sentiment word, keep it verbatim
            elif word in map_senti:
                new_sentence = new_sentence + word + " "
            else:
                # only stem alphabetic tokens; emoji are dropped
                if word.isalpha():
                    new_sentence = new_sentence + self.stemmer.stem(word) + " "
        return new_sentence
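A usage sketch for the Stemmer class above; the emoticon and sentiment maps are made-up placeholders:

s = Stemmer()
map_emoticon = {':)': 'positive', ':(': 'negative'}
map_senti = {'bagus': 1.0, 'jelek': -1.0}
print(s.stem('pelayanan :) memuaskan', map_emoticon, map_senti))
# the emoticon is kept verbatim, the other tokens are stemmed;
# roughly: 'layan :) puas '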
class SimpleIndonesianPreprocessor(BaseEstimator, TransformerMixin):
    """Simple Indonesian text preprocessor"""

    def __init__(self, stem=True, stopwords=True, verbose=True):
        self.stemmer = StemmerFactory().create_stemmer() if stem else None
        self.stopwords = []
        if stopwords:
            with open(STOPWORDS_FILE, 'r') as f:
                self.stopwords = f.read().splitlines()
        self.verbose = verbose

    def fit(self, X, y=None):
        return self

    def inverse_transform(self, X):
        return [" ".join(doc) for doc in X]

    def transform(self, X):
        results = []
        if self.verbose:
            print('Preprocessing..')
            bar = progressbar.ProgressBar()
            for doc in bar(X):
                results.append(list(self.tokenize(doc)))
            return results
        else:
            return [list(self.tokenize(doc)) for doc in X]

    def tokenize(self, document):
        if self.stemmer:
            # stem, then split on whitespace
            for token in self.stemmer.stem(document).split():
                if token not in self.stopwords:
                    yield token
        else:
            for token in document.lower().split():
                if token not in self.stopwords:
                    yield token
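Because the class implements the scikit-learn transformer interface, it can sit inside a Pipeline. A sketch under assumed toy data (docs, labels are placeholders), using an identity analyzer so TfidfVectorizer consumes the token lists as-is:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

model = Pipeline([
    ('preprocess', SimpleIndonesianPreprocessor(verbose=False)),
    # transform() yields lists of tokens, so bypass tfidf's own tokenizer
    ('tfidf', TfidfVectorizer(analyzer=lambda tokens: tokens)),
    ('clf', LogisticRegression()),
])
model.fit(docs, labels)           # docs: raw Indonesian strings, labels: classes
predictions = model.predict(docs)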
class TrainingData:
    nama = []
    factory = StemmerFactory()
    factory_remove_word = StopWordRemoverFactory()
    stemmer = factory.create_stemmer()
    stopword = factory_remove_word.create_stop_word_remover()

    def read_file_data(url_args):
        # file = textract.process(Uri, method="tesseract")
        file = open(url_args, 'rb')
        return file

    def clean_words(words_args, stemmer=stemmer, stopword_args=stopword):
        clean_words = re.sub(r"[(){}<>\",\-*0-9;']", " ", words_args)
        stemmed_word = stemmer.stem(clean_words)
        output = stopword_args.remove(stemmed_word)
        return output
class FeatureAnnotator:
    def __init__(self):
        self.nlp = stanza.Pipeline("id", use_gpu=False)
        self.stemmer = StemmerFactory().create_stemmer()
        self.ner = get_entities
        # set up the POS tagger
        self.pos_tagger = nltk.tag.CRFTagger()
        self.pos_tagger.set_model_file(
            'pretrained/pos_tagger/all_indo_man_tag_corpus_model.crf.tagger')

    def annotate(self, sentence):
        annotation = defaultdict(list)
        sentence = sentence.translate(str.maketrans('', '', string.punctuation))
        doc = self.nlp(sentence)
        annotation['ner_tags'] = self.ner(sentence)
        word_dict = defaultdict(int)
        for sent in doc.sentences:
            for idx, word in enumerate(sent.words):
                annotation['tokens'].append(word.text)
                stemmed_word = self.stemmer.stem(word.text)
                # keep named entities (persons, organisations) unstemmed
                if annotation['ner_tags'][idx] in ['PER', 'ORG']:
                    stemmed_word = word.text.lower()
                annotation['lemmas'].append(
                    stemmed_word + '_{}'.format(word_dict[stemmed_word]))
                annotation['dependency'].append(
                    dict(relation=word.deprel, head=word.head))
        annotation['pos_tags'] = [
            tag[1] for tag in self.pos_tagger.tag(annotation['tokens'])
        ]
        return annotation
class Test_StemmerFactoryTest(unittest.TestCase):
    def setUp(self):
        self.factory = StemmerFactory()
        return super(Test_StemmerFactoryTest, self).setUp()

    def test_createStemmerReturnStemmer(self):
        stemmer = self.factory.create_stemmer()
        self.assertIsNotNone(stemmer)
        # self.assertIsInstance(stemmer, Stemmer)

    def test_fungsional(self):
        factory = StemmerFactory()
        stemmer = factory.create_stemmer()
        sentence = 'malaikat-malaikat-Nya'
        expected = 'malaikat'
        output = stemmer.stem(sentence)
        if output != expected:
            raise AssertionError('output is {} instead of {}'.format(output, expected))

    def test_getWordsFromFile(self):
        factory = StemmerFactory()
        factory.get_words_from_file()
def rmStem(pars):
    factory = StemmerFactory()
    stripped = strip_tags(pars)
    stemmer = factory.create_stemmer()
    clean = stemmer.stem(str(stripped))  # stemming
    return clean
def setUp(self):
    stemmerFactory = StemmerFactory()
    self.stemmer = stemmerFactory.create_stemmer()
    return super(Test_StemmerTest, self).setUp()
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from collections import Counter

# account IDs for
# ['@IndosatCare','@Telkomsel','@myXLCare','@triindonesia','@myXL','@IM3Ooredoo','@AXISgsm','@ask_AXIS','@simPATI']
akun = ['548904824', '255409050', '480224156', '63433517', '82552414',
        '61379637', '79994423', '47251716', '260043508']
kata_kunci = ['lambat', 'lelet', 'lola', 'lemot', 'koneksi', 'gsm', '3g',
              '4g', 'hsdpa', 'edge', 'jaring', 'ganggu']

cred = credentials.Certificate('kunci2.json')
firebase_admin.initialize_app(cred)
db = firestore.client()
tweet_ref = db.collection('Tweet')
kata_ref = db.collection("kata_kunci")
last_ref = db.collection("lasttweet")

factory = StemmerFactory()
stemmer = factory.create_stemmer()


def tweetstruct(user, text, t):
    data = {
        'username': user,
        'text': text,
        'time': t,
    }
    return data


def storetweet(id, input):
    try:
        ref = tweet_ref.document(id)