import re
import string

from nltk.tokenize import word_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory


def preprocessing(dataset):
    stemmer = StemmerFactory().create_stemmer()
    stopwords = StopWordRemoverFactory().create_stop_word_remover()
    for row in dataset:
        row['message'] = row.get('message').casefold()
        row['message'] = re.sub(r"[0-9]", "", row.get('message'))
        # escape the punctuation so it forms a safe character class
        row['message'] = re.sub('[' + re.escape(string.punctuation) + ']', "", row.get('message'))
        row['message_stopwords'] = stopwords.remove(row['message'])
        row['message_stemmed'] = stemmer.stem(row['message_stopwords'])
        row['message_tokenized'] = word_tokenize(row['message_stemmed'])
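# --- Usage sketch (not from the original source): a hypothetical two-row
# dataset showing how preprocessing() mutates each dict in place.
# Requires Sastrawi and the NLTK 'punkt' tokenizer data.
if __name__ == "__main__":
    sample = [
        {'message': "Selamat pagi, apakah Anda sudah makan 2 kali hari ini?"},
        {'message': "Saya sedang belajar pemrograman Python!"},
    ]
    preprocessing(sample)
    for row in sample:
        print(row['message_tokenized'])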
def train(self):
    """
    NOTE: Implement your training procedure in this method.
    """
    # read data.csv using pandas and drop NaN rows
    data = pd.read_csv("data.csv").dropna()
    # get article_content and transform to a list
    contents = data["article_content"].values.tolist()
    # get article_topic and transform to a list
    topics = data["article_topic"].values.tolist()
    # tokenizer that also drops punctuation
    tokenizer = RegexpTokenizer(r'\w+')
    # stop word removal for Bahasa Indonesia
    stopword = StopWordRemoverFactory().create_stop_word_remover()
    # list to collect the cleaned contents
    clean_contents = list()
    # loop over the contents and preprocess each one
    for content in contents:
        # case folding: lowercase the sentence
        lowcase_word = content.lower()
        # remove stop words from the content
        stop_word = stopword.remove(lowcase_word)
        # tokenize the content
        sentence_token = tokenizer.tokenize(stop_word)
        # collect the clean tokens (already lowercased above)
        clean_tokens = list()
        for token in sentence_token:
            clean_tokens.append(token)
        # join the tokens back into a sentence
        sentence = " ".join(clean_tokens)
        # append the clean sentence
        clean_contents.append(sentence)
    # count vectorizer
    X_train_counts = self.count_vect.fit_transform(clean_contents)
    # create tf-idf features from the counts
    X_train_tfidf = self.tfidf_transformer.fit_transform(X_train_counts)
    # split data into train and test sets: 10% test, 90% train
    X_train, X_test, y_train, y_test = train_test_split(X_train_tfidf, topics, test_size=0.1)
    # train the model
    self.svm_clf.fit(X_train, y_train)
    # predict on X_test
    prediction = self.svm_clf.predict(X_test)
    # model accuracy on X_test
    accuracy = accuracy_score(y_test, prediction)
    # print accuracy
    print(accuracy)
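# --- Assumed context (not in the original snippet): train() references
# self.count_vect, self.tfidf_transformer, and self.svm_clf without showing
# where they come from, and relies on module-level imports of pandas,
# nltk.tokenize.RegexpTokenizer, sklearn's train_test_split/accuracy_score,
# and Sastrawi. A minimal, hypothetical constructor consistent with the
# scikit-learn calls above might look like this:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.svm import LinearSVC


class TopicClassifier:  # hypothetical name; the real class is not shown
    def __init__(self):
        self.count_vect = CountVectorizer()
        self.tfidf_transformer = TfidfTransformer()
        self.svm_clf = LinearSVC()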
def __init__(self, input, file_location):
    data = self.dataFromFile(file_location)
    stopword = StopWordRemoverFactory().create_stop_word_remover()
    stemmer = StemmerFactory().create_stemmer()
    input = stopword.remove(input.lower())
    input = stemmer.stem(input)
    valid = 0
    for i in range(len(data)):
        kal = stopword.remove(data[i][0].lower())
        kal = stemmer.stem(kal)
        if self.bm(input.lower(), kal.lower()) != -1:
            if valid == 0:
                percent = len(input) * 100 / len(kal)
                # print("Confidence1 : " + str(percent))
                if percent > 80:
                    self.answere = data[i][1]
                    valid = 1
        else:
            if valid == 0:
                if self.bm2(input.lower(), kal.lower()) >= 80:
                    # print("Confidence2 : " + str(self.bm2(input.lower(), kal.lower())))
                    self.answere = data[i][1]
                    valid = 1
def clean_text(self, data):
    stopword = StopWordRemoverFactory().create_stop_word_remover()
    stemmer = StemmerFactory().create_stemmer()
    data = re.sub('[^a-zA-Z]', ' ', str(data).lower())
    # normalize slang and abbreviations (raw strings so \b is a word
    # boundary rather than the backspace character)
    data = re.sub(r'\byok\b|\byuk\b', 'ayo', data)
    data = re.sub(r'\bmager\b', 'males', data)
    data = re.sub(r'\bmalas\b', 'males', data)
    data = re.sub(r'\bmls\b', 'males', data)
    data = re.sub(r'\bkuy\b', 'yuk', data)
    data = re.sub(r'\borg\b', 'orang', data)
    data = re.sub(r'\bjg\b', 'juga', data)
    data = re.sub(r'\budh\b', 'sudah', data)
    data = re.sub(r'\bmangat\b', 'semangat', data)
    data = re.sub(r'\bcemungut\b', 'semangat', data)
    data = re.sub(r'\bgas\b', 'yuk', data)
    data = re.sub(r'\benakeun\b', 'enak', data)
    data = re.sub(r'\bnaek\b', 'naik', data)
    data = re.sub(r'\bmmg\b', 'memang', data)
    data = re.sub(r'\bga\b', 'engga', data)
    data = re.sub(r'\bengga\b', 'tidak', data)
    data = re.sub(r'\bttg\b', 'tentang', data)
    data = re.sub(r'\brush hour\b', 'jam sibuk', data)
    data = re.sub(r'\bku\b', 'aku', data)
    data = re.sub(r'\bgak\b', 'tidak', data)
    data = re.sub(r'\bdgn\b', 'dengan', data)
    data = re.sub(r'\bbailk\b', 'pulang', data)
    data = re.sub(r'\bgatau\b', 'tidak tahu', data)
    data = re.sub(r'\bbat\b', 'banget', data)
    data = re.sub(r'\bampe\b', 'sampai', data)
    data = re.sub(r'\blg\b', 'sedang', data)
    data = re.sub(r'\banjay\b', 'asik', data)
    data = re.sub(r'\banjg\b', 'anjing', data)
    data = re.sub(r'\banjiing\b', 'anjing', data)
    data = re.sub(r'\bantum\b', 'kamu', data)
    data = re.sub(r'\basiq\b|\basyique\b|\basik\b', 'asyik', data)
    data = re.sub(r'\bbgt\b|\bbanget\b|\bbanged\b', 'sangat', data)
    data = re.sub(r'\bribet\b', 'repot', data)
    # collapse repeated whitespace
    data = ' '.join(data.split())
    # after normalization, remove stop words and affixes below
    # Sastrawi stop word removal (the stop word list ships with Sastrawi)
    data = stopword.remove(data)
    # Sastrawi stemming
    data = stemmer.stem(data)
    return data
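# --- Design note (not from the original source): the chain of re.sub calls
# above can also be written as a single pass over a slang dictionary, which is
# easier to extend. A minimal sketch; SLANG_MAP and normalize_slang are
# hypothetical names with sample entries only.
import re

SLANG_MAP = {'yok': 'ayo', 'yuk': 'ayo', 'mager': 'males', 'udh': 'sudah'}

def normalize_slang(text):
    # replace whole words found in SLANG_MAP, leave everything else untouched
    pattern = r'\b(' + '|'.join(map(re.escape, SLANG_MAP)) + r')\b'
    return re.sub(pattern, lambda m: SLANG_MAP[m.group(1)], text)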
import string

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory


def preprocess_text(input):
    # lowercase all characters in the text
    text = input[0]
    text = text.lower()
    # remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))
    # remove leading and trailing whitespace
    text = text.strip()
    # remove stop words
    stopword = StopWordRemoverFactory().create_stop_word_remover()
    text = stopword.remove(text)
    # stemming
    stemmer = StemmerFactory().create_stemmer()
    text = stemmer.stem(text)
    return text
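# --- Usage sketch (not from the original source): preprocess_text expects a
# sequence whose first element is the raw text, so a one-element list works.
if __name__ == "__main__":
    print(preprocess_text(["Saya SEDANG membaca buku-buku pemrograman!"]))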
import string

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory


class Preprocess:
    def __init__(self):
        self.stemmer = StemmerFactory().create_stemmer()
        self.remover = StopWordRemoverFactory().create_stop_word_remover()

    def preprocess(self, text):
        # 1. stemming
        text_stem = self.stemmer.stem(text)
        # 2. remove stop words
        text_clean = self.remover.remove(text_stem)
        # 3. tokenization
        # 3.1 lowercase
        lowercase = text_clean.lower()
        # 3.2 strip punctuation and split into tokens
        # (str.translate(None, ...) is Python 2 only; use str.maketrans)
        preprocessed_text = lowercase.translate(
            str.maketrans("", "", string.punctuation)).split()
        return preprocessed_text
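# --- Usage sketch (not from the original source):
if __name__ == "__main__":
    pre = Preprocess()
    print(pre.preprocess("Mereka sedang bermain bola di lapangan."))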
def respond(strg):
    levenshtein = Levenshtein()
    stemmer = StemmerFactory().create_stemmer()
    stopwords = StopWordRemoverFactory().create_stop_word_remover()
    kategori = model.predict([strg])
    txt = stopwords.remove(strg)
    txt = stemmer.stem(txt)
    best = 1000
    res = []
    for words in dataset:
        if words['category'] == kategori:
            distance = levenshtein.distance(txt, words['message_stemmed'])
            if distance < best:
                best = distance
                res = words
    return res['respond']
def index(hashs, terms):
    for word in terms:
        if word in hashs:
            hashs[word] += 1
        else:
            hashs[word] = 1


print('Indexing ...')
for path in sorted(IN_DIR.glob('*/*.html')):
    with open(path.resolve(), 'r', encoding='utf-8') as file:
        df[path.name] = dict()
        content = get_text(['title', 'top', 'middle', 'bottom'], file.read())
        content = content.translate(str.maketrans('', '', punctuation))
        content = stopword.remove(content)
        terms = stemmer.stem(content.lower()).split()
        index(df[path.name], terms)
        index(tf, terms)
print('Indexing done!\n')

print('Calculating idf for terms...')
for term, freq in tf.items():
    df_i = 0
    for doc, tf_doc in df.items():
        df_i += 1 if term in tf_doc else 0
    idf[term] = (1 + math.log2(len(df) / df_i)) if df_i != 0 else 1
print('Calculated!\n')

with open(BASE_DIR / 'words_score.txt', 'w', encoding='utf-8') as file:
import re
from typing import List

import networkx as nx
import nltk
import numpy as np
from nltk.cluster.util import cosine_distance
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory


class TextSummarizer:
    def __init__(self, title: str, plot: str, human_synopsis: str):
        self.title = title
        self.plot = plot
        self.human_synopsis = human_synopsis
        self.stopwords = StopWordRemoverFactory().create_stop_word_remover()
        self.stemmer = StemmerFactory().create_stemmer()

    def __text_to_sentences(self, text: str) -> List[str]:
        regex = re.compile(r'\.\n\n|\.\n|\. |\.$')
        sentences = regex.split(text)
        return sentences

    def __stem_sentence(self, sentence: str) -> str:
        return self.stemmer.stem(sentence)

    def __stop_word_removal(self, words: List[str]) -> List[str]:
        temp_words = []
        for word in words:
            if word.lower() in self.title.lower():
                temp_words.append(word)
            else:
                temp = self.stopwords.remove(word)
                if temp:
                    temp_words.append(temp)
        return temp_words

    def __preprocess_text(self, text: str) -> tuple:
        temp_sentences = self.__text_to_sentences(text)
        sentences = []
        preprocessed_sentences = []
        for sentence in temp_sentences:
            if len(sentence) < 2:
                continue
            stemmed_sentence = self.__stem_sentence(sentence.lower())
            tokenized_sentence = nltk.tokenize.word_tokenize(stemmed_sentence)
            removed_stop_word_sentence = self.__stop_word_removal(
                tokenized_sentence)
            if len(removed_stop_word_sentence) < 2:
                continue
            sentences.append(sentence)
            preprocessed_sentences.append(removed_stop_word_sentence)
        return sentences, preprocessed_sentences

    def __sentence_similarity(self, sent1, sent2):
        """
        Calculate the similarity between two sentences.
        Returns the similarity (1 - cosine distance).
        """
        sent1 = [w.lower() for w in sent1]
        sent2 = [w.lower() for w in sent2]
        all_words = list(set(sent1 + sent2))
        vector1 = [0] * len(all_words)
        vector2 = [0] * len(all_words)
        # build the vector for the first sentence
        for w in sent1:
            vector1[all_words.index(w)] += 1
        # build the vector for the second sentence
        for w in sent2:
            vector2[all_words.index(w)] += 1
        return 1 - cosine_distance(vector1, vector2)

    def __build_similarity_matrix(self, sentences):
        """
        Build a matrix of pairwise similarities between sentences.
        Returns the matrix.
        """
        # create an empty similarity matrix
        similarity_matrix = np.zeros((len(sentences), len(sentences)))
        for idx1 in range(len(sentences)):
            for idx2 in range(len(sentences)):
                if idx1 == idx2:  # skip comparing a sentence with itself
                    continue
                similarity_matrix[idx1][idx2] = self.__sentence_similarity(
                    sentences[idx1], sentences[idx2])
        return similarity_matrix

    def summarize(self, top_n=5):
        # Step 1 - text preprocessing
        plot_sentences, plot_pre_sentences = self.__preprocess_text(self.plot)
        # Step 2 - generate the similarity matrix across sentences
        sentence_similarity_martix = self.__build_similarity_matrix(
            plot_pre_sentences)
        print(sentence_similarity_martix)
        # Step 3 - rank sentences in the similarity matrix
        sentence_similarity_graph = nx.from_numpy_array(
            sentence_similarity_martix)
        plot_scores = nx.pagerank(sentence_similarity_graph)
        # Step 4 - sort the ranks and pick the top sentences
        ranked_sentence = []
        for i in range(len(plot_scores)):
            ranked_sentence.append([plot_scores[i], plot_sentences[i], i])
        ranked_sentence.sort(key=lambda x: x[0], reverse=True)
        top_n = min(top_n, len(plot_sentences))
        summary = ranked_sentence[0:top_n]
        summary.sort(key=lambda x: x[2])
        summary = [i[1] for i in summary]
        summarize_text = ""
        for i in range(top_n):
            summarize_text += "".join(summary[i]) + ". "
        # Step 5 - of course, output the summarized text
        return summarize_text

    @staticmethod
    def generate_from_file(title, plotfilepath, synopsisfilepath):
        plot = ""
        synopsis = ""
        with open(plotfilepath, "r") as plot_file:
            plot = plot_file.read()
        with open(synopsisfilepath, "r") as synopsis_file:
            synopsis = synopsis_file.read()
        ts = TextSummarizer(title, plot, synopsis)
        return ts.summarize()
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory, StopWordRemover, ArrayDictionary
from string_matching_algorithm import *
import re as regex

# factory = StopWordRemoverFactory()
newStopFactory = StopWordRemoverFactory().get_stop_words()
newStopFactory.remove("sampai")
newStopFactory.remove("dan")
newStopFactory.append("deadline")
newStopFactory.append("mengenai")
newStopFactory.append("tanggal")
stopword = StopWordRemover(ArrayDictionary(newStopFactory))

# Regex for month names
JANUARI_REGEX = '[Jj]an(?:uari)?'
FEBRUARI_REGEX = '[Ff]eb(?:ruari)?'
MARET_REGEX = '[Mm]ar(?:et)?'
APRIL_REGEX = '[Aa]pr(?:il)?'
MEI_REGEX = '[Mm]ei'
JUNI_REGEX = '[Jj]uni?'
JULI_REGEX = '[Jj]uli?'
AGUSTUS_REGEX = '[Aa]gu(?:stus)?'
SEPTEMBER_REGEX = '[Ss]ep(?:tember)?'
OKTOBER_REGEX = '[Oo]kt(?:ober)?'
NOVEMBER_REGEX = '[Nn]ov(?:ember)?'
DESEMBER_REGEX = '[Dd]es(?:ember)?'

# Regex pieces for a complete date
ANYTHING = '.*'
DAY_REGEX = '(0[1-9]|[1-2][0-9]|3[0-1])'
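# --- Usage sketch (not from the original source): one way to combine the
# pieces above into a date pattern; BULAN_REGEX and DATE_REGEX are
# hypothetical names, not part of the original module.
BULAN_REGEX = '(?:' + '|'.join([
    JANUARI_REGEX, FEBRUARI_REGEX, MARET_REGEX, APRIL_REGEX, MEI_REGEX,
    JUNI_REGEX, JULI_REGEX, AGUSTUS_REGEX, SEPTEMBER_REGEX, OKTOBER_REGEX,
    NOVEMBER_REGEX, DESEMBER_REGEX]) + ')'
DATE_REGEX = DAY_REGEX + r'\s+' + BULAN_REGEX

if __name__ == "__main__":
    # expect a match on "15 Januari"
    print(regex.search(DATE_REGEX, 'deadline tugas 15 Januari 2021'))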
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory


def rem_stop_words(text):
    sw_rem = StopWordRemoverFactory().create_stop_word_remover()
    return sw_rem.remove(text)
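# --- Usage sketch (not from the original source): common function words such
# as "yang" and "di" should be dropped by Sastrawi's default stop word list.
if __name__ == "__main__":
    print(rem_stop_words("buku yang ada di atas meja"))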
class SpamClassifier(object):
    def __init__(self, tweets, labels):
        self.tweets, self.labels = tweets, labels
        self.clean_tweets = []
        self.conv_tweets = []
        self.stem_tweets = []
        self.processed_tweets = []
        self.spam_tweets, self.ham_tweets = (labels == 1).sum(), (labels == 0).sum()
        self.total_tweets = len(self.tweets)
        self.testdata = []
        self.testdata_terproses = []
        self.vocab1 = list()
        self.vocab2 = list()
        self.vocab3 = list()
        self.prior_spam = 0.0
        self.prior_ham = 0.0
        self.tf_spam1 = dict()
        self.tf_ham1 = dict()
        self.tf_spam2 = dict()
        self.tf_ham2 = dict()
        self.tf_spam3 = dict()
        self.tf_ham3 = dict()
        self.dfw1 = dict()
        self.dfw2 = dict()
        self.dfw3 = dict()
        self.pwtfidf_spam1 = dict()
        self.pwtfidf_ham1 = dict()
        self.pwtfidf_spam2 = dict()
        self.pwtfidf_ham2 = dict()
        self.pwtfidf_spam3 = dict()
        self.pwtfidf_ham3 = dict()
        self.stemmer = StemmerFactory().create_stemmer()
        self.stop = StopWordRemoverFactory().create_stop_word_remover()
        self.stop.dictionary.add('lasturladdr')
        self.stop.dictionary.add('rt')

    def praproses(self):
        for i in range(len(self.tweets)):
            self.clean_tweets.append(clean_text(self.tweets[i]))
            self.conv_tweets.append(konversi(self.clean_tweets[i], pengganti))
            self.stem_tweets.append(self.stemmer.stem(self.conv_tweets[i]))
            self.processed_tweets.append(self.stop.remove(self.stem_tweets[i]))

    def praprosestext(self, teks):
        cteks = clean_text(teks)
        konv_teks = konversi(cteks, pengganti)
        stemteks = self.stemmer.stem(konv_teks)
        nosw_teks = self.stop.remove(stemteks)
        return nosw_teks

    def hitungTFDF(self):
        for i in range(self.total_tweets):
            dfw = {}
            tfunigram = createToken(self.processed_tweets[i], gram=1)
            for word in tfunigram:
                if dfw.get(word, 0) == 0:
                    # count the documents containing word
                    dfw[word] = 1
                    self.dfw1[word] = self.dfw1.get(word, 0) + 1
                if self.labels[i]:
                    self.tf_spam1[word] = self.tf_spam1.get(word, 0) + 1
                else:
                    self.tf_ham1[word] = self.tf_ham1.get(word, 0) + 1
            dfw = {}
            tfbigram = createToken(self.processed_tweets[i], gram=2)
            for word in tfbigram:
                if dfw.get(word, 0) == 0:
                    # count the documents containing word
                    dfw[word] = 1
                    self.dfw2[word] = self.dfw2.get(word, 0) + 1
                if self.labels[i]:
                    self.tf_spam2[word] = self.tf_spam2.get(word, 0) + 1
                else:
                    self.tf_ham2[word] = self.tf_ham2.get(word, 0) + 1
            dfw = {}
            tftrigram = createToken(self.processed_tweets[i], gram=3)
            for word in tftrigram:
                if dfw.get(word, 0) == 0:
                    # count the documents containing word
                    dfw[word] = 1
                    self.dfw3[word] = self.dfw3.get(word, 0) + 1
                if self.labels[i]:
                    self.tf_spam3[word] = self.tf_spam3.get(word, 0) + 1
                else:
                    self.tf_ham3[word] = self.tf_ham3.get(word, 0) + 1
        self.vocab1 = list(dict(self.tf_spam1, **self.tf_ham1).keys())
        self.vocab2 = list(dict(self.tf_spam2, **self.tf_ham2).keys())
        self.vocab3 = list(dict(self.tf_spam3, **self.tf_ham3).keys())

    def train(self):
        self.praproses()
        self.hitungTFDF()
        self.prior_spam = self.spam_tweets / self.total_tweets
        self.prior_ham = self.ham_tweets / self.total_tweets
        # compute tf-idf weights for 1-grams and 2-grams
        # ---- 1-gram ----
        for word in self.tf_spam1:
            self.pwtfidf_spam1[word] = self.tf_spam1[word] \
                * log10(len(self.tweets) / self.dfw1[word])
        for word in self.tf_ham1:
            self.pwtfidf_ham1[word] = self.tf_ham1[word] \
                * log10(len(self.tweets) / self.dfw1[word])
        # ---- 2-gram ----
        for word in self.tf_spam2:
            self.pwtfidf_spam2[word] = self.tf_spam2[word] \
                * log10(len(self.tweets) / self.dfw2[word])
        for word in self.tf_ham2:
            self.pwtfidf_ham2[word] = self.tf_ham2[word] \
                * log10(len(self.tweets) / self.dfw2[word])

    def classify1(self, text, metode):
        self.metode = metode + '1gr'
        proses_text = self.praprosestext(text)
        self.testdata_terproses.append(proses_text)
        token = createToken(proses_text, gram=1)
        pSpam = log10(self.prior_spam)
        pHam = log10(self.prior_ham)
        for word in token:
            # compute the spam probability
            if metode == 'tfidf':
                pSpam += log10(self.pwtfidf_spam1.get(word, 1) + 1)
                pSpam -= log10(
                    sum(self.pwtfidf_spam1.values()) + len(self.tf_spam1))
            if metode == 'bow':
                pSpam += log10(
                    (self.tf_spam1.get(word, 0) + 1)
                    / (sum(self.tf_spam1.values()) + len(self.vocab1)))
            # compute the ham probability
            if metode == 'tfidf':
                pHam += log10(self.pwtfidf_ham1.get(word, 1) + 1)
                pHam -= log10(
                    sum(self.pwtfidf_ham1.values()) + len(self.tf_ham1))
            if metode == 'bow':
                pHam += log10((self.tf_ham1.get(word, 0) + 1)
                              / (sum(self.tf_ham1.values()) + len(self.vocab1)))
        # print("pSpam: ", pSpam, " pHam: ", pHam)
        return pSpam >= pHam

    def classify2(self, text, metode):
        self.metode = metode + '2gr'
        proses_text = self.praprosestext(text)
        self.testdata_terproses.append(proses_text)
        token = createToken(proses_text, gram=2)
        pSpam = log10(self.prior_spam)
        pHam = log10(self.prior_ham)
        for word in token:
            # compute the spam probability
            if metode == 'tfidf':
                pSpam += log10(self.pwtfidf_spam2.get(word, 1) + 1)
                pSpam -= log10(
                    sum(self.pwtfidf_spam2.values()) + len(self.tf_spam2))
            else:
                pSpam += log10(
                    (self.tf_spam2.get(word, 0) + 1)
                    / (sum(self.tf_spam2.values()) + len(self.vocab2)))
            # compute the ham probability
            if metode == 'tfidf':
                pHam += log10(self.pwtfidf_ham2.get(word, 1) + 1)
                pHam -= log10(
                    sum(self.pwtfidf_ham2.values()) + len(self.tf_ham2))
            else:
                pHam += log10((self.tf_ham2.get(word, 0) + 1)
                              / (sum(self.tf_ham2.values()) + len(self.vocab2)))
        # print('pSpam: ', pSpam, ' pHam: ', pHam)
        return pSpam >= pHam

    def sbclassify(self, text):
        self.metode = 'stupidbackoff'
        proses_text = self.praprosestext(text)
        self.testdata_terproses.append(proses_text)
        hamscore = 0.0
        spamscore = 0.0
        words = createToken(proses_text, gram=2)
        for word in words:
            wordtoken = word.split()
            tokenprev = wordtoken[0]
            tokennext = wordtoken[1]
            if word in self.tf_ham2:
                bicount = self.tf_ham2[word]
                bi_unicount = self.tf_ham1[tokenprev]
                hamscore += log10(bicount)
                hamscore -= log10(bi_unicount)
            else:
                if tokennext in self.tf_ham1:
                    unicount = self.tf_ham1[tokennext]
                else:
                    unicount = 0.4
                hamscore += log10(0.4)
                hamscore += log10(unicount)
                hamscore -= log10(
                    sum(self.tf_ham1.values()) + len(self.vocab1))
            if word in self.tf_spam2:
                bicount2 = self.tf_spam2[word]
                bi_unicount2 = self.tf_spam1[tokenprev]
                spamscore += log10(bicount2)
                spamscore -= log10(bi_unicount2)
            else:
                if tokennext in self.tf_spam1:
                    unicount2 = self.tf_spam1[tokennext]
                else:
                    unicount2 = 0.4
                spamscore += log10(0.4)
                spamscore += log10(unicount2)
                spamscore -= log10(
                    sum(self.tf_spam1.values()) + len(self.vocab1))
        # spamscore += log10(self.prior_spam)
        # hamscore += log10(self.prior_ham)
        return spamscore >= hamscore

    def predict(self, test_data, metode, gram):
        '''metode:
             stbo  = stupid backoff
             bow   = bag of words
             tfidf = with tf-idf
        '''
        self.testdata = []
        self.testdata_terproses = []
        result = dict()
        if metode == 'stbo':
            for (i, tweet) in enumerate(test_data):
                result[i] = int(self.sbclassify(tweet))
        else:
            if gram == 1:
                for (i, tweet) in enumerate(test_data):
                    result[i] = int(self.classify1(tweet, metode))
            if gram == 2:
                for (i, tweet) in enumerate(test_data):
                    result[i] = int(self.classify2(tweet, metode))
        return result

    def metrics(self, labels, predictions, tweets):
        etext = []
        eptext = []
        elabel = []
        true_pos, true_neg, false_pos, false_neg = 0, 0, 0, 0
        for i in range(len(labels)):
            true_pos += int(labels[i] == 1 and predictions[i] == 1)
            true_neg += int(labels[i] == 0 and predictions[i] == 0)
            if labels[i] == 0 and predictions[i] == 1:
                false_pos += 1
                etext.append(tweets[i])
                eptext.append(self.praprosestext(tweets[i]))
                elabel.append('fp')
            if labels[i] == 1 and predictions[i] == 0:
                false_neg += 1
                etext.append(tweets[i])
                eptext.append(self.praprosestext(tweets[i]))
                elabel.append('fn')
        edf = pd.DataFrame(list(zip(etext, eptext, elabel)),
                           columns=['text', 'stemmedtext', 'label'])
        filename = 'data/false_' + self.metode + '.xlsx'
        writer = pd.ExcelWriter(filename, engine='xlsxwriter')
        edf.to_excel(writer, sheet_name='Sheet1')
        writer.save()
        precision = true_pos / (true_pos + false_pos)
        recall = true_pos / (true_pos + false_neg)
        Fscore = 2 * precision * recall / (precision + recall)
        accuracy = (true_pos + true_neg) / (true_pos + true_neg
                                            + false_pos + false_neg)
        print("Precision: ", precision)
        print("Recall: ", recall)
        print("F-score: ", Fscore)
        print("Accuracy: ", accuracy)
        print('\n==Confusion Matrix===')
        print("True Positive: ", true_pos)
        print("False Positive: ", false_pos)
        print("True Negative: ", true_neg)
        print("False Negative: ", false_neg)
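# --- Illustration (not from the original source): the 'bow' branch of
# classify1() above is Laplace-smoothed Naive Bayes scoring in log space.
# A standalone toy sketch with hypothetical counts and names:
from math import log10

tf_spam = {'gratis': 4, 'klik': 3}   # toy spam unigram counts
tf_ham = {'rapat': 5, 'besok': 2}    # toy ham unigram counts
vocab = set(tf_spam) | set(tf_ham)

def bow_log_score(tokens, tf, prior):
    # log P(class) + sum over tokens of log of the smoothed P(token | class)
    score = log10(prior)
    for tok in tokens:
        score += log10((tf.get(tok, 0) + 1) / (sum(tf.values()) + len(vocab)))
    return score

tokens = ['klik', 'gratis']
is_spam = bow_log_score(tokens, tf_spam, 0.5) >= bow_log_score(tokens, tf_ham, 0.5)
print(is_spam)  # expected: True for this toy data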
"Halo, namaku Alice.<br>Selamat datang di ChatBot Alice.<br>Tanyakan apapun kepadaku dan aku akan mencoba menjawabnya...<br>...dengan KMP, BM, dan Regex." ) else: stopword = StopWordRemoverFactory().create_stop_word_remover() pertanyaan = [] purePertanyaan = [] jawaban = [] for line in open('pertanyaan.txt').readlines(): i = 0 j = 0 while (line[j] != ' '): j += 1 while (line[i] != '?'): i += 1 purePertanyaan.append((line[j + 1:i + 1])) pertanyaan.append(stopword.remove((line[j + 1:i].lower()))) jawaban.append(line[i + 2:len(line) - 1]) query = re.sub('[%s]' % re.escape(string.punctuation), '', sys.argv[1].lower()) query = stopword.remove(query) querylist = query.split(' ') synonymList = [] for queryWord in querylist: synonymList.append(getSinonim(queryWord)) #dikombinasi sentenceList = [[]] for word in synonymList: newList = [] for synonym in word: for sentence in sentenceList: