class Dataset(object):
    """
    Yield split, tokenized sentences from a TSV dataset.

    Dataset content example (tab-separated):
        first sentence	0
        second sentence	1
    """

    def __init__(self, csv_file, stem=True, stopwords=True, verbose=True):
        self.csv_file = csv_file
        self.stemmer = StemmerFactory().create_stemmer() if stem else None
        self.stopwords = []
        if stopwords:
            with open(STOPWORDS_FILE, 'r') as f:
                self.stopwords = f.read().splitlines()

    def __iter__(self):
        with open(self.csv_file, 'r') as f:
            reader = csv.reader(f, delimiter='\t')
            for row in reader:
                sentence = []
                if self.stemmer:
                    # stem the whole sentence, then split on whitespace
                    for token in self.stemmer.stem(row[0]).split():
                        if token not in self.stopwords:
                            sentence.append(token)
                else:
                    for token in row[0].lower().split():
                        if token not in self.stopwords:
                            sentence.append(token)
                if sentence:
                    yield sentence
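# Hedged usage sketch (not from the original source): stream tokenized,
# stemmed sentences from a TSV file. The path 'sentences.tsv' is hypothetical;
# STOPWORDS_FILE must point at a newline-separated stopword list.
dataset = Dataset('sentences.tsv', stem=True, stopwords=True)
for sentence in dataset:
    print(sentence)  # e.g. ['contoh', 'kalimat']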
def Preprocessing(data):
    print("Preprocessing")
    cleanData = []
    tokenizer = RegexpTokenizer(r'\w+')
    factory_stopwords = StopWordRemoverFactory()
    stopwordsFact = factory_stopwords.get_stop_words()
    stemmer = StemmerFactory().create_stemmer()
    count = 0
    for kalimat in data:
        removedHttp = re.sub(r"http\S+", '', kalimat)             # remove http(s) links
        removedPic = re.sub(r"pic\.twitter\S+", '', removedHttp)  # remove pic.twitter links
        lower = removedPic.lower()                                # case folding
        tokenized = tokenizer.tokenize(lower)                     # tokenization + punctuation removal
        stopwords = []
        for kata in tokenized:                                    # stopword removal
            if kata not in stopwordsFact:
                stopwords.append(kata)
        stemmed = []
        for kata in stopwords:                                    # stemming
            stemmed.append(stemmer.stem(kata))
        cleanData.append(stemmed)
        count += 1
        print(count)
    return cleanData
def cleanText(T, fix={}, lemma=False, lan='id', stops=set(),
              symbols_remove=True, min_charLen=2, max_charLen=15,
              fixTag=False, fixMix=True):
    if lemma and lan.lower().strip() == 'id':
        lemma = StemmerFactory().create_stemmer()
    pattern = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )
    t = re.sub(pattern, ' ', T)  # remove http(s) URLs if any
    pattern = re.compile(
        r'ftp[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )
    t = re.sub(pattern, ' ', t)  # remove ftp(s) URLs if any
    t = unescape(t)  # fix HTML entities
    if fixTag:
        t = fixTags(t)  # fix camelCase tags such as abcDef
    t = t.lower().strip()  # lowercase
    t = unidecode(t)
    t = ''.join(''.join(s)[:2] for _, s in itertools.groupby(t))  # remove character repetition
    t = t.replace('\n', ' ').replace('\r', ' ')
    t = sent_tokenize(t)  # sentence segmentation: string to list
    for i, K in enumerate(t):
        if symbols_remove:
            K = re.sub(r'[^.,_a-zA-Z0-9 -\.]', ' ', K)
        if lemma and lan.lower().strip() == 'id':
            listKata = [str(tok) for tok in TextBlob(lemma.stem(K)).words]
        elif lemma and lan.lower().strip() == 'en':
            listKata = [str(tok.lemma_) for tok in nlp_en(K)]
        else:
            listKata = [str(tok) for tok in TextBlob(K).words]
        if fix:
            for j, token in enumerate(listKata):
                if token in fix.keys():
                    listKata[j] = fix[token]
        if stops:
            listKata = [
                tok for tok in listKata
                if tok not in stops and len(tok) >= min_charLen
            ]
        else:
            listKata = [tok for tok in listKata if len(tok) >= min_charLen]
        listKataFixed = []
        if fixMix:
            for j, tok_ in enumerate(listKata):
                listKataFixed += re.split(r'(\d+)', tok_)
            listKata = listKataFixed
        t[i] = ' '.join(listKata)
    return ' '.join(t)  # join the sentences back into a single string
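# Hedged usage sketch (illustrative only): clean a raw Indonesian tweet with
# cleanText. The `fix` dictionary and stopword set below are made-up examples,
# not resources from the original project.
contoh = 'Keren gak sih?? Cek http://contoh.id !!'
print(cleanText(contoh, fix={'gak': 'tidak'}, lemma=True, lan='id',
                stops={'yang', 'dan'}))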
class Preprocessor:
    def __init__(self):
        self.stopwords = StopWordRemoverFactory().get_stop_words()
        self.stemmer = StemmerFactory().create_stemmer()

    def clean(self, words):
        # strip common punctuation characters
        return words.translate(str.maketrans("", "", ".,!?\"'#@%&/();:"))

    def stemming(self, words):
        return self.stemmer.stem(self.clean(words))

    def tokenizing(self, str, delimiter=" "):
        return str.split(delimiter)

    def preprocess(self, words):
        return [
            token for token in self.tokenizing(self.stemming(words))
            if token not in self.stopwords
        ]

    def selected_preprocess(self, words, selected_words):
        return [
            token for token in self.tokenizing(self.stemming(words))
            if token not in self.stopwords and token in selected_words
        ]
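# Hedged usage sketch (illustrative only): the whitelist in selected_words is
# a hypothetical vocabulary.
preprocessor = Preprocessor()
print(preprocessor.preprocess('Saya sedang memakan nasi goreng!'))
print(preprocessor.selected_preprocess('Saya sedang memakan nasi goreng!',
                                       selected_words={'makan', 'nasi'}))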
class FeatureAnnotator:
    def __init__(self):
        self.nlp = stanfordnlp.Pipeline(lang="id", use_gpu=False, silent=True)
        self.stemmer = StemmerFactory().create_stemmer()
        self.ner = get_entities
        # Set POS tagger
        self.pos_tagger = nltk.tag.CRFTagger()
        self.pos_tagger.set_model_file('pretrained/pos_tagger/all_indo_man_tag_corpus_model.crf.tagger')

    def annotate(self, sentence):
        annotation = defaultdict(list)
        sentence = sentence.translate(str.maketrans('', '', string.punctuation))
        doc = self.nlp(sentence)
        annotation['ner_tags'] = self.ner(sentence)
        word_dict = defaultdict(int)
        for sent in doc.sentences:
            for idx, word in enumerate(sent.words):
                annotation['tokens'].append(word.text)
                stemmed_word = self.stemmer.stem(word.text)
                if annotation['ner_tags'][idx] in ['PER', 'ORG']:
                    # keep named entities unstemmed, only lowercased
                    stemmed_word = word.text.lower()
                annotation['lemmas'].append(stemmed_word + '_{}'.format(word_dict[stemmed_word]))
                annotation['dependency'].append(dict(relation=word.dependency_relation,
                                                     head=word.governor))
        annotation['pos_tags'] = [tag[1] for tag in self.pos_tagger.tag(annotation['tokens'])]
        return annotation
def preprocess(self, documents):
    print("[{}] Preprocessing...".format(dt.now()))
    stemmer = StemmerFactory().create_stemmer()
    stopwords = self.data.get_stopword()
    formal_dict = self.data.get_formalization()
    formal_pattern = re.compile(r'\b(' + '|'.join(formal_dict.keys()) + r')\b')
    url_pattern = re.compile(
        r'((http[s]?|ftp):\/)?\/?([^:\/\s]+)((\/\w+)*\/)([\w\-\.]+[^#?\s]+)(.*)?(#[\w\-]+)?'
    )
    digit_symbol_pattern = re.compile(r'\d+|[^\w\s]')
    user_handler_pattern = re.compile(r'@\w+')
    processed_docs = []
    print(" ", end='')
    runtime = []
    for doc in documents:
        st = timeit.default_timer()
        new_doc = re.sub(url_pattern, "", doc)
        new_doc = re.sub(user_handler_pattern, "", new_doc)
        new_doc = re.sub(digit_symbol_pattern, " ", new_doc)
        new_doc = new_doc.lower()
        new_doc = formal_pattern.sub(lambda x: formal_dict[x.group()], new_doc)
        new_doc = stemmer.stem(new_doc)
        new_doc = new_doc.split()
        new_doc = [word for word in new_doc if word not in stopwords]
        print('.', end='')
        processed_docs.append(new_doc)
        runtime.append(timeit.default_timer() - st)
    self.data.save_processed_docs(processed_docs)
    print("\nAverage preprocessing time : " + str(sum(runtime) / len(runtime)))
    return processed_docs
def stemming(documents):
    stemmer = StemmerFactory().create_stemmer()
    stemmed = []
    for document in documents:
        words = []
        for word in document:
            words.append(stemmer.stem(word))
        stemmed.append(words)
    return stemmed
def preprocessing(dataset):
    stemmer = StemmerFactory().create_stemmer()
    stopwords = StopWordRemoverFactory().create_stop_word_remover()
    for row in dataset:
        row['message'] = row.get('message').casefold()
        row['message'] = re.sub(r"[0-9]", "", row.get('message'))
        row['message'] = re.sub('[' + string.punctuation + ']', "", row.get('message'))
        row['message_stopwords'] = stopwords.remove(row['message'])
        row['message_stemmed'] = stemmer.stem(row['message_stopwords'])
        row['message_tokenized'] = word_tokenize(row['message_stemmed'])
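# Hedged usage sketch (illustrative only): rows are modified in place, so the
# derived keys appear on the same dicts after the call. The sample row is made up.
sample = [{'message': 'Saya TIDAK suka menunggu 30 menit di halte!'}]
preprocessing(sample)
print(sample[0]['message_stemmed'])
print(sample[0]['message_tokenized'])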
def clean(doc):
    # remove unimportant words (stopwords)
    stop_free = " ".join([i for i in doc.lower().split() if i not in stopword])
    # remove punctuation
    punc_free = ''.join(ch for ch in stop_free if ch not in punctuation)
    # reduce words to their base form (stemming)
    stemmer = StemmerFactory().create_stemmer()
    normalized = stemmer.stem(punc_free)
    # remove digits
    processed = re.sub(r"\d+", "", normalized)
    # turn the document into a list of its words
    y = processed.split()
    return y
def __init__(self, input, file_location):
    data = self.dataFromFile(file_location)
    stopword = StopWordRemoverFactory().create_stop_word_remover()
    stemmer = StemmerFactory().create_stemmer()
    input = stopword.remove(input.lower())
    input = stemmer.stem(input)
    valid = 0
    for i in range(len(data)):
        kal = stopword.remove(data[i][0].lower())
        kal = stemmer.stem(kal)
        if self.bm(input.lower(), kal.lower()) != -1:
            if valid == 0:
                percent = len(input) * 100 / len(kal)
                # print("Confidence1 : " + str(percent))
                if percent > 80:
                    self.answere = data[i][1]
                    valid = 1
        else:
            if valid == 0:
                if self.bm2(input.lower(), kal.lower()) >= 80:
                    # print("Confidence2 : " + str(bm2(input.lower(), kal.lower())))
                    self.answere = data[i][1]
                    valid = 1
def clean_text(self, data):
    stopword = StopWordRemoverFactory().create_stop_word_remover()
    stemmer = StemmerFactory().create_stemmer()
    data = re.sub('[^a-zA-Z]', ' ', str(data).lower())
    # normalize slang / informal words (raw strings so \b is a word boundary)
    data = re.sub(r'\byok\b|\byuk\b', 'ayo', data)
    data = re.sub(r'\bmager\b', 'males', data)
    data = re.sub(r'\bmalas\b', 'males', data)
    data = re.sub(r'\bmls\b', 'males', data)
    data = re.sub(r'\bkuy\b', 'yuk', data)
    data = re.sub(r'\borg\b', 'orang', data)
    data = re.sub(r'\bjg\b', 'juga', data)
    data = re.sub(r'\budh\b', 'sudah', data)
    data = re.sub(r'\bmangat\b', 'semangat', data)
    data = re.sub(r'\bcemungut\b', 'semangat', data)
    data = re.sub(r'\bgas\b', 'yuk', data)
    data = re.sub(r'\benakeun\b', 'enak', data)
    data = re.sub(r'\bnaek\b', 'naik', data)
    data = re.sub(r'\bmmg\b', 'memang', data)
    data = re.sub(r'\bga\b', 'engga', data)
    data = re.sub(r'\bengga\b', 'tidak', data)
    data = re.sub(r'\bttg\b', 'tentang', data)
    data = re.sub(r'\brush hour\b', 'jam sibuk', data)
    data = re.sub(r'\bku\b', 'aku', data)
    data = re.sub(r'\bgak\b', 'tidak', data)
    data = re.sub(r'\bdgn\b', 'dengan', data)
    data = re.sub(r'\bbailk\b', 'pulang', data)
    data = re.sub(r'\bgatau\b', 'tidak tahu', data)
    data = re.sub(r'\bbat\b', 'banget', data)
    data = re.sub(r'\bampe\b', 'sampai', data)
    data = re.sub(r'\blg\b', 'sedang', data)
    data = re.sub(r'\banjay\b', 'asik', data)
    data = re.sub(r'\banjg\b', 'anjing', data)
    data = re.sub(r'\banjiing\b', 'anjing', data)
    data = re.sub(r'\bantum\b', 'kamu', data)
    data = re.sub(r'\basiq\b|\basyique\b|\basik\b', 'asyik', data)
    data = re.sub(r'\bbgt\b|\bbanget\b|\bbanged\b', 'sangat', data)
    data = re.sub(r'\bribet\b', 'repot', data)
    # collapse extra whitespace
    data = data.split()
    data = ' '.join(data)
    # only after the replacements, remove the stopwords and affixes below
    # Sastrawi stopword removal (the stopword list is provided by Sastrawi)
    data = stopword.remove(data)
    # Sastrawi stemming
    data = stemmer.stem(data)
    return data
def preprocess_text(input):
    # the first element of `input` holds the raw text
    text = input[0]
    # lowercase all characters in the text
    text = text.lower()
    # remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))
    # remove leading and trailing whitespace
    text = text.strip()
    # remove stopwords
    stopword = StopWordRemoverFactory().create_stop_word_remover()
    text = stopword.remove(text)
    # stemming
    stemmer = StemmerFactory().create_stemmer()
    text = stemmer.stem(text)
    return text
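# Hedged usage sketch (illustrative only): preprocess_text reads input[0], so
# it expects a sequence whose first element is the raw text.
print(preprocess_text(['  Apakah KAMU sudah makan siang?  ']))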
class Preprocessor():
    def __init__(self):
        self.stopwords = StopWordRemoverFactory().get_stop_words()
        self.stemmer = StemmerFactory().create_stemmer()

    def stemming(self, words):
        return self.stemmer.stem(words)

    def tokenizing(self, str, delimiter=" "):
        return str.split(delimiter)

    def preprocess(self, words):
        return [
            token for token in self.tokenizing(self.stemming(words))
            if token not in self.stopwords
        ]
def extract_text(extracted_path, id_wiki, stem):
    if os.path.isfile(extracted_path):
        return None
    if stem:
        print('Warning : Using stemmer could slow down the extracting progress')
        stemmer = StemmerFactory().create_stemmer()
    with open(extracted_path, 'w') as f:
        i = 0
        for text in id_wiki.get_texts():
            text = ' '.join(text)
            text = stemmer.stem(text) if stem else text
            f.write(text + '\n')
            i += 1
            if i % (10 if stem else 1000) == 0:
                print(str(i), 'articles processed')
        print('total:', str(i))
    return None
class Preprocess:
    def __init__(self):
        self.stemmer = StemmerFactory().create_stemmer()
        self.remover = StopWordRemoverFactory().create_stop_word_remover()

    def preprocess(self, text):
        # 1. stemming
        text_stem = self.stemmer.stem(text)
        # 2. remove stop words
        text_clean = self.remover.remove(text_stem)
        # 3. tokenization
        # 3.1 lowercase, strip punctuation (Python 3 translate), then split
        lowercase = text_clean.lower()
        preprocessed_text = lowercase.translate(
            str.maketrans('', '', string.punctuation)).split()
        return preprocessed_text
def respond(strg):
    levenshtein = Levenshtein()
    stemmer = StemmerFactory().create_stemmer()
    stopwords = StopWordRemoverFactory().create_stop_word_remover()
    kategori = model.predict([strg])
    txt = stopwords.remove(strg)
    txt = stemmer.stem(txt)
    best = 1000
    res = []
    for words in dataset:
        if words['category'] == kategori:
            distance = levenshtein.distance(txt, words['message_stemmed'])
            if distance < best:
                best = distance
                res = words
    return res['respond']
class SimpleIndonesianPreprocessor(BaseEstimator, TransformerMixin):
    """
    Simple Indonesian text preprocessor
    """

    def __init__(self, stem=True, stopwords=True, verbose=True):
        self.stemmer = StemmerFactory().create_stemmer() if stem else None
        self.stopwords = []
        if stopwords:
            with open(STOPWORDS_FILE, 'r') as f:
                self.stopwords = f.read().splitlines()
        self.verbose = verbose

    def fit(self, X, y=None):
        return self

    def inverse_transform(self, X):
        return [" ".join(doc) for doc in X]

    def transform(self, X):
        results = []
        if self.verbose:
            print('Preprocessing..')
            bar = progressbar.ProgressBar()
            for doc in bar(X):
                results.append(list(self.tokenize(doc)))
            return results
        else:
            return [list(self.tokenize(doc)) for doc in X]

    def tokenize(self, document):
        if self.stemmer:
            # stem and split by whitespace
            for token in self.stemmer.stem(document).split():
                if token not in self.stopwords:
                    yield token
        else:
            for token in document.lower().split():
                if token not in self.stopwords:
                    yield token
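# Hedged usage sketch (illustrative only): plug the transformer into a
# scikit-learn Pipeline. The TF-IDF and Naive Bayes stages are assumed
# downstream components, not part of the original class.
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

model = Pipeline([
    ('preprocess', SimpleIndonesianPreprocessor(stem=True, stopwords=True)),
    # identity preprocessor/tokenizer because the documents are already token lists
    ('tfidf', TfidfVectorizer(preprocessor=lambda doc: doc,
                              tokenizer=lambda doc: doc, lowercase=False)),
    ('clf', MultinomialNB()),
])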
class NERFeatureExtractor:
    def read_label_file(self, filename):
        return open(filename).read().split('\n')

    def __init__(self, iob_predictor):
        self.iob_predictor = iob_predictor
        self.stemmer = StemmerFactory().create_stemmer()
        self.TAGGER3 = CRFTagger()
        self.TAGGER3.set_model_file('all_indo_man_tag_corpus_model.crf.tagger')
        self.label_words = self.read_label_file('label-words.txt')
        self.label_posses = self.read_label_file('label-posses.txt')
        self.label_lemmas = self.read_label_file('label-lemmas.txt')
        self.label_iob_feature = self.read_label_file('label-iob_feature.txt')
        self.label_iob_classes = self.read_label_file('label-iob_classes.txt')

    def getPOSTag(self, _temporary_tokens):
        strin = []
        for token_tag in _temporary_tokens:
            strin.append(unicode(token_tag.decode('utf-8')))
        return [(token.encode('ascii', 'ignore'), tag.encode('ascii', 'ignore'))
                for (token, tag) in self.TAGGER3.tag_sents([strin])[0]]

    def features(self, tokens, index, history):
        """
        `tokens`  = a POS-tagged sentence [(w1, t1), ...]
        `index`   = the index of the token we want to extract features for
        `history` = the previous predicted IOB tags
        """
        # Pad the sequence with placeholders
        tokens = [
            ('[START2]', '[START2]'), ('[START1]', '[START1]')
        ] + list(tokens) + [('[END1]', '[END1]'), ('[END2]', '[END2]')]
        history = ['[START2]', '[START1]'] + list(history)
        # shift the index by 2 to accommodate the padding
        index += 2

        word, pos = tokens[index]
        prevword, prevpos = tokens[index - 1]
        prevprevword, prevprevpos = tokens[index - 2]
        nextword, nextpos = tokens[index + 1]
        nextnextword, nextnextpos = tokens[index + 2]
        previob = history[index - 1]
        contains_dash = '-' in word
        contains_dot = '.' in word
        allascii = all([True for c in word if c in string.ascii_lowercase])
        allcaps = word == word.capitalize()
        capitalized = word[0] in string.ascii_uppercase
        prevallcaps = prevword == prevword.capitalize()
        prevcapitalized = prevword[0] in string.ascii_uppercase
        nextallcaps = nextword == nextword.capitalize()
        nextcapitalized = nextword[0] in string.ascii_uppercase

        return [
            word, str(self.stemmer.stem(word)), str(pos), str(allascii),
            str(nextword), str(self.stemmer.stem(nextword)), str(nextpos),
            str(nextnextword), str(nextnextpos),
            str(prevword), str(self.stemmer.stem(prevword)), str(prevpos),
            str(prevprevword), str(prevprevpos),
            str(previob),
            str(contains_dash), str(contains_dot), str(allcaps), str(capitalized),
            str(prevallcaps), str(prevcapitalized), str(nextallcaps), str(nextcapitalized)
        ]

    def normalizeFeature(self, featx):
        # map each categorical feature to its index in the corresponding label
        # list (-1 when unseen); the remaining features are encoded as 1/0
        label_lists = {
            0: self.label_words, 1: self.label_lemmas, 2: self.label_posses,
            4: self.label_words, 5: self.label_lemmas, 6: self.label_posses,
            7: self.label_words, 8: self.label_posses,
            9: self.label_words, 10: self.label_lemmas, 11: self.label_posses,
            12: self.label_words, 13: self.label_posses,
            14: self.label_iob_feature,
        }
        out = []
        for idx, value in enumerate(featx):
            if idx in label_lists:
                labels = label_lists[idx]
                out.append(labels.index(value) if value in labels else -1)
            else:
                out.append(1 if value else 0)
        return out

    def parseEntityName(self, _sent=""):
        tokens = self.getPOSTag(_sent.split())
        history = []
        self.res_all = []
        last_feature = []
        for i in range(len(tokens)):
            last_feature = self.features(tokens, i, history)
            iob_res = self.iob_predictor([self.normalizeFeature(last_feature)])[0]
            history.append(iob_res)
            self.res_all.append((tokens[i], self.label_iob_classes[iob_res]))
class Stemmer:
    def __init__(self):
        self.stemmer = StemmerFactory().create_stemmer()

    def stem(self, text):
        return self.stemmer.stem(text)
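# Hedged usage sketch (illustrative only): Sastrawi reduces affixed Indonesian
# words to their root form.
stemmer = Stemmer()
print(stemmer.stem('mereka berlarian mengelilingi lapangan'))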
class Preprocess:
    def __init__(self, preprocessing_dataset=None):
        self.tokenizer = TweetTokenizer()
        self.stop_words = dictionary.get_stop_words()
        self.base_words = dictionary.get_base_words()
        self.slang_words = dictionary.get_slang_words()
        self.stemmer = StemmerFactory().create_stemmer()
        self.preprocessing_dataset = preprocessing_dataset

    def case_folding(self, document):
        document = document.lower()
        return document

    def clean(self, document):
        document = re.sub(
            r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+',
            '', document)  # URLs
        document = re.sub(r'RT', '', document)  # Retweet
        document = re.sub(r'(?:\#+[\w_]+[\w\'_\-]*[\w_]+)', '', document)  # Hashtag
        document = re.sub(r'(?:@[\w_]+)', '', document)  # Mention
        document = re.sub(r'[^\x00-\x7F]+', '', document)  # Unicode
        document = re.sub(r'rt', '', document)  # Retweet (lowercase)
        document = re.sub(r'(?:[:=;][oO\-]?[D\)\]\(\]/\\OpP])', '', document)  # Emoticon
        document = re.sub(r'(?:(?:\d+,?)+(?:\.?\d+)?)', '', document)  # Numbers
        document = re.sub(r'[\n\t\r]+', '', document)  # Remove linebreak, tab, return
        return document

    def tokenize(self, document):
        tokenized_document = self.tokenizer.tokenize(document)
        return tokenized_document

    def stopword_removal(self, document):
        document = [word for word in document if word not in self.stop_words]
        return document

    def stem(self, document):
        document = self.stemmer.stem(document)
        return document

    def slang_word_correction(self, document):
        pattern = re.compile(r'\b(' + '|'.join(self.slang_words.keys()) + r')\b')
        document = pattern.sub(lambda x: self.slang_words[x.group()], document)
        return document

    def base_word_check(self, document):
        document = [word for word in document if word in self.base_words]
        return document

    def preprocess(self, document):
        document = self.case_folding(document)
        document = self.clean(document)
        document = self.slang_word_correction(document)
        document = self.stem(document)
        document = re.sub(r'[^a-zA-Z ]', '', document)  # Special characters
        document = self.tokenize(document)
        document = self.stopword_removal(document)
        document = self.base_word_check(document)
        return document

    def list_to_string(self, document):
        str = ' '.join(document)
        return str

    def save_preprocessed_text(self, file_name, document):
        with open(file_name, 'w') as outfile:
            json.dump(document, outfile, indent=4)
        url = 'https://unikom-sentiment-services.azurewebsites.net/upload-ps'
        files = {'json': open(file_name, 'rb')}
        request = requests.post(url, files=files)
        if request.status_code == 200:
            return True
        return False

    def load_preprocessed_text(self):
        collection = self.preprocessing_dataset.find({})
        if collection.count() == 0:
            return False
        for preprocessing_data in collection:
            words_vocabulary = preprocessing_data['data'][0]
            classes = preprocessing_data['data'][1]
            return words_vocabulary, classes
class HadistRetrieval:
    def __init__(self):
        self.stopwords = stopwords
        self.stemmer = StemmerFactory().create_stemmer()
        self.hadist = hadist
        vectorizer = TfidfVectorizer()
        self.X = vectorizer.fit_transform(self.hadist.Processed)
        self.features = vectorizer.get_feature_names()

    def _text_lower(self, text: str) -> str:
        return text.lower()

    def _remove_entities(self, text: str) -> str:
        return re.sub(r'\[[^]]*\]', '', text)

    def _case_folding(self, text: str) -> str:
        return re.sub(r'[^a-z]', ' ', re.sub("'", '', text))

    def _stemming(self, text: str) -> str:
        return self.stemmer.stem(text)

    def _stopwords_removal(self, text: str) -> str:
        texts_token = text.split()
        not_stopword = []
        for token in texts_token:
            if token not in self.stopwords:
                not_stopword.append(token)
        return ' '.join(not_stopword)

    def _preprocessing(self, text: str) -> str:
        tx_lower = self._text_lower(text)
        tx_remove_entities = self._remove_entities(tx_lower)
        tx_case_folding = self._case_folding(tx_remove_entities)
        tx_stemming = self._stemming(tx_case_folding)
        return self._stopwords_removal(tx_stemming)

    def retrieve(self, sentence: str, n: int = 5) -> List[Mapping[str, str]]:
        sent_prep = self._preprocessing(sentence)
        query = sent_prep.split()
        res = np.zeros(self.X.shape[0])
        not_in_corpus = []
        output: List[Mapping[str, str]] = []
        for keyword in query:
            try:
                # accumulate the TF-IDF column of each query keyword
                res += self.X.toarray()[:, self.features.index(keyword)]
            except ValueError:
                not_in_corpus.append(keyword)
        top_idx = np.argsort(-res)[:n]
        if not sum(res) > 0:
            raise ValueError("query did not match anything in the corpus")
        for i in range(len(top_idx)):
            output.append({
                'source': self.hadist.iloc[top_idx[i]][4],
                'text': self.hadist.iloc[top_idx[i]][2]
            })
        return output
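# Hedged usage sketch (illustrative only): `stopwords` and `hadist` are the
# module-level resources the class already expects; the query is an example.
retriever = HadistRetrieval()
for hit in retriever.retrieve('keutamaan menuntut ilmu', n=3):
    print(hit['source'], '-', hit['text'][:80])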
def stem(data):
    new_data = data.copy()
    stemmer = StemmerFactory().create_stemmer()
    return list(map(lambda s: stemmer.stem(s), new_data))
def stemming(self, token):
    stemmer = StemmerFactory().create_stemmer()
    # stem each tokenized row by joining it back into a sentence first
    stemmed = [stemmer.stem(' '.join(row)).split() for row in token]
    self.print_arr("Setelah Stemming:", stemmed)
    return stemmed
class TextSummarizer:
    def __init__(self, title: str, plot: str, human_synopsis: str):
        self.title = title
        self.plot = plot
        self.human_synopsis = human_synopsis
        self.stopwords = StopWordRemoverFactory().create_stop_word_remover()
        self.stemmer = StemmerFactory().create_stemmer()

    def __text_to_sentences(self, text: str) -> List[str]:
        regex = re.compile(r'\.\n\n|\.\n|\. |\.$')
        sentences = regex.split(text)
        return sentences

    def __stem_sentence(self, sentence: str) -> str:
        return self.stemmer.stem(sentence)

    def __stop_word_removal(self, words: List[str]) -> List[str]:
        temp_words = []
        for word in words:
            if word.lower() in self.title.lower():
                temp_words.append(word)
            else:
                temp = self.stopwords.remove(word)
                if temp:
                    temp_words.append(temp)
        return temp_words

    def __preprocess_text(self, text: str) -> tuple:
        temp_sentences = self.__text_to_sentences(text)
        sentences = []
        preprocessed_sentences = []
        for sentence in temp_sentences:
            if len(sentence) < 2:
                continue
            stemmed_sentence = self.__stem_sentence(sentence.lower())
            tokenized_sentence = nltk.tokenize.word_tokenize(stemmed_sentence)
            removed_stop_word_sentence = self.__stop_word_removal(tokenized_sentence)
            if len(removed_stop_word_sentence) < 2:
                continue
            sentences.append(sentence)
            preprocessed_sentences.append(removed_stop_word_sentence)
        return sentences, preprocessed_sentences

    def __sentence_similarity(self, sent1, sent2):
        """
        Calculate the similarity between two sentences.
        Returns the cosine similarity of their word-count vectors.
        """
        sent1 = [w.lower() for w in sent1]
        sent2 = [w.lower() for w in sent2]
        all_words = list(set(sent1 + sent2))
        vector1 = [0] * len(all_words)
        vector2 = [0] * len(all_words)
        # build the vector for the first sentence
        for w in sent1:
            vector1[all_words.index(w)] += 1
        # build the vector for the second sentence
        for w in sent2:
            vector2[all_words.index(w)] += 1
        return 1 - cosine_distance(vector1, vector2)

    def __build_similarity_matrix(self, sentences):
        """
        Build a matrix holding the pairwise similarity between sentences.
        Returns the matrix.
        """
        # Create an empty similarity matrix
        similarity_matrix = np.zeros((len(sentences), len(sentences)))
        for idx1 in range(len(sentences)):
            for idx2 in range(len(sentences)):
                if idx1 == idx2:  # skip comparing a sentence with itself
                    continue
                similarity_matrix[idx1][idx2] = self.__sentence_similarity(
                    sentences[idx1], sentences[idx2])
        return similarity_matrix

    def summarize(self, top_n=5):
        # Step 1 - text preprocessing
        plot_sentences, plot_pre_sentences = self.__preprocess_text(self.plot)
        # Step 2 - generate the similarity matrix across sentences
        sentence_similarity_matrix = self.__build_similarity_matrix(plot_pre_sentences)
        print(sentence_similarity_matrix)
        # Step 3 - rank sentences in the similarity matrix
        sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_matrix)
        plot_scores = nx.pagerank(sentence_similarity_graph)
        # Step 4 - sort by rank and pick the top sentences
        ranked_sentence = []
        for i in range(len(plot_scores)):
            ranked_sentence.append([plot_scores[i], plot_sentences[i], i])
        ranked_sentence.sort(key=lambda x: x[0], reverse=True)
        top_n = min(top_n, len(plot_sentences))
        summary = ranked_sentence[0:top_n]
        summary.sort(key=lambda x: x[2])
        summary = [i[1] for i in summary]
        summarize_text = ""
        for i in range(top_n):
            summarize_text += "".join(summary[i]) + ". "
        # Step 5 - output the summarized text
        return summarize_text

    @staticmethod
    def generate_from_file(title, plotfilepath, synopsisfilepath):
        plot = ""
        synopsis = ""
        with open(plotfilepath, "r") as plot_file:
            plot = plot_file.read()
        with open(synopsisfilepath, "r") as synopsis_file:
            synopsis = synopsis_file.read()
        ts = TextSummarizer(title, plot, synopsis)
        return ts.summarize()
while True:
    print("Enter query keyword:")
    init_query = input()
    query = init_query

    # without query expansion
    print("==== Without query expansion ====")
    query = query.lower()
    remove_punctuation_map = dict(
        (ord(char), None) for char in string.punctuation)
    query = query.translate(remove_punctuation_map)
    query = stopword.remove(query)
    query = query.split()
    query = [stemmer.stem(x) for x in query]
    print("Query used: " + ' '.join(query))

    # process the query
    print("Processing query...")
    max_result = []
    x = [' '.join(query)]
    paper_tfidf = vectorizer.fit_transform(x + processed_paper)
    q = paper_tfidf[0]
    result = cosine_similarity(paper_tfidf, q)
    idx = np.argsort(-result, axis=0).flatten()
    final = [[num, y[0], x] for num, y in enumerate(result) if y[0] > 0.0]
    max_result += final
    max_result = sorted(max_result, key=lambda x: x[1], reverse=True)
    set_result = set()
def index(hashs, terms):
    for word in terms:
        if word in hashs:
            hashs[word] += 1
        else:
            hashs[word] = 1


print('Indexing ...')
for path in sorted(IN_DIR.glob('*/*.html')):
    with open(path.resolve(), 'r', encoding='utf-8') as file:
        df[path.name] = dict()
        content = get_text(['title', 'top', 'middle', 'bottom'], file.read())
        content = content.translate(str.maketrans('', '', punctuation))
        content = stopword.remove(content)
        terms = stemmer.stem(content.lower()).split()
        index(df[path.name], terms)
        index(tf, terms)
print('Indexing done!\n')

print('Calculating idf for terms...')
for term, freq in tf.items():
    df_i = 0
    for doc, tf_doc in df.items():
        df_i += 1 if term in tf_doc else 0
    idf[term] = (1 + math.log2(len(df) / df_i)) if df_i != 0 else 1
print('Calculated!\n')

with open(BASE_DIR / 'words_score.txt', 'w', encoding='utf-8') as file:
    print('Writing words score to text file ...')
docs_x = []
docs_y = []

for intent in data["intents"]:
    # print(intent)
    for pattern in intent["patterns"]:
        wrds = nltk.word_tokenize(pattern)
        print(wrds)
        words.extend(wrds)
        docs_x.append(wrds)
        docs_y.append(intent["tag"])

    if intent["tag"] not in labels:
        labels.append(intent["tag"])

words = [stemmer.stem(w.lower()) for w in words if w != "?"]
words = sorted(list(set(words)))
print(words)

labels = sorted(labels)

training = []
output = []

out_empty = [0 for _ in range(len(labels))]

for x, doc in enumerate(docs_x):
    bag = []
    wrds = [stemmer.stem(w) for w in doc]
class Normalizer:
    id_mapper = None
    en_mapper = None
    id_words = None
    en_words = None

    def __init__(self, id_mapper, en_mapper, id_words,
                 en_words, contracted_words_mapper):
        self.id_mapper = id_mapper
        self.en_mapper = en_mapper
        self.id_words = id_words
        self.en_words = en_words
        self.contracted_words_mapper = contracted_words_mapper
        self.en_stemmer = PorterStemmer()
        self.id_stemmer = StemmerFactory().create_stemmer()

    def _lookup_id(self, token):
        stemmed_token = self.id_stemmer.stem(token)
        if self.id_words and (stemmed_token in self.id_words or token in self.id_words):
            return token
        if self.id_mapper and token in self.id_mapper:
            return self.id_mapper[token]
        return None

    def _lookup_en(self, token):
        stemmed_token = self.en_stemmer.stem(token)
        if self.en_words and (stemmed_token in self.en_words or token in self.en_words):
            return token
        if self.en_mapper and token in self.en_mapper:
            return self.en_mapper[token]
        return None

    def _normalize_id(self, token):
        # Handle case #6, reduplication written with '2' (e.g. 'anak2')
        if '2' in token and token.index('2') != 0:
            unnormalized_singulars = token.split('2')
            norm_sing_1 = self._lookup_id(unnormalized_singulars[0])
            norm_sing_2 = self._lookup_id(unnormalized_singulars[1])
            if not norm_sing_1:
                norm_sing_1 = unnormalized_singulars[0]
            if not norm_sing_2:
                norm_sing_2 = unnormalized_singulars[1]
            res = f'{norm_sing_1}-{norm_sing_1}'
            if norm_sing_2:
                res += norm_sing_2
            return res
        return self._lookup_id(token)

    def _normalize_en(self, token):
        # Handle case #9, affix nge-
        if token.startswith('nge'):
            token = token[3:]
            return self._lookup_en(token)
        # Handle case #10, affix -nya
        elif token.endswith('nya'):
            token = token[:-3]
            norm_token = self._lookup_en(token)
            if norm_token is not None:
                return f'the {norm_token}'
            else:
                return f'the {token}'
        token = self._lookup_en(token)
        # Handle case #8, contracted words
        if token in self.contracted_words_mapper:
            token = self.contracted_words_mapper[token]
        return token

    def _normalize(self, word, lang):
        res = None
        if lang == 'id':
            res = self._normalize_id(word)
        if lang == 'en':
            res = self._normalize_en(word)
        if res:
            return res
        return None

    def normalize(self, token, lang='un'):
        if not is_word(token):
            return token
        token = remove_duplication(token).lower()
        # Handle multiple words
        if ' ' in token:
            unnormalized_singulars = token.split(' ')
            normalized_singulars = []
            for sing in unnormalized_singulars:
                norm = self.normalize(sing, lang)
                normalized_singulars.append(norm if norm else sing)
            if normalized_singulars[0] == normalized_singulars[1]:
                return '-'.join(normalized_singulars)
            else:
                return ' '.join(normalized_singulars)
        possible_words = [token]
        possible_words.extend(self._generate_all_words(token))
        for word in possible_words:
            res = self._normalize(word, lang)
            if res:
                return res
        return token

    def _generate_all_words(self, token):
        # collapse repeated characters, remembering where the repetitions were
        single_token = ''
        ids = []
        for idx, char in enumerate(token):
            if idx > 0 and token[idx - 1] == char:
                ids.append(len(single_token) - 1)
            else:
                single_token += char
        # enumerate every way of re-doubling those positions
        possible_words = []
        max_iter = 1 << len(ids)
        len_ids = len(ids)
        for i in range(0, max_iter):
            bin_ids = '{:08b}'.format(i).lstrip('0')
            appeared_id = []
            for idx, bin_id in enumerate(bin_ids):
                if bin_id == '1':
                    appeared_id.append(ids[idx])
            word = ''
            for idx, char in enumerate(single_token):
                word += char
                if idx in appeared_id:
                    word += char
            possible_words.append(word)
        return possible_words
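# Hedged usage sketch (illustrative only): the tiny dictionaries below are
# made-up stand-ins for the real mapper/word resources, and is_word /
# remove_duplication are helpers the module is assumed to provide.
normalizer = Normalizer(
    id_mapper={'yg': 'yang'}, en_mapper={'u': 'you'},
    id_words={'anak', 'yang'}, en_words={'you'},
    contracted_words_mapper={},
)
print(normalizer.normalize('anak2', lang='id'))  # reduplication, e.g. 'anak-anak'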
paper = pickle.load(f)

print("Preprocessing..")
factory = StopWordRemoverFactory()
stopword = factory.create_stop_word_remover()
stemmer = StemmerFactory().create_stemmer()

words = []
processed_paper = []
for num, x in enumerate(paper):
    text = x[2]
    text = text.lower()
    remove_punctuation_map = dict(
        (ord(char), None) for char in string.punctuation)
    text = text.translate(remove_punctuation_map)
    text = stopword.remove(text)
    text = text.split()
    text = [stemmer.stem(x) for x in text]
    processed_paper.append(' '.join(text))
    text = list(set(text))
    words += text
    print("Paper " + str(num + 1) + " done.")
print("Done processing.")

# save results to 'corpus/processed_paper.xlsx'
print("Saving data to corpus/processed_paper.xlsx..")
df = pd.DataFrame(processed_paper)
df.to_excel('corpus/processed_paper.xlsx', header=False, index=False)
print("Success.")

# save results to 'pickle/processed_paper.pkl'
# Preprocess data
for intent in intents['intents']:
    for pattern in intent['input_patterns']:
        # Word tokenization
        pattern = nltk.word_tokenize(pattern)
        # Case folding
        pattern = [word.lower() for word in pattern]
        # Filtering
        pattern = [
            word for word in pattern
            if word not in stopwords and word.isalpha()
            and word not in string.punctuation
        ]
        # Stemming
        pattern = [stemmer.stem(word) for word in pattern]
        # insert into the words list
        words.extend(pattern)
        # add the doc to the corpus
        documents.append((pattern, intent['tag']))
        # add the tag to the class list
        if intent['tag'] not in classes:
            classes.append(intent['tag'])

# Sort words and classes
words = sorted(list(set(words)))
classes = sorted(list(set(classes)))