def __init__(self):
    # init NLP
    self.nlp = Indonesian()
    # init flash text
    self.keyword_processor_slang_word = KeywordProcessor()
    self.keyword_processor_emoticon = KeywordProcessor()
    self.keyword_processor_meaning_text = KeywordProcessor()
    # init stemmer
    self.stemmer = StemmerFactory().create_stemmer()
    self.__init_flash_text_corpus()
    self.__init_custom_stop_word()
def __init__(self, root_folder: str, slang_file: str, vocab_file: str) -> None:
    self.root_folder = root_folder
    self.vocab_file = vocab_file
    self.nlp = Indonesian()
    self.x_train, self.y_train, self.x_dev, self.y_dev, self.x_test = self.load_dataset()
    if slang_file != "":
        self.fix_typo_and_store(slang_file, vocab_file)
def count_vocab(text: Union[str, List[str]], stopwords: set):
    nlp = Indonesian()
    if isinstance(text, str):
        text = [text]
    vocab = defaultdict(int)
    for _text in text:
        indonesian = nlp(_text)
        for token in indonesian:
            token_lowercase = token.text.lower()
            # skip stopwords and tokens that consist only of punctuation
            if token_lowercase not in stopwords and re.fullmatch(
                    f"[{re.escape(string.punctuation)}]+", token_lowercase) is None:
                vocab[token_lowercase] += 1
    return vocab
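# A minimal usage sketch for count_vocab above. The stopword set and the sample
# sentences are made up for illustration; they are not part of the original code.
sample_stopwords = {"yang", "dan", "di"}
counts = count_vocab(["Saya suka kopi dan teh.", "Kopi di pagi hari."], sample_stopwords)
# Expected: lower-cased, non-stopword, non-punctuation tokens with their frequencies,
# e.g. counts["kopi"] == 2
print(dict(counts))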
def __init__(self, mode: str, slang_file: str, vocab_file: str) -> None:
    self.mode = mode
    self.slang_file = slang_file
    self.vocab_file = vocab_file
    self.nlp = Indonesian()
    isc = pd.read_csv(slang_file)
    stof_df = isc[isc["in-dictionary"] == 1][["slang", "formal"]].groupby("slang")["formal"].apply(
        lambda x: list(x)[0])
    self.slang_dict = stof_df.to_dict()
    with open(vocab_file, "r") as fvocab:
        self.vocab_list = []
        for word in fvocab.readlines():
            clean_word = word.strip()
            if clean_word not in self.vocab_list:
                self.vocab_list.append(clean_word)
def LoadStopWords(lang='en'):
    L = lang.lower().strip()
    if L in ('en', 'english', 'inggris'):
        from spacy.lang.en import English
        lemmatizer = English()
        stops = set(t.strip() for t in LoadDocuments(file='data/stopwords_en.txt')[0])
    elif L in ('id', 'indonesia', 'indonesian'):
        from spacy.lang.id import Indonesian
        lemmatizer = Indonesian()
        stops = set(t.strip() for t in LoadDocuments(file='data/stopwords_id.txt')[0])
    else:
        print('Warning: language not recognized, returning an empty stopword set.')
        stops = set()
        lemmatizer = None
    return stops, lemmatizer
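# Hedged usage sketch for LoadStopWords: it assumes the LoadDocuments helper and the
# data/stopwords_*.txt files from the surrounding project are available; only the call
# pattern and the returned (stops, lemmatizer) pair are illustrated here.
stops_id, lemmatizer_id = LoadStopWords(lang='id')
doc = lemmatizer_id("Saya sedang belajar pemrosesan bahasa alami.")
tokens_kept = [t.text for t in doc if t.text not in stops_id]
print(tokens_kept)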
def infer(self, sentence, true_tags=None):
    self.model.eval()
    # tokenize sentence
    nlp = Indonesian()
    tokens = [token.text for token in nlp(sentence)]
    max_word_len = max([len(token) for token in tokens])
    # transform to indices based on corpus vocab
    numericalized_tokens = [
        self.data.word_field.vocab.stoi[token.lower()] for token in tokens
    ]
    numericalized_chars = []
    char_pad_id = self.data.char_pad_idx
    for token in tokens:
        numericalized_chars.append(
            [self.data.char_field.vocab.stoi[char] for char in token]
            + [char_pad_id for _ in range(max_word_len - len(token))])
    # find unknown words
    unk_idx = self.data.word_field.vocab.stoi[self.data.word_field.unk_token]
    unks = [t for t, n in zip(tokens, numericalized_tokens) if n == unk_idx]
    # begin prediction
    token_tensor = torch.as_tensor(numericalized_tokens)
    token_tensor = token_tensor.unsqueeze(-1).to(self.device)
    char_tensor = torch.as_tensor(numericalized_chars)
    char_tensor = char_tensor.unsqueeze(0).to(self.device)
    predictions, _ = self.model(token_tensor, char_tensor)
    # convert results to tags
    predicted_tags = [
        self.data.tag_field.vocab.itos[t] for t in predictions[0]
    ]
    # print inferred tags
    max_len_token = max([len(token) for token in tokens] + [len('word')])
    max_len_tag = max([len(tag) for tag in predicted_tags] + [len('pred')])
    print(
        f"{'word'.ljust(max_len_token)}\t{'unk'.ljust(max_len_token)}\t{'pred tag'.ljust(max_len_tag)}"
        + ("\ttrue tag" if true_tags else ""))
    for i, token in enumerate(tokens):
        is_unk = "✓" if token in unks else ""
        print(
            f"{token.ljust(max_len_token)}\t{is_unk.ljust(max_len_token)}\t{predicted_tags[i].ljust(max_len_tag)}"
            + (f"\t{true_tags[i]}" if true_tags else ""))
    return tokens, predicted_tags, unks
def transform_json_to_conll():
    nlp = Indonesian()
    file = "../data/processed/test/test.json"
    with open(file, "r") as f:
        annotations = json.load(f)
    random.seed(1339)
    random.shuffle(annotations)
    buffer_conll = {"val": "", "test": ""}
    for anno_i, annotation in enumerate(annotations):
        sorted_labels = sorted(annotation["labels"], key=lambda label: (label[0], label[1]))
        token_i = 0
        curr_label = sorted_labels[token_i] if len(sorted_labels) > 0 else None
        tokens = nlp(annotation["text"])
        for token in tokens:
            token_begin = token.idx
            token_end = token.idx + len(token.text)
            tag = "O"
            if curr_label and token_begin >= curr_label[0] and token_end <= curr_label[1]:
                tag = curr_label[2]
                if token_end == curr_label[1]:
                    tag = f"L-{tag}" if token_begin > curr_label[0] else f"U-{tag}"
                    if token_i < len(sorted_labels) - 1:
                        token_i += 1
                        curr_label = sorted_labels[token_i]
                elif token_begin == curr_label[0]:
                    tag = f"B-{tag}"
                else:
                    tag = f"I-{tag}"
            buffer_conll["val" if anno_i <= len(annotations) // 2 else "test"] += token.text + "\t" + tag + "\n"
        buffer_conll["val" if anno_i <= len(annotations) // 2 else "test"] += "\n"
    with open("../input/val.tsv", "w") as f:
        f.write(buffer_conll["val"])
    with open("../input/test.tsv", "w") as f:
        f.write(buffer_conll["test"])
from html import unescape
from nltk import sent_tokenize
from unidecode import unidecode

import urllib3
# silence TLS certificate warnings raised by the unverified HTTP requests below
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

from gensim.models.ldamodel import LdaModel
from gensim.models.coherencemodel import CoherenceModel
from spacy.lang.id import Indonesian
from nltk.tag import CRFTagger
import spacy

nlp_en = spacy.load("en_core_web_sm")
nlp_id = Indonesian()
ct = CRFTagger()

# Language Model
fTagger = 'data/all_indo_man_tag_corpus_model.crf.tagger'
ct.set_model_file(fTagger)


def NLPfilter(t, filters):
    # filters = set(['NN', 'NNP', 'NNS', 'NNPS', 'JJ'])
    tokens = nlp_id(t)
    tokens = [str(k) for k in tokens if len(k) > 2]
    hasil = ct.tag_sents([tokens])
    return [k[0] for k in hasil[0] if k[1] in filters]


def compute_coherence_values(dictionary, corpus,
class Preprocessing(object):

    def __init__(self):
        # init NLP
        self.nlp = Indonesian()
        # init flash text
        self.keyword_processor_slang_word = KeywordProcessor()
        self.keyword_processor_emoticon = KeywordProcessor()
        self.keyword_processor_meaning_text = KeywordProcessor()
        # init stemmer
        self.stemmer = StemmerFactory().create_stemmer()
        self.__init_flash_text_corpus()
        self.__init_custom_stop_word()

    def __init_flash_text_corpus(self):
        """Init flash text corpus."""
        # build slang word corpus
        slang_words_raw = Repository.get_slang_word()
        for word in slang_words_raw.values:
            self.keyword_processor_slang_word.add_keyword(word[0], word[1])
        # build emoticon corpus
        emoticon_raw = constant.EMOTICON_LIST
        for key, values in emoticon_raw:
            for value in values:
                self.keyword_processor_emoticon.add_keyword(value, key)
        # build meaning word corpus
        meaning_words_raw = Repository.get_meaning_text()
        for word in meaning_words_raw.values:
            self.keyword_processor_meaning_text.add_keyword(word[0], word[1])

    def __init_custom_stop_word(self):
        """Custom stop words for chat message content."""
        for stop_word in constant.STOP_WORD:
            self.nlp.vocab[stop_word].is_stop = True
        for stop_word in constant.EXC_STOP_WORD:
            self.nlp.vocab[stop_word].is_stop = False

    def cleaning(self, chat_message_list):
        """
        Pre-process the content from ChatMessage.
        :param chat_message_list: dirty content from a list of ChatMessage.
        :return: list of ChatMessage.
        """
        chat_message_list_temp = []
        if chat_message_list:
            logger.info('Pre-processing started...')
            start_time = time.time()
            chat_message_list = self.remove_repeated_message_from_agent(chat_message_list)
            for chat_message in chat_message_list:
                logger.info(f'BEFORE -> {chat_message.content}')
                content = self.__preprocessing_flow(chat_message.content)
                logger.info(f'AFTER -> {content}')
                chat_message.content = content
                if content.strip():
                    chat_message_list_temp.append(chat_message)
            logger.info(f'Pre-processing finished. {time.time() - start_time} seconds')
        else:
            logger.info('No chat message yet.')
        return chat_message_list_temp

    def cleaning_with_pipe(self, chat_message_list):
        """
        [DEPRECATED] Pre-process the content from ChatMessage with spaCy multi-threading.
        :param chat_message_list: dirty content from a list of ChatMessage.
        :return: list of ChatMessage.
        """
        if chat_message_list:
            logger.info('Pre-processing started...')
            start_time = time.time()
            index = 0
            chat_content_list = [
                chat_message.content for chat_message in chat_message_list
            ]
            for content in self.nlp.pipe(chat_content_list, n_threads=cpu_count()):
                chat_message_list[index].content = self.__preprocessing_flow(content.text)
                index = index + 1
            logger.info(f'Pre-processing finished. {time.time() - start_time} seconds')
        else:
            logger.info('No chat message yet.')
        return chat_message_list

    def __preprocessing_flow(self, content):
        """Preprocessing flow."""
        # normalize emoticon
        # content = PreprocessingUtilsV2.normalize_emoticon(content, self.keyword_processor_emoticon)
        content = str(content)
        # normalize url
        content = PreprocessingUtils.normalize_url(content)
        # remove url
        content = PreprocessingUtils.remove_url(content)
        # remove email
        content = PreprocessingUtils.remove_email(content)
        # remove digit number
        content = PreprocessingUtils.remove_digit_number(content)
        # case folding lower case
        content = PreprocessingUtils.case_folding_lowercase(content)
        # remove punctuation
        content = PreprocessingUtils.remove_punctuation(content)
        # remove repeated character
        content = PreprocessingUtils.remove_repeated_character(content)
        # normalize slang word
        content = PreprocessingUtilsV2.normalize_slang_word(content, self.keyword_processor_slang_word)
        # stemming, tokenize, remove stop word
        content = PreprocessingUtils.stemming(content, self.nlp, self.stemmer)
        # remove unused character
        content = PreprocessingUtils.remove_unused_character(content)
        # join negation word
        content = PreprocessingUtils.join_negation(content)
        # remove extra space between words
        content = PreprocessingUtils.remove_extra_space(content)
        # normalize word
        content = PreprocessingUtilsV2.normalize_meaning_word(content, self.keyword_processor_meaning_text)
        # remove stop word
        content = PreprocessingUtils.remove_stop_word(content, self.nlp)
        # TODO add another pre-processing step if needed
        return content

    @staticmethod
    def identify_phrase(documents):
        """documents : iterable of iterable of str"""
        bigram = Phraser(
            Phrases(documents, min_count=5, delimiter=b'_', threshold=1))
        trigram = Phraser(
            Phrases(bigram[documents], min_count=5, delimiter=b'_', threshold=1))
        for i in range(len(documents)):
            for token in bigram[documents[i]]:
                if '_' in token:
                    documents[i].append(token)
            for token in trigram[documents[i]]:
                if '_' in token:
                    documents[i].append(token)
        return documents

    @staticmethod
    def remove_repeated_message_from_agent(message_history_list):
        """Remove chat messages from agents that repeat more than constant.MESSAGE_TEMPLATE_MIN_COUNT times."""
        message_template_list = []
        message_history_list_temp = []
        counter = collections.Counter()
        for chat_message in message_history_list:
            if chat_message.sender_role == constant.SENDER_ROLE_AGENT:
                counter[chat_message.content] += 1
        for key, value in counter.items():
            if value > constant.MESSAGE_TEMPLATE_MIN_COUNT:
                message_template_list.append(key)
        for chat_message in message_history_list:
            if chat_message.content not in message_template_list:
                message_history_list_temp.append(chat_message)
        return message_history_list_temp
def cleanText(T, fix={}, lang='id', lemmatizer=None, stops=set(),
              symbols_remove=False, min_charLen=0):
    pattern = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    pattern1 = re.compile(
        r'pic.twitter.com/(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    t = re.sub(pattern, ' ', T)  # remove urls if any
    t = re.sub(pattern1, ' ', t)
    t = unescape(t)  # html entities fix
    t = t.lower().strip()  # lowercase
    t = unidecode(t)
    t = ''.join(''.join(s)[:1] for _, s in itertools.groupby(t))  # remove character repetition
    t = sent_tokenize(t)  # sentence segmentation: string to list
    for i, K in enumerate(t):
        if symbols_remove:
            K = re.sub(r'[^.,a-zA-Z0-9 \n\.]', ' ', K)
            K = K.replace(',', ' ').replace('.', ' ')
            K = ''.join(c for c in K if c not in punctuation)
            K = re.sub(r'\s+', ' ', K).strip()
        cleanList = []
        if lang == 'en':
            lemmatizer = WordNetLemmatizer()
            listKata = word_tokenize(K)
            for token in listKata:
                if token in fix.keys():
                    token = fix[token]
                if lemmatizer:
                    token = lemmatizer.lemmatize(token)
                if stops:
                    if len(token) >= min_charLen and token not in stops:
                        cleanList.append(token)
                else:
                    if len(token) >= min_charLen:
                        cleanList.append(token)
            t[i] = ' '.join(cleanList)
        else:
            lemmatizer = Indonesian()
            K = lemmatizer(K)
            listKata = [token.text for token in K]
            for token in listKata:
                if token in fix.keys():
                    token = fix[token]
                if lemmatizer:
                    token = lemmatizer(token)[0].lemma_
                token = stemmer.stem(token)
                if stops:
                    if len(token) >= min_charLen and token not in stops:
                        cleanList.append(token)
                else:
                    if len(token) >= min_charLen:
                        cleanList.append(token)
            t[i] = ' '.join(cleanList)
    return ' '.join(t)
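# Hedged usage sketch for cleanText: the Indonesian branch above relies on a module-level
# Sastrawi `stemmer`, which is assumed (not shown in the snippet) and created here, along
# with a stopword set such as the one returned by the LoadStopWords helper shown earlier.
# The fix dictionary and sample text are illustrative only.
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

stemmer = StemmerFactory().create_stemmer()  # assumed global used inside cleanText
stops_id, _ = LoadStopWords(lang='id')
fix = {"yg": "yang"}  # illustrative slang-to-formal mapping
print(cleanText("Teksnya yg kotorrr banget http://contoh.com", fix=fix,
                lang='id', stops=stops_id, symbols_remove=True, min_charLen=2))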
    for dirpath, dirs, files in os.walk(folderpath):
        for filename in fnmatch.filter(files, '*.txt'):
            filelist.append(dirpath + "/" + filename)
    return filelist


def writedataa(list, thname):
    file = open("sentence_rep_{}.txt".format(thname), "w")
    for x in sorted(set(list)):
        # for x in list:
        #     hasil = x.replace('"', '').replace("#", "").replace(" ", "")
        file.write(x + "\n")
    file.close()


nlp = Indonesian()


def tokenize_and_stem(text):
    text = u'{}'.format(text)
    doc = nlp(text)
    stems = [t.lemma_ for t in doc]
    stems = [t.lower() for t in stems]
    return stems


def tokenize_only(text):
    text = u'{}'.format(text)
    doc = nlp(text)
    stems = [t.text for t in doc]
    stems = [t.lower() for t in stems]
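# Hedged usage sketch: tokenize_and_stem above depends on the blank Indonesian pipeline's
# lookup lemmas (token.lemma_), which are filled in spaCy 2.x; newer spaCy versions may
# require adding an explicit lemmatizer pipe. The sample sentence is illustrative only.
print(tokenize_and_stem("Mereka berjalan menuju perpustakaan"))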
def __init__(self):
    self.nlp = Indonesian()
    self.nlp.Defaults.stop_words.update(
        genfromtxt('/media/faruq/FARUQ/PENS/semester6/datamining/program/django/ClassFormExample/newsletter/stopword.csv',
                   dtype='|S18', delimiter=','))
    self.vectorizer = TfidfVectorizer(tokenizer=self.__tokenizer, ngram_range=(1, 1))
def __init__(self, lemmatize: bool) -> None:
    self.lemmatize = lemmatize
    self.nlp = Indonesian()
def __init__(self):
    self.sentences: List[List[str]] = []
    self.nlp: Language = Indonesian()
    self.nlp.add_pipe(self.nlp.create_pipe('sentencizer'))
# import gensim
from gensim.models.ldamodel import LdaModel
from gensim.models import Phrases, TfidfModel
from gensim import corpora
from gensim.models.coherencemodel import CoherenceModel
from pickle import dump
from json import dumps
from datetime import datetime
from re import sub
import warnings

warnings.filterwarnings('ignore')

# import spacy
from spacy.lang.id import Indonesian, stop_words

nlp = Indonesian()  # use directly
stopwords = stop_words.STOP_WORDS
stopwords |= {"nya", "jurusan", "jurus", "the", "of"}


def preprocessing(text):
    text = pre.remove_tag(text)          # remove tags
    text = pre.remove_whitespace(text)   # remove whitespace
    text = pre.lower(text)               # lowercase
    text = pre.remove_link(text)         # remove links
    text = pre.alphabet_only(text)       # keep alphabetic characters only
    text = sub(r'sobat pintar', '', text)  # sorry :(
    text = pre.remove_whitespace(text)   # remove whitespace
    text = [token.text for token in nlp(text)]  # tokenize
    text = pre.slang(text)
    text = [
def __init__(self):
    self._nlp = Indonesian()
import csv
import os
import re

from nltk import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
from spacy.lang.id import Indonesian
import spacy

nlp_indonesia = Indonesian()  # use directly
nlp_indonesia = spacy.blank('id')


# procedure to clean noise from the dataset
def preprocessing_text(text):
    text = text.lower()
    # text = re.sub('[^a-zA-Z0-9 .,?!]', '', text)
    text = re.sub(r'[^a-zA-Z\s.,?!]', u'', text, flags=re.UNICODE)
    # for r in (
    #     (" ku", " aku"), (" gw", " aku"), (" saya", " aku"), (" gue", " aku"), (" gua", " aku"),
    #     (" anda", " kamu"), (" lu", " kamu"), (" kau", " kamu"), (" mu", " kamu"),
    #     (" dia", " dia"), (" doi", " dia"),
    #     (" kita", " kami"),
    #     (" tak", " tidak"), (" engga", " tidak"), (" enggak", " tidak"), (" ga", " tidak"), (" gak", " tidak"),
    #     (" ya", " iya"), (" yes", " iya"), (" yoi", " iya"), (" yah", " iya"),
    #     (" hei", " hai"), (" hey", " hai"), (" halo", " hai"), (" hay", " hai")
    # ):
    #     text = text.replace(*r)
    aku = ['ku', 'gw', 'saya', 'gue', 'gua']
    kamu = ['anda', 'lu', 'kau', 'mu']