def __init__(self):
        # init NLP
        self.nlp = Indonesian()

        # init flash text
        self.keyword_processor_slang_word = KeywordProcessor()
        self.keyword_processor_emoticon = KeywordProcessor()
        self.keyword_processor_meaning_text = KeywordProcessor()

        # init stemmer
        self.stemmer = StemmerFactory().create_stemmer()

        self.__init_flash_text_corpus()
        self.__init_custom_stop_word()
Example n. 2
 def __init__(self, root_folder: str, slang_file: str, vocab_file: str) -> None:
     self.root_folder = root_folder
     self.vocab_file = vocab_file
     self.nlp = Indonesian()
     self.x_train, self.y_train, self.x_dev, self.y_dev, self.x_test = self.load_dataset()
     if slang_file != "":
         self.fix_typo_and_store(slang_file, vocab_file)
Example n. 3
def count_vocab(text: Union[str, List[str]], stopwords: set):
    nlp = Indonesian()
    if isinstance(text, str):
        text = [text]
    vocab = defaultdict(int)
    for _text in text:
        indonesian = nlp(_text)
        for token in indonesian:
            token_lowercase = token.text.lower()
            # skip stop words and bare punctuation tokens
            if (token_lowercase not in stopwords
                    and token_lowercase not in string.punctuation):
                vocab[token_lowercase] += 1
    return vocab
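A minimal usage sketch for count_vocab; the sentences and the stop word set below are illustrative only.

stopwords = {"yang", "dan", "di"}
texts = ["Saya suka kopi dan teh.", "Kopi di pagi hari itu nikmat."]
vocab = count_vocab(texts, stopwords)
print(vocab["kopi"])  # 2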
Example n. 4
 def __init__(self, mode: str, slang_file: str, vocab_file: str) -> None:
     self.mode = mode
     self.slang_file = slang_file
     self.vocab_file = vocab_file
     self.nlp = Indonesian()
     isc = pd.read_csv(slang_file)
     stof_df = isc[isc["in-dictionary"] == 1][["slang", "formal"]].groupby("slang")["formal"].apply(
         lambda x: list(x)[0])
     self.slang_dict = stof_df.to_dict()
     with open(vocab_file, "r") as fvocab:
         self.vocab_list = []
         for word in fvocab.readlines():
             clean_word = word.strip()
             if clean_word not in self.vocab_list:
                 self.vocab_list.append(clean_word)
Example n. 5
def LoadStopWords(lang='en'):
    """Return (stop word set, blank spaCy pipeline) for the requested language."""
    L = lang.lower().strip()
    if L in ('en', 'english', 'inggris'):
        from spacy.lang.en import English
        lemmatizer = English()
        stops = set(t.strip() for t in LoadDocuments(file='data/stopwords_en.txt')[0])
    elif L in ('id', 'indonesia', 'indonesian'):
        from spacy.lang.id import Indonesian
        lemmatizer = Indonesian()
        stops = set(t.strip() for t in LoadDocuments(file='data/stopwords_id.txt')[0])
    else:
        print('Warning: language not recognized. Empty stop word set given.')
        stops, lemmatizer = set(), None
    return stops, lemmatizer
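A minimal usage sketch for LoadStopWords, assuming the data/stopwords_id.txt file and the LoadDocuments helper used above are available.

stops, lemmatizer = LoadStopWords('id')
doc = lemmatizer("Saya sedang belajar pemrosesan bahasa alami")
print([t.text for t in doc if t.text.lower() not in stops])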
Example n. 6
 def infer(self, sentence, true_tags=None):
     self.model.eval()
     # tokenize sentence
     nlp = Indonesian()
     tokens = [token.text for token in nlp(sentence)]
     max_word_len = max([len(token) for token in tokens])
     # transform to indices based on corpus vocab
     numericalized_tokens = [
         self.data.word_field.vocab.stoi[token.lower()] for token in tokens
     ]
     numericalized_chars = []
     char_pad_id = self.data.char_pad_idx
     for token in tokens:
         numericalized_chars.append(
             [self.data.char_field.vocab.stoi[char] for char in token] +
             [char_pad_id for _ in range(max_word_len - len(token))])
     # find unknown words
     unk_idx = self.data.word_field.vocab.stoi[
         self.data.word_field.unk_token]
     unks = [
         t for t, n in zip(tokens, numericalized_tokens) if n == unk_idx
     ]
     # begin prediction
     token_tensor = torch.as_tensor(numericalized_tokens)
     token_tensor = token_tensor.unsqueeze(-1).to(self.device)
     char_tensor = torch.as_tensor(numericalized_chars)
     char_tensor = char_tensor.unsqueeze(0).to(self.device)
     predictions, _ = self.model(token_tensor, char_tensor)
     # convert results to tags
     predicted_tags = [
         self.data.tag_field.vocab.itos[t] for t in predictions[0]
     ]
     # print inferred tags
     max_len_token = max([len(token) for token in tokens] + [len('word')])
     max_len_tag = max([len(tag) for tag in predicted_tags] + [len('pred')])
     print(
         f"{'word'.ljust(max_len_token)}\t{'unk'.ljust(max_len_token)}\t{'pred tag'.ljust(max_len_tag)}"
         + ("\ttrue tag" if true_tags else ""))
     for i, token in enumerate(tokens):
         is_unk = "✓" if token in unks else ""
         print(
             f"{token.ljust(max_len_token)}\t{is_unk.ljust(max_len_token)}\t{predicted_tags[i].ljust(max_len_tag)}"
             + (f"\t{true_tags[i]}" if true_tags else ""))
     return tokens, predicted_tags, unks
Example n. 7
def transform_json_to_conll():
    nlp = Indonesian()
    file = "../data/processed/test/test.json"
    with open(file, "r") as f:
        annotations = json.load(f)
    random.seed(1339)
    random.shuffle(annotations)
    buffer_conll = {
        "val": "",
        "test": ""
    }
    for anno_i, annotation in enumerate(annotations):
        sorted_labels = sorted(annotation["labels"], key=lambda label: (label[0], label[1]))
        token_i = 0
        curr_label = sorted_labels[token_i] if len(sorted_labels) > 0 else None
        tokens = nlp(annotation["text"])
        for token in tokens:
            token_begin = token.idx
            token_end = token.idx + len(token.text)
            tag = "O"
            if curr_label and token_begin >= curr_label[0] and token_end <= curr_label[1]:
                tag = curr_label[2]
                if token_end == curr_label[1]:
                    tag = f"L-{tag}" if token_begin > curr_label[0] else f"U-{tag}"
                    if token_i < len(sorted_labels) - 1:
                        token_i += 1
                        curr_label = sorted_labels[token_i]
                elif token_begin == curr_label[0]:
                    tag = f"B-{tag}"
                else:
                    tag = f"I-{tag}"
            buffer_conll["val" if anno_i <= len(annotations) // 2 else "test"] += token.text + "\t" + tag + "\n"
        buffer_conll["val" if anno_i <= len(annotations) // 2 else "test"] += "\n"
    with open("../input/val.tsv", "w") as f:
        f.write(buffer_conll["val"])
    with open("../input/test.tsv", "w") as f:
        f.write(buffer_conll["test"])
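A self-contained sketch of the same span-to-BILOU mapping on a made-up annotation; the text, character spans, and labels are hypothetical.

from spacy.lang.id import Indonesian

nlp = Indonesian()
text = "Budi tinggal di Jakarta Selatan"
labels = [[0, 4, "PER"], [16, 31, "LOC"]]  # hypothetical [start, end, tag] character spans

for token in nlp(text):
    begin, end = token.idx, token.idx + len(token.text)
    tag = "O"
    for start, stop, name in labels:
        if begin >= start and end <= stop:
            if begin == start and end == stop:
                tag = f"U-{name}"  # single-token entity
            elif begin == start:
                tag = f"B-{name}"  # entity start
            elif end == stop:
                tag = f"L-{name}"  # entity end
            else:
                tag = f"I-{name}"  # inside the entity
    print(f"{token.text}\t{tag}")
# Budi -> U-PER, Jakarta -> B-LOC, Selatan -> L-LOC, all other tokens -> O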
Example n. 8
from html import unescape
from nltk import sent_tokenize
from unidecode import unidecode
import urllib3

# suppress urllib3 InsecureRequestWarning for unverified HTTPS requests
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
from gensim.models.ldamodel import LdaModel
from gensim.models.coherencemodel import CoherenceModel
from spacy.lang.id import Indonesian
from nltk.tag import CRFTagger
import spacy
nlp_en = spacy.load("en_core_web_sm")

nlp_id = Indonesian()
ct = CRFTagger()  # CRF-based Indonesian POS tagger (model file loaded below)
fTagger = 'data/all_indo_man_tag_corpus_model.crf.tagger'
ct.set_model_file(fTagger)


def NLPfilter(t, filters):
    # filters = set(['NN', 'NNP', 'NNS', 'NNPS', 'JJ'])
    tokens = nlp_id(t)
    tokens = [str(k) for k in tokens if len(k) > 2]
    hasil = ct.tag_sents([tokens])
    return [k[0] for k in hasil[0] if k[1] in filters]
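A minimal usage sketch for NLPfilter, assuming the CRF tagger model file above is present; it returns only the tokens whose POS tag is in the filter set (e.g. nouns and adjectives).

keep = {'NN', 'NNP', 'NNS', 'NNPS', 'JJ'}
print(NLPfilter("Mahasiswa baru mengikuti orientasi di kampus pusat", keep))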


def compute_coherence_values(dictionary,
                             corpus,
Example n. 9
class Preprocessing(object):
    def __init__(self):
        # init NLP
        self.nlp = Indonesian()

        # init flash text
        self.keyword_processor_slang_word = KeywordProcessor()
        self.keyword_processor_emoticon = KeywordProcessor()
        self.keyword_processor_meaning_text = KeywordProcessor()

        # init stemmer
        self.stemmer = StemmerFactory().create_stemmer()

        self.__init_flash_text_corpus()
        self.__init_custom_stop_word()

    def __init_flash_text_corpus(self):
        """ Init flash text corpus. """
        # build slang word corpus
        slang_words_raw = Repository.get_slang_word()
        for word in slang_words_raw.values:
            self.keyword_processor_slang_word.add_keyword(word[0], word[1])

        # build emoticon corpus
        emoticon_raw = constant.EMOTICON_LIST
        for key, values in emoticon_raw:
            for value in values:
                self.keyword_processor_emoticon.add_keyword(value, key)

        # build meaning word corpus
        meaning_words_raw = Repository.get_meaning_text()
        for word in meaning_words_raw.values:
            self.keyword_processor_meaning_text.add_keyword(word[0], word[1])

    def __init_custom_stop_word(self):
        """ Custom stop word for chat message content. """

        for stop_word in constant.STOP_WORD:
            self.nlp.vocab[stop_word].is_stop = True

        for stop_word in constant.EXC_STOP_WORD:
            self.nlp.vocab[stop_word].is_stop = False

    def cleaning(self, chat_message_list):
        """
        Pre-processing the content from ChatMessage.

        :param chat_message_list: dirty content from list of ChatMessage.
        :return: list of ChatMessage.
        """
        chat_message_list_temp = []

        if chat_message_list:
            logger.info('Pre-processing started...')
            start_time = time.time()
            chat_message_list = self.remove_repeated_message_from_agent(
                chat_message_list)
            for chat_message in chat_message_list:
                logger.info(f'BEFORE -> {chat_message.content}')
                content = self.__preprocessing_flow(chat_message.content)
                logger.info(f'AFTER -> {content}')
                chat_message.content = content
                if content.strip():
                    chat_message_list_temp.append(chat_message)

            logger.info(
                f'Pre-processing finished. {time.time() - start_time} seconds')
        else:
            logger.info('No chat message yet.')

        return chat_message_list_temp

    def cleaning_with_pipe(self, chat_message_list):
        """
        [DEPRECATED]
        Pre-processing the content from ChatMessage with multi threading from spaCy.

        :param chat_message_list: dirty content from list of ChatMessage.
        :return: list of ChatMessage.
        """

        if chat_message_list:
            logger.info('Pre-processing started...')
            start_time = time.time()
            index = 0

            chat_content_list = [
                chat_message.content for chat_message in chat_message_list
            ]
            for content in self.nlp.pipe(chat_content_list,
                                         n_threads=cpu_count()):
                chat_message_list[index].content = self.__preprocessing_flow(
                    content.text)
                index = index + 1

            logger.info(
                f'Pre-processing finished. {time.time() - start_time} seconds')
        else:
            logger.info('No chat message yet.')

        return chat_message_list

    def __preprocessing_flow(self, content):
        """ Preprocessing flow. """
        # normalize emoticon
        # content = PreprocessingUtilsV2.normalize_emoticon(content, self.keyword_processor_emoticon)

        content = str(content)

        # normalize url
        content = PreprocessingUtils.normalize_url(content)

        # remove url
        content = PreprocessingUtils.remove_url(content)

        # remove email
        content = PreprocessingUtils.remove_email(content)

        # remove digit number
        content = PreprocessingUtils.remove_digit_number(content)

        # case folding lower case
        content = PreprocessingUtils.case_folding_lowercase(content)

        # remove punctuation
        content = PreprocessingUtils.remove_punctuation(content)

        # remove repeated character
        content = PreprocessingUtils.remove_repeated_character(content)

        # normalize slang word
        content = PreprocessingUtilsV2.normalize_slang_word(
            content, self.keyword_processor_slang_word)

        # stemming, tokenize, remove stop word
        content = PreprocessingUtils.stemming(content, self.nlp, self.stemmer)

        # remove unused character
        content = PreprocessingUtils.remove_unused_character(content)

        # join negation word
        content = PreprocessingUtils.join_negation(content)

        # remove extra space between word
        content = PreprocessingUtils.remove_extra_space(content)

        # normalize word
        content = PreprocessingUtilsV2.normalize_meaning_word(
            content, self.keyword_processor_meaning_text)

        # remove stop word
        content = PreprocessingUtils.remove_stop_word(content, self.nlp)

        # TODO add another pre-processing if needed

        return content

    @staticmethod
    def identify_phrase(documents):
        """ documents : iterable of iterable of str """
        # gensim < 4.0 API: Phraser wrapper and a bytes delimiter
        bigram = Phraser(
            Phrases(documents, min_count=5, delimiter=b'_', threshold=1))
        trigram = Phraser(
            Phrases(bigram[documents],
                    min_count=5,
                    delimiter=b'_',
                    threshold=1))

        for i in range(len(documents)):
            for token in bigram[documents[i]]:
                if '_' in token:
                    documents[i].append(token)
            for token in trigram[documents[i]]:
                if '_' in token:
                    documents[i].append(token)
        return documents

    @staticmethod
    def remove_repeated_message_from_agent(message_history_list):
        """ documents : removed repeated chat message if repeat more than constant.MESSAGE_TEMPLATE_MIN_COUNT"""
        message_template_list = []
        message_history_list_temp = []
        counter = collections.Counter()

        for chat_message in message_history_list:
            if chat_message.sender_role == constant.SENDER_ROLE_AGENT:
                counter[chat_message.content] += 1

        for key, value in counter.items():
            if value > constant.MESSAGE_TEMPLATE_MIN_COUNT:
                message_template_list.append(key)

        for chat_message in message_history_list:
            if chat_message.content not in message_template_list:
                message_history_list_temp.append(chat_message)

        return message_history_list_temp
Example n. 10
def cleanText(T,
              fix={},
              lang='id',
              lemmatizer=None,
              stops=set(),
              symbols_remove=False,
              min_charLen=0):
    pattern = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )
    pattern1 = re.compile(
        r'pic.twitter.com/(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )
    t = re.sub(pattern, ' ', T)  #remove urls if any
    t = re.sub(pattern1, ' ', t)
    t = unescape(t)  # html entities fix
    t = t.lower().strip()  # lowercase
    t = unidecode(t)
    t = ''.join(''.join(s)[:1]
                for _, s in itertools.groupby(t))  # remove repetition
    t = sent_tokenize(t)  # sentence segmentation. String to list
    for i, K in enumerate(t):
        if symbols_remove:
            K = re.sub(r'[^.,a-zA-Z0-9 \n\.]', ' ', K)
            K = K.replace(',', ' ').replace('.', ' ')
            K = ''.join(c for c in K if c not in punctuation)
            K = re.sub(r'\s+', ' ', K).strip()

        cleanList = []
        if lang == 'en':
            lemmatizer = WordNetLemmatizer()
            listKata = word_tokenize(K)
            for token in listKata:
                if token in fix.keys():
                    token = fix[token]
                if lemmatizer:
                    token = lemmatizer.lemmatize(token)
                if stops:
                    if len(token) >= min_charLen and token not in stops:
                        cleanList.append(token)
                else:
                    if len(token) >= min_charLen:
                        cleanList.append(token)
            t[i] = ' '.join(cleanList)
        else:
            lemmatizer = Indonesian()
            K = lemmatizer(K)
            listKata = [token.text for token in K]
            for token in listKata:
                if token in fix.keys():
                    token = fix[token]

                if lemmatizer:
                    token = lemmatizer(token)[0].lemma_
                    token = stemmer.stem(token)
                if stops:
                    if len(token) >= min_charLen and token not in stops:
                        cleanList.append(token)
                else:
                    if len(token) >= min_charLen:
                        cleanList.append(token)
            t[i] = ' '.join(cleanList)
    return ' '.join(t)
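A minimal usage sketch for cleanText, reusing LoadStopWords from the example above; it assumes the module-level stemmer used in the Indonesian branch is a Sastrawi stemmer and that the remaining module imports (re, itertools, string.punctuation, NLTK punkt data) are in place.

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

stemmer = StemmerFactory().create_stemmer()
stops, _ = LoadStopWords('id')
print(cleanText("Kunjungi https://contoh.id sekaraaang!!!", lang='id',
                stops=stops, symbols_remove=True, min_charLen=3))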
Example n. 11
    for dirpath, dirs, files in os.walk(folderpath):
        for filename in fnmatch.filter(files, '*.txt'):
            filelist.append(dirpath + "/" + filename)
    return filelist


def writedataa(lines, thname):
    # write the unique, sorted sentences to a per-thread output file
    with open("sentence_rep_{}.txt".format(thname), "w") as file:
        for x in sorted(set(lines)):
            file.write(x + "\n")


nlp = Indonesian()


def tokenize_and_stem(text):
    text = u'{}'.format(text)
    doc = nlp(text)
    stems = [t.lemma_ for t in doc]
    stems = [t.lower() for t in stems]
    return stems


def tokenize_only(text):
    text = u'{}'.format(text)
    doc = nlp(text)
    stems = [t.text for t in doc]
    stems = [t.lower() for t in stems]
Example n. 12
 def __init__(self):
     self.nlp = Indonesian()
     # note: with Python 3, dtype=str (rather than '|S18') keeps the loaded stop words as str, not bytes
     self.nlp.Defaults.stop_words.update(genfromtxt('/media/faruq/FARUQ/PENS/semester6/datamining/program/django/ClassFormExample/newsletter/stopword.csv', dtype='|S18', delimiter=','))
     self.vectorizer = TfidfVectorizer(tokenizer=self.__tokenizer, ngram_range=(1, 1))
Example n. 13
 def __init__(self, lemmatize: bool) -> None:
     self.lemmatize = lemmatize
     self.nlp = Indonesian()
Example n. 14
 def __init__(self):
     self.sentences: List[List[str]] = []
     self.nlp: Language = Indonesian()
     self.nlp.add_pipe(self.nlp.create_pipe('sentencizer'))
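A minimal usage sketch for the sentencizer setup above (spaCy v2 API; in spaCy v3 the equivalent is nlp.add_pipe("sentencizer")).

from spacy.lang.id import Indonesian

nlp = Indonesian()
nlp.add_pipe(nlp.create_pipe('sentencizer'))
doc = nlp("Saya lapar. Ayo makan siang.")
print([sent.text for sent in doc.sents])  # ['Saya lapar.', 'Ayo makan siang.']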
Example n. 15
# import gensim
from gensim.models.ldamodel import LdaModel
from gensim.models import Phrases, TfidfModel
from gensim import corpora
from gensim.models.coherencemodel import CoherenceModel

from pickle import dump
from json import dumps
from datetime import datetime
from re import sub
import warnings
warnings.filterwarnings('ignore')

# import spacy
from spacy.lang.id import Indonesian, stop_words
nlp = Indonesian()  # use directly
stopwords = stop_words.STOP_WORDS
stopwords |= {"nya", "jurusan", "jurus", "the", "of"}


def preprocessing(text):
    text = pre.remove_tag(text)  #Remove Tag
    text = pre.remove_whitespace(text)  #Remove Whitespace
    text = pre.lower(text)  #Lower
    text = pre.remove_link(text)  #Remove Link
    text = pre.alphabet_only(text)  #Get Alphabet
    text = sub(r'sobat pintar', '', text)  # sorry:(
    text = pre.remove_whitespace(text)  #Remove Whitespace
    text = [token.text for token in nlp(text)]  #Token
    text = pre.slang(text)
    text = [
Example n. 16
 def __init__(self):
     self._nlp = Indonesian()
Example n. 17
import csv
import os
from nltk import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
import re
from spacy.lang.id import Indonesian
import spacy

nlp_indonesia = Indonesian()  # use directly
nlp_indonesia = spacy.blank('id')  # equivalent blank Indonesian pipeline; this overrides the line above


# procedure clean noise dataset
def preprocessing_text(text):
    text = text.lower()
    #text = re.sub('[^a-zA-Z0-9 .,?!]', '', text)
    text = re.sub(r'[^a-zA-Z\s.,?!]', u'', text, flags=re.UNICODE)

    #    for r in (
    #            (" ku", " aku"), (" gw", " aku"), (" saya", " aku"), (" gue", " aku"), (" gua", " aku"),
    #            (" anda", " kamu"), (" lu", " kamu"), (" kau", " kamu"), (" mu", " kamu"),
    #            (" dia", " dia"), (" doi", " dia"),
    #            (" kita", " kami"),
    #            (" tak", " tidak"), (" engga", " tidak"), (" enggak", " tidak"), (" ga", " tidak"), (" gak", " tidak"),
    #            (" ya", " iya"), (" yes", " iya"), (" yoi", " iya"), (" yah", " iya"),
    #            (" hei", " hai"), (" hey", " hai"), (" halo", " hai"), (" hay", " hai")
    #    ):
    #        text = text.replace(*r)

    aku = ['ku', 'gw', 'saya', 'gue', 'gua']
    kamu = ['anda', 'lu', 'kau', 'mu']