Code Example #1
class Dataset(object):
    """
    Yield split sentences from a TSV dataset
    Dataset content example:

    first sentence  0
    second sentence 1
    """
    def __init__(self, csv_file, stem=True, stopwords=True, verbose=True):
        self.csv_file = csv_file
        self.stemmer = StemmerFactory().create_stemmer() if stem else None
        self.stopwords = []
        if stopwords:
            with open(STOPWORDS_FILE, 'r') as f:
                self.stopwords = f.read().splitlines()

    def __iter__(self):
        with open(self.csv_file, 'r') as f:
            reader = csv.reader(f, delimiter='\t')
            for row in reader:
                sentence = []
                if self.stemmer:
                    for token in self.stemmer.stem(row[0]).split():
                        if token not in self.stopwords:
                            sentence.append(token)
                else:
                    for token in row[0].lower().split():
                        if token not in self.stopwords:
                            sentence.append(token)
                if sentence:
                    yield sentence
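A minimal usage sketch for the class above (not part of the original project): it assumes the Sastrawi package is installed, and the TSV path and stopword file name below are placeholders.

import csv
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

STOPWORDS_FILE = 'stopwords_id.txt'  # placeholder: one stopword per line
dataset = Dataset('sentiment.tsv')   # placeholder TSV: "<sentence>\t<label>"
for tokens in dataset:
    print(tokens)  # one list of stemmed, stopword-filtered tokens per row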
Code Example #2
def Preprocessing(data):
    print("Preprocessing")
    cleanData = []
    tokenizer = RegexpTokenizer(r'\w+')
    factory_stopwords = StopWordRemoverFactory()
    stopwordsFact = factory_stopwords.get_stop_words()
    stemmer = StemmerFactory().create_stemmer()
    count = 0
    for kalimat in data:
        removedHttp = re.sub(r"http\S+", '', kalimat)  # remove http(s) links
        removedPic = re.sub(r"pic\.twitter\S+", '',
                            removedHttp)  # remove pic.twitter links
        lower = removedPic.lower()  # case folding
        tokenized = tokenizer.tokenize(lower)  # tokenization + punctuation removal
        stopwords = []  # stopword removal
        for kata in tokenized:
            if kata not in stopwordsFact:
                stopwords.append(kata)
        stemmed = []  # stemming
        for kata in stopwords:
            stemmed.append(stemmer.stem(kata))
        cleanData.append(stemmed)
        count += 1
        print(count)
    return cleanData
Code Example #3
def cleanText(T,
              fix={},
              lemma=False,
              lan='id',
              stops=set(),
              symbols_remove=True,
              min_charLen=2,
              max_charLen=15,
              fixTag=False,
              fixMix=True):
    if lemma and lan.lower().strip() == 'id':
        lemma = StemmerFactory().create_stemmer()

    pattern = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )
    t = re.sub(pattern, ' ', T)  #remove urls if any
    pattern = re.compile(
        r'ftp[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )
    t = re.sub(pattern, ' ', t)  #remove urls if any
    t = unescape(t)  # html entities fix
    if fixTag:
        t = fixTags(t)  # fix abcDef
    t = t.lower().strip()  # lowercase
    t = unidecode(t)
    t = ''.join(''.join(s)[:2]
                for _, s in itertools.groupby(t))  # limit character repetitions to two
    t = t.replace('\n', ' ').replace('\r', ' ')
    t = sent_tokenize(t)  # sentence segmentation. String to list
    for i, K in enumerate(t):
        if symbols_remove:
            K = re.sub(r'[^.,_a-zA-Z0-9 -\.]', ' ', K)
        if lemma and lan.lower().strip() == 'id':
            listKata = [str(tok) for tok in TextBlob(lemma.stem(K)).words]
        elif lemma and lan.lower().strip() == 'en':
            listKata = [str(tok.lemma_) for tok in nlp_en(K)]
        else:
            listKata = [str(tok) for tok in TextBlob(K).words]

        if fix:
            for j, token in enumerate(listKata):
                if token in fix.keys():
                    listKata[j] = fix[token]
        if stops:
            listKata = [
                tok for tok in listKata
                if tok not in stops and len(tok) >= min_charLen
            ]
        else:
            listKata = [tok for tok in listKata if len(tok) >= min_charLen]

        listKataFixed = []
        if fixMix:
            for j, tok_ in enumerate(listKata):
                listKataFixed += re.split(r'(\d+)', tok_)  # split letters from digits
            listKata = listKataFixed

        t[i] = ' '.join(listKata)
    return ' '.join(t)  # return the text as a single string again
Code Example #4
class Preprocessor:
    def __init__(self):
        self.stopwords = StopWordRemoverFactory().get_stop_words()
        self.stemmer = StemmerFactory().create_stemmer()

    def clean(self, words):
        return words.translate(str.maketrans("", "", ".,!?\"'#@%&/();:"))

    def stemming(self, words):
        return self.stemmer.stem(self.clean(words))

    def tokenizing(self, str, delimiter=" "):
        return str.split(delimiter)

    def preprocess(self, words):
        return [
            token for token in self.tokenizing(self.stemming(words))
            if token not in self.stopwords
        ]

    def selected_preprocess(self, words, selected_words):
        return [
            token for token in self.tokenizing(self.stemming(words))
            if token not in self.stopwords and token in selected_words
        ]
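A short usage sketch for the Preprocessor above; the sample sentence is arbitrary and the printed tokens depend on Sastrawi's dictionary, so the expected output is only approximate.

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

pre = Preprocessor()
print(pre.preprocess("Perekonomian Indonesia sedang bertumbuh!"))
# roughly: ['ekonomi', 'indonesia', 'tumbuh']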
Code Example #5
class FeatureAnnotator:
    def __init__(self):
        self.nlp = stanfordnlp.Pipeline(lang="id",use_gpu=False, silent=True)
        self.stemmer = StemmerFactory().create_stemmer()
        self.ner = get_entities
        # Set POS Tagger 
        self.pos_tagger = nltk.tag.CRFTagger()
        self.pos_tagger.set_model_file('pretrained/pos_tagger/all_indo_man_tag_corpus_model.crf.tagger')

    def annotate(self, sentence):
        annotation = defaultdict(list)
        sentence = sentence.translate(str.maketrans('', '', string.punctuation))
        doc = self.nlp(sentence)
        
        annotation['ner_tags'] = self.ner(sentence)
        
        word_dict = defaultdict(int)
        
        for sent in doc.sentences:
            for idx, word in enumerate(sent.words):
                annotation['tokens'].append(word.text)
                stemmed_word = self.stemmer.stem(word.text)                
                if (annotation['ner_tags'][idx] in ['PER','ORG']):
                    stemmed_word = word.text.lower()
                annotation['lemmas'].append(stemmed_word+'_{}'.format(word_dict[stemmed_word]))
                annotation['dependency'].append(dict(relation=word.dependency_relation, head=word.governor))
        
        annotation['pos_tags'] = [tag[1] for tag in self.pos_tagger.tag(annotation['tokens'])]
                    
        return annotation
Code Example #6
    def preprocess(self, documents):
        print("[{}] Preprocessing...".format(dt.now()))
        stemmer = StemmerFactory().create_stemmer()
        stopwords = self.data.get_stopword()
        formal_dict = self.data.get_formalization()
        formal_pattern = re.compile(r'\b(' + '|'.join(formal_dict.keys()) +
                                    r')\b')
        url_pattern = re.compile(
            r'((http[s]?|ftp):\/)?\/?([^:\/\s]+)((\/\w+)*\/)([\w\-\.]+[^#?\s]+)(.*)?(#[\w\-]+)?'
        )
        digit_symbol_pattern = re.compile(r'\d+|[^\w\s]')
        user_handler_pattern = re.compile(r'@\w+')
        processed_docs = []
        print("  ", end='')
        runtime = []
        for doc in documents:
            st = timeit.default_timer()
            new_doc = re.sub(url_pattern, "", doc)
            new_doc = re.sub(user_handler_pattern, "", new_doc)
            new_doc = re.sub(digit_symbol_pattern, " ", new_doc)
            new_doc = new_doc.lower()
            new_doc = formal_pattern.sub(lambda x: formal_dict[x.group()],
                                         new_doc)
            new_doc = stemmer.stem(new_doc)
            new_doc = new_doc.split()
            new_doc = [word for word in new_doc if word not in stopwords]

            print('.', end='')
            processed_docs.append(new_doc)
            runtime.append(timeit.default_timer() - st)
        self.data.save_processed_docs(processed_docs)
        print("\nAverage Preprocessing time : " +
              str(sum(runtime) / len(runtime)))
        return processed_docs
Code Example #7
def stemming(documents):
    stemmer = StemmerFactory().create_stemmer()
    stemmed = []
    for document in documents:
        words = []
        for word in document:
            words.append(stemmer.stem(word))
        stemmed.append(words)

    return stemmed
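Assuming the documents are already tokenized into lists of words, the helper above can be used as sketched below; the stemmed outputs shown are approximate.

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

docs = [["memakan", "makanan"], ["berlari", "kencang"]]
print(stemming(docs))
# roughly: [['makan', 'makan'], ['lari', 'kencang']]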
Code Example #8
File: taubot_logic.py Project: hipzulaj/taubot
def preprocessing(dataset):
    stemmer = StemmerFactory().create_stemmer()
    stopwords = StopWordRemoverFactory().create_stop_word_remover()
    for row in dataset:
        row['message'] = row.get('message').casefold()
        row['message'] = re.sub(r"[0-9]", "", row.get('message'))
        row['message'] = re.sub('[' + string.punctuation + ']', "",
                                row.get('message'))
        row['message_stopwords'] = stopwords.remove(row['message'])
        row['message_stemmed'] = stemmer.stem(row['message_stopwords'])
        row['message_tokenized'] = word_tokenize(row['message_stemmed'])
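The function above mutates its rows in place. A hedged usage sketch with a made-up row; NLTK's 'punkt' data must be available for word_tokenize.

import re
import string
from nltk.tokenize import word_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

rows = [{'message': 'Apakah kamu sudah makan 2 kali hari ini?'}]
preprocessing(rows)
print(rows[0]['message_tokenized'])  # cleaned, stopword-free, stemmed tokens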
Code Example #9
File: main.py Project: Nurhalizah/pythonProject1
def clean(doc):
    # remove unimportant words (stopwords)
    stop_free = " ".join([i for i in doc.lower().split() if i not in stopword])
    # remove punctuation
    punc_free = ''.join(ch for ch in stop_free if ch not in punctuation)
    # reduce words to their base form (stemming)
    stemmer = StemmerFactory().create_stemmer()
    normalized = stemmer.stem(punc_free)
    # remove digits
    processed = re.sub(r"\d+", "", normalized)
    # split the document into a list of its words
    y = processed.split()
    return y
Code Example #10
 def __init__(self, input, file_location):
     data = self.dataFromFile(file_location)
     stopword = StopWordRemoverFactory().create_stop_word_remover()
     stemmer = StemmerFactory().create_stemmer()
     input = stopword.remove(input.lower())
     input = stemmer.stem(input)
     valid = 0
     for i in range(len(data)):
         kal = stopword.remove(data[i][0].lower())
         kal = stemmer.stem(kal)
         if (self.bm(input.lower(), kal.lower()) != -1):
             if (valid == 0):
                 percent = len(input) * 100 / len(kal)
                 # print("Confidence1 : " + str(percent))
                 if (percent > 80):
                     self.answere = data[i][1]
                 valid = 1
         else:
             if valid == 0:
                 if (self.bm2(input.lower(), kal.lower()) >= 80):
                     # print("Confidence2 : " + str(bm2(input.lower(), kal.lower())))
                     self.answere = data[i][1]
                     valid = 1
Code Example #11
    def clean_text(self, data):
      stopword = StopWordRemoverFactory().create_stop_word_remover()
      stemmer = StemmerFactory().create_stemmer()

      # keep letters only and lowercase everything
      data = re.sub('[^a-zA-Z]', ' ', str(data).lower())
      # normalize Indonesian slang/abbreviations (raw strings so \b is a word boundary)
      data = re.sub(r'\byok\b|\byuk\b', 'ayo', data)
      data = re.sub(r'\bmager\b', 'males', data)
      data = re.sub(r'\bmalas\b', 'males', data)
      data = re.sub(r'\bmls\b', 'males', data)
      data = re.sub(r'\bkuy\b', 'yuk', data)
      data = re.sub(r'\borg\b', 'orang', data)
      data = re.sub(r'\bjg\b', 'juga', data)
      data = re.sub(r'\budh\b', 'sudah', data)
      data = re.sub(r'\bmangat\b', 'semangat', data)
      data = re.sub(r'\bcemungut\b', 'semangat', data)
      data = re.sub(r'\bgas\b', 'yuk', data)
      data = re.sub(r'\benakeun\b', 'enak', data)
      data = re.sub(r'\bnaek\b', 'naik', data)
      data = re.sub(r'\bmmg\b', 'memang', data)
      data = re.sub(r'\bga\b', 'engga', data)
      data = re.sub(r'\bengga\b', 'tidak', data)
      data = re.sub(r'\bttg\b', 'tentang', data)
      data = re.sub(r'\brush hour\b', 'jam sibuk', data)
      data = re.sub(r'\bku\b', 'aku', data)
      data = re.sub(r'\bgak\b', 'tidak', data)
      data = re.sub(r'\bdgn\b', 'dengan', data)
      data = re.sub(r'\bbailk\b', 'pulang', data)
      data = re.sub(r'\bgatau\b', 'tidak tahu', data)
      data = re.sub(r'\bbat\b', 'banget', data)
      data = re.sub(r'\bampe\b', 'sampai', data)
      data = re.sub(r'\blg\b', 'sedang', data)
      data = re.sub(r'\banjay\b', 'asik', data)
      data = re.sub(r'\banjg\b', 'anjing', data)
      data = re.sub(r'\banjiing\b', 'anjing', data)
      data = re.sub(r'\bantum\b', 'kamu', data)
      data = re.sub(r'\basiq\b|\basyique\b|\basik\b', 'asyik', data)
      data = re.sub(r'\bbgt\b|\bbanget\b|\bbanged\b', 'sangat', data)
      data = re.sub(r'\bribet\b', 'repot', data)

      data = data.split()
      data = ' '.join(data)

      # after the replacements above, remove stopwords and affixes below
      # Sastrawi stopword removal
      data = stopword.remove(data)  # the stopword list is provided by Sastrawi
      # Sastrawi stemming
      data = stemmer.stem(data)

      return data
Code Example #12
def preprocess_text(input):
  # lowercase all characters in the text
  text = input[0]
  text = text.lower()
  #remove punctuation
  text = text.translate(str.maketrans("","",string.punctuation))
  #remove leading and trailing whitespace
  text = text.strip()
  #remove StopWord
  stopword = StopWordRemoverFactory().create_stop_word_remover()
  text = stopword.remove(text)
  #stemming
  stemmer = StemmerFactory().create_stemmer()
  text = stemmer.stem(text)
  return text
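Note that preprocess_text reads input[0], so it expects a one-element sequence (for example a list or a DataFrame row) rather than a bare string. A small sketch with a made-up sentence; the output shown is approximate.

import string
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

row = ["Saya sedang MEMBACA buku di perpustakaan."]
print(preprocess_text(row))  # roughly: "baca buku pustaka"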
Code Example #13
class Preprocessor():
    def __init__(self):
        self.stopwords = StopWordRemoverFactory().get_stop_words()
        self.stemmer = StemmerFactory().create_stemmer()

    def stemming(self, words):
        return self.stemmer.stem(words)

    def tokenizing(self, str, delimiter=" "):
        return str.split(delimiter)

    def preprocess(self, words):
        return [
            token for token in self.tokenizing(self.stemming(words))
            if token not in self.stopwords
        ]
Code Example #14
def extract_text(extracted_path, id_wiki, stem):
    if os.path.isfile(extracted_path):
        return None
    if stem:
        print(
            'Warning: using the stemmer can slow down the extraction process')
        stemmer = StemmerFactory().create_stemmer()
    with open(extracted_path, 'w') as f:
        i = 0
        for text in id_wiki.get_texts():
            text = ' '.join(text)
            text = stemmer.stem(text) if stem else text
            f.write(text + '\n')
            i += 1
            if i % (10 if stem else 1000) == 0:
                print(str(i), 'articles processed')
        print('total:', str(i))
    return None
Code Example #15
class Preprocess:
    def __init__(self):
        self.stemmer = StemmerFactory().create_stemmer()
        self.remover = StopWordRemoverFactory().create_stop_word_remover()

    def preprocess(self, text):
        # 1. stemming
        text_stem = self.stemmer.stem(text)

        # 2. remove stop words
        text_clean = self.remover.remove(text_stem)

        # 3. tokenization: lowercase, strip punctuation, split on whitespace
        lowercase = text_clean.lower()
        preprocessed_text = lowercase.translate(
            str.maketrans('', '', string.punctuation)).split()

        return preprocessed_text
Code Example #16
File: taubot_logic.py Project: hipzulaj/taubot
def respond(strg):
    levenshtein = Levenshtein()
    stemmer = StemmerFactory().create_stemmer()
    stopwords = StopWordRemoverFactory().create_stop_word_remover()

    kategori = model.predict([strg])

    txt = stopwords.remove(strg)
    txt = stemmer.stem(txt)

    best = 1000
    res = []

    for words in dataset:
        if (words['category'] == kategori):
            distance = levenshtein.distance(txt, words['message_stemmed'])

            if (distance < best):
                best = distance
                res = words
    return res['respond']
Code Example #17
class SimpleIndonesianPreprocessor(BaseEstimator, TransformerMixin):
    """
    Simple Indonesian text preprocessor
    """
    def __init__(self, stem=True, stopwords=True, verbose=True):
        self.stemmer = StemmerFactory().create_stemmer() if stem else None
        self.stopwords = []
        if stopwords:
            with open(STOPWORDS_FILE, 'r') as f:
                self.stopwords = f.read().splitlines()
        self.verbose = verbose

    def fit(self, X, y=None):
        return self

    def inverse_transform(self, X):
        return [" ".join(doc) for doc in X]

    def transform(self, X):
        results = []
        if self.verbose:
            print('Preprocessing..')
            bar = progressbar.ProgressBar()
            for doc in bar(X):
                results.append(list(self.tokenize(doc)))
            return results
        else:
            return [list(self.tokenize(doc)) for doc in X]

    def tokenize(self, document):
        if self.stemmer:
            # stem and split by whitespaces
            for token in self.stemmer.stem(document).split():
                if token not in self.stopwords:
                    yield token
        else:
            for token in document.lower().split():
                if token not in self.stopwords:
                    yield token
Code Example #18
class NERFeatureExtractor:
    def read_label_file(self, filename):
        return open(filename).read().split('\n')

    def __init__(self, iob_predictor):
        self.iob_predictor = iob_predictor
        self.stemmer = StemmerFactory().create_stemmer()
        self.TAGGER3 = CRFTagger()
        self.TAGGER3.set_model_file('all_indo_man_tag_corpus_model.crf.tagger')
        self.label_words = self.read_label_file('label-words.txt')
        self.label_posses = self.read_label_file('label-posses.txt')
        self.label_lemmas = self.read_label_file('label-lemmas.txt')
        self.label_iob_feature = self.read_label_file('label-iob_feature.txt')
        self.label_iob_classes = self.read_label_file('label-iob_classes.txt')

    def getPOSTag(self, _temporary_tokens):
        strin = []
        for token_tag in _temporary_tokens:
            strin.append(unicode(token_tag.decode('utf-8')))

        return [(token.encode('ascii',
                              'ignore'), tag.encode('ascii', 'ignore'))
                for (token, tag) in self.TAGGER3.tag_sents([strin])[0]]

    def features(self, tokens, index, history):
        # print history
        # print tokens
        """
		`tokens`  = a POS-tagged sentence [(w1, t1), ...]
		`index`   = the index of the token we want to extract features for
		`history` = the previous predicted IOB tags
		"""

        # Pad the sequence with placeholders
        tokens = [
            ('[START2]', '[START2]'), ('[START1]', '[START1]')
        ] + list(tokens) + [('[END1]', '[END1]'), ('[END2]', '[END2]')]
        history = ['[START2]', '[START1]'] + list(history)

        # shift the index with 2, to accommodate the padding
        index += 2

        word, pos = tokens[index]
        prevword, prevpos = tokens[index - 1]
        prevprevword, prevprevpos = tokens[index - 2]
        nextword, nextpos = tokens[index + 1]
        nextnextword, nextnextpos = tokens[index + 2]
        previob = history[index - 1]
        contains_dash = '-' in word
        contains_dot = '.' in word
        allascii = all(c in string.ascii_lowercase for c in word)

        allcaps = word == word.capitalize()
        capitalized = word[0] in string.ascii_uppercase

        prevallcaps = prevword == prevword.capitalize()
        prevcapitalized = prevword[0] in string.ascii_uppercase

        nextallcaps = nextword == nextword.capitalize()
        nextcapitalized = nextword[0] in string.ascii_uppercase

        return [
            word,
            str(self.stemmer.stem(word)),
            str(pos),
            str(allascii),
            str(nextword),
            str(self.stemmer.stem(nextword)),
            str(nextpos),
            str(nextnextword),
            str(nextnextpos),
            str(prevword),
            str(self.stemmer.stem(prevword)),
            str(prevpos),
            str(prevprevword),
            str(prevprevpos),
            str(previob),
            str(contains_dash),
            str(contains_dot),
            str(allcaps),
            str(capitalized),
            str(prevallcaps),
            str(prevcapitalized),
            str(nextallcaps),
            str(nextcapitalized)
        ]

    def normalizeFeature(self, featx):
        out = []
        if featx[0] in self.label_words:
            out.append(self.label_words.index(featx[0]))
        else:
            out.append(-1)

        if featx[1] in self.label_lemmas:
            out.append(self.label_lemmas.index(featx[1]))
        else:
            out.append(-1)

        if featx[2] in self.label_posses:
            out.append(self.label_posses.index(featx[2]))
        else:
            out.append(-1)

        out.append(1 if featx[3] else 0)

        if featx[4] in self.label_words:
            out.append(self.label_words.index(featx[4]))
        else:
            out.append(-1)

        if featx[5] in self.label_lemmas:
            out.append(self.label_lemmas.index(featx[5]))
        else:
            out.append(-1)

        if featx[6] in self.label_posses:
            out.append(self.label_posses.index(featx[6]))
        else:
            out.append(-1)

        if featx[7] in self.label_words:
            out.append(self.label_words.index(featx[7]))
        else:
            out.append(-1)

        if featx[8] in self.label_posses:
            out.append(self.label_posses.index(featx[8]))
        else:
            out.append(-1)

        if featx[9] in self.label_words:
            out.append(self.label_words.index(featx[9]))
        else:
            out.append(-1)

        if featx[10] in self.label_lemmas:
            out.append(self.label_lemmas.index(featx[10]))
        else:
            out.append(-1)

        if featx[11] in self.label_posses:
            out.append(self.label_posses.index(featx[11]))
        else:
            out.append(-1)

        if featx[12] in self.label_words:
            out.append(self.label_words.index(featx[12]))
        else:
            out.append(-1)

        if featx[13] in self.label_posses:
            out.append(self.label_posses.index(featx[13]))
        else:
            out.append(-1)

        if featx[14] in self.label_iob_feature:
            out.append(self.label_iob_feature.index(featx[14]))
        else:
            out.append(-1)

        out.append(1 if featx[15] else 0)
        out.append(1 if featx[16] else 0)
        out.append(1 if featx[17] else 0)
        out.append(1 if featx[18] else 0)
        out.append(1 if featx[19] else 0)
        out.append(1 if featx[20] else 0)
        out.append(1 if featx[21] else 0)
        out.append(1 if featx[22] else 0)

        return out

    def parseEntityName(self, _sent=""):
        tokens = self.getPOSTag(_sent.split())
        history = []
        self.res_all = []
        last_feature = []
        for i in range(len(tokens)):
            last_feature = self.features(tokens, i, history)
            iob_res = self.iob_predictor([self.normalizeFeature(last_feature)
                                          ])[0]
            history.append(iob_res)
            self.res_all.append((tokens[i], self.label_iob_classes[iob_res]))
Code Example #19
class Stemmer:
  def __init__(self):
    self.stemmer = StemmerFactory().create_stemmer()

  def stem(self, text):
    return self.stemmer.stem(text)
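The thin wrapper above can be exercised as follows; the sentence is the example from Sastrawi's own documentation and the output is shown approximately.

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

stemmer = Stemmer()
print(stemmer.stem("Perekonomian Indonesia sedang dalam pertumbuhan yang membanggakan"))
# roughly: "ekonomi indonesia sedang dalam tumbuh yang bangga"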
Code Example #20
class Preprocess:
    def __init__(self, preprocessing_dataset=None):
        self.tokenizer = TweetTokenizer()
        self.stop_words = dictionary.get_stop_words()
        self.base_words = dictionary.get_base_words()
        self.slang_words = dictionary.get_slang_words()
        self.stemmer = StemmerFactory().create_stemmer()
        self.preprocessing_dataset = preprocessing_dataset

    def case_folding(self, document):
        document = document.lower()
        return document

    def clean(self, document):
        document = re.sub(
            r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+',
            '', document)  # URLs
        document = re.sub(r'RT', '', document)  # Retweet
        document = re.sub(r'(?:\#+[\w_]+[\w\'_\-]*[\w_]+)', '',
                          document)  # Hashtag
        document = re.sub(r'(?:@[\w_]+)', '', document)  # Mention
        document = re.sub(r'[^\x00-\x7F]+', '', document)  # Unicode
        document = re.sub(r'rt', '', document)  # Retweet
        document = re.sub(r'(?:[:=;][oO\-]?[D\)\]\(\]/\\OpP])', '',
                          document)  # Emoticon
        document = re.sub(r'(?:(?:\d+,?)+(?:\.?\d+)?)', '',
                          document)  # Special Char
        document = re.sub(r'[\n\t\r]+', '',
                          document)  # Remove linebreak, tab, return
        return document

    def tokenize(self, document):
        tokenized_document = self.tokenizer.tokenize(document)
        return tokenized_document

    def stopword_removal(self, document):
        document = [word for word in document if word not in self.stop_words]
        return document

    def stem(self, document):
        document = self.stemmer.stem(document)
        return document

    def slang_word_correction(self, document):
        pattern = re.compile(r'\b(' + '|'.join(self.slang_words.keys()) +
                             r')\b')
        document = pattern.sub(lambda x: self.slang_words[x.group()], document)
        return document

    def base_word_check(self, document):
        document = [word for word in document if word in self.base_words]
        return document

    def preprocess(self, document):
        document = self.case_folding(document)
        document = self.clean(document)
        document = self.slang_word_correction(document)
        document = self.stem(document)
        document = re.sub(r'[^a-zA-Z ]', '', document)  # Special Char
        document = self.tokenize(document)
        document = self.stopword_removal(document)
        document = self.base_word_check(document)
        return document

    def list_to_string(self, document):
        str = ' '.join(document)
        return str

    def save_preprocessed_text(self, file_name, document):
        with open(file_name, 'w') as outfile:
            json.dump(document, outfile, indent=4)

        url = 'https://unikom-sentiment-services.azurewebsites.net/upload-ps'
        files = {'json': open(file_name, 'rb')}
        request = requests.post(url, files=files)
        if (request.status_code == 200):
            return True

        return False

    def load_preprocessed_text(self):
        collection = self.preprocessing_dataset.find({})
        if collection.count() == 0:
            return False

        for preprocessing_data in collection:
            words_vocabulary = preprocessing_data['data'][0]
            classes = preprocessing_data['data'][1]

        return words_vocabulary, classes
Code Example #21
class HadistRetrieval:
    def __init__(self):
        self.stopwords = stopwords
        self.stemmer = StemmerFactory().create_stemmer()
        self.hadist = hadist
        vectorizer = TfidfVectorizer()
        self.X = vectorizer.fit_transform(self.hadist.Processed)
        self.features = vectorizer.get_feature_names()

    def _text_lower(self, text: str) -> str:
        return text.lower()

    def _remove_entities(self, text: str) -> str:
        return re.sub(r'\[[^]]*\]', '', text)

    def _case_folding(self, text: str) -> str:
        return re.sub(r'[^a-z]', ' ', re.sub("'", '', text))

    def _stemming(self, text: str) -> str:
        return self.stemmer.stem(text)

    def _stopwords_removal(self, text: str) -> str:
        texts_token = text.split()
        not_stopword = []
        for token in texts_token:
            if token not in self.stopwords:
                not_stopword.append(token)
        return ' '.join(not_stopword)

    def _preprocessing(self, text: str) -> str:
        tx_lower = self._text_lower(text)
        tx_remove_entities = self._remove_entities(tx_lower)
        tx_case_folding = self._case_folding(tx_remove_entities)
        tx_stemming = self._stemming(tx_case_folding)
        return self._stopwords_removal(tx_stemming)

    def retrieve(self, sentence: str, n: int = 5) -> List[Mapping[str, str]]:
        sent_prep = self._preprocessing(sentence)
        query = sent_prep.split()
        res = np.zeros(self.X.shape[0])
        not_in_corpus = []
        output: List[Mapping[str, str]] = []

        for keyword in query:
            try:
                res += self.X.toarray()[:,self.features.index(keyword)]
            except ValueError:  # keyword is not in the TF-IDF vocabulary
                not_in_corpus.append(keyword)
                res = np.zeros(self.X.shape[0])

        top_idx = np.argsort(-res)[:n]

        if not sum(res) > 0:
            raise ValueError("didn't match, something went wrong")

        for i in range(len(top_idx)):
            # res[top_idx[i]]
            output.append({
                'source': self.hadist.iloc[top_idx[i]][4],
                'text': self.hadist.iloc[top_idx[i]][2]
            })

        return output
Code Example #22
def stem(data):
    new_data = data.copy()
    stemmer = StemmerFactory().create_stemmer()

    return list(map(lambda s: stemmer.stem(s), new_data))
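Usage sketch for the helper above: it stems every string in the input list and returns a new list. The sample sentences and outputs are illustrative only.

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

sentences = ["mereka bermain di lapangan", "pembangunan berjalan lambat"]
print(stem(sentences))
# roughly: ['mereka main di lapang', 'bangun jalan lambat']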
Code Example #23
 def stemming(self, token):
     stemmer = StemmerFactory().create_stemmer()
     stemmed = [stemmer.stem(' '.join(row)).split() for row in token]
     self.print_arr("Setelah Stemming:", stemmed)
     return stemmed
Code Example #24
File: TextSummarizer.py Project: gslayer0/fp_Datmin
class TextSummarizer:
    def __init__(self, title: str, plot: str, human_synopsis: str):
        self.title = title
        self.plot = plot
        self.human_synopsis = human_synopsis
        self.stopwords = StopWordRemoverFactory().create_stop_word_remover()
        self.stemmer = StemmerFactory().create_stemmer()

    def __text_to_sentences(self, text: str) -> List[str]:
        regex = re.compile(r'\.\n\n|\.\n|\. |\.$')
        sentences = regex.split(text)
        return sentences

    def __stem_sentence(self, sentence: str) -> str:
        return self.stemmer.stem(sentence)

    def __stop_word_removal(self, words: List[str]) -> List[str]:
        temp_words = []
        for word in words:
            if word.lower() in self.title.lower():
                temp_words.append(word)
            else:
                temp = self.stopwords.remove(word)
                if temp:
                    temp_words.append(temp)

        return temp_words

    def __preprocess_text(self, text: str) -> tuple:
        temp_sentences = self.__text_to_sentences(text)
        sentences = []
        preprocessed_sentences = []
        for sentence in temp_sentences:
            if len(sentence) < 2:
                continue

            stemmed_sentence = self.__stem_sentence(sentence.lower())
            tokenized_sentence = nltk.tokenize.word_tokenize(stemmed_sentence)
            removed_stop_word_sentence = self.__stop_word_removal(
                tokenized_sentence)

            if len(removed_stop_word_sentence) < 2:
                continue

            sentences.append(sentence)
            preprocessed_sentences.append(removed_stop_word_sentence)

        return sentences, preprocessed_sentences

    def __sentence_similarity(self, sent1, sent2):
        """
        Calculate the similarity between two sentences.
        Returns the cosine similarity of their bag-of-words vectors.
        """
        sent1 = [w.lower() for w in sent1]
        sent2 = [w.lower() for w in sent2]

        all_words = list(set(sent1 + sent2))

        vector1 = [0] * len(all_words)
        vector2 = [0] * len(all_words)

        # build the vector for the first sentence
        for w in sent1:
            vector1[all_words.index(w)] += 1

        # build the vector for the second sentence
        for w in sent2:
            vector2[all_words.index(w)] += 1

        return 1 - cosine_distance(vector1, vector2)

    def __build_similarity_matrix(self, sentences):
        """
        make a matrix to plot the similarity between sentences in a file
        return matrix
        """
        # Create an empty similarity matrix
        similarity_matrix = np.zeros((len(sentences), len(sentences)))

        for idx1 in range(len(sentences)):
            for idx2 in range(len(sentences)):
                if idx1 == idx2:  # ignore if both are same sentences
                    continue
                similarity_matrix[idx1][idx2] = self.__sentence_similarity(
                    sentences[idx1], sentences[idx2])

        return similarity_matrix

    def summarize(self, top_n=5):
        summarize_text = []

        # Step 1 - text preprocessing
        plot_sentences, plot_pre_sentences = self.__preprocess_text(self.plot)

        # Step 2 - Generate similarity matrix across sentences
        sentence_similarity_martix = self.__build_similarity_matrix(
            plot_pre_sentences)

        print(sentence_similarity_martix)
        # Step 3 - Rank sentences in the similarity matrix
        sentence_similarity_graph = nx.from_numpy_array(
            sentence_similarity_martix)
        plot_scores = nx.pagerank(sentence_similarity_graph)

        # Step 4 - Sort the rank and pick top sentences
        ranked_sentence = []
        for i in range(len(plot_scores)):
            ranked_sentence.append([plot_scores[i], plot_sentences[i], i])

        ranked_sentence.sort(key=lambda x: x[0], reverse=True)
        top_n = min(top_n, len(plot_sentences))
        summary = ranked_sentence[0:top_n]
        summary.sort(key=lambda x: x[2])
        summary = [i[1] for i in summary]
        summarize_text = ""
        for i in range(top_n):
            summarize_text += "".join(summary[i]) + ". "

        # Step 5 - Finally, output the summarized text
        return summarize_text

    @staticmethod
    def generate_from_file(title, plotfilepath, synopsisfilepath):
        plot = ""
        synopsis = ""
        with open(plotfilepath, "r") as plot_file:
            plot = plot_file.read()
        with open(synopsisfilepath, "r") as synopsis_file:
            synopsis = synopsis_file.read()

        ts = TextSummarizer(title, plot, synopsis)
        return ts.summarize()
Code Example #25
while (True):
    print("Enter query keyword:")
    init_query = input()
    query = init_query

    # without query expansion

    print("==== Without query expansion ====")
    query = query.lower()
    remove_punctuation_map = dict(
        (ord(char), None) for char in string.punctuation)
    query = query.translate(remove_punctuation_map)
    query = stopword.remove(query)
    query = query.split()
    query = [stemmer.stem(x) for x in query]
    print("Query used: " + ' '.join(query))

    # process the query

    print("Processing query...")
    max_result = []
    x = [' '.join(query)]
    paper_tfidf = vectorizer.fit_transform(x + processed_paper)
    q = paper_tfidf[0]
    result = cosine_similarity(paper_tfidf, q)
    idx = np.argsort(-result, axis=0).flatten()
    final = [[num, y[0], x] for num, y in enumerate(result) if y[0] > 0.0]
    max_result += final
    max_result = sorted(max_result, key=lambda x: x[1], reverse=True)
    set_result = set()
Code Example #26
File: score.py Project: nafladiva/search-engine
def index(hashs, terms):
    for word in terms:
        if word in hashs:
            hashs[word] += 1
        else:
            hashs[word] = 1    

print('Indexing ...')
for path in sorted(IN_DIR.glob('*/*.html')):
    with open(path.resolve(), 'r', encoding='utf-8') as file:
        df[path.name] = dict()

        content = get_text(['title', 'top', 'middle', 'bottom'], file.read())
        content = content.translate(str.maketrans('','', punctuation))
        content = stopword.remove(content)
        terms = stemmer.stem(content.lower()).split()

        index(df[path.name], terms)
        index(tf, terms)
print('Indexing done!\n')

print('Calculating idf for terms...')
for term, freq in tf.items():
    df_i = 0
    for doc, tf_doc in df.items():
        df_i += 1 if term in tf_doc else 0
    idf[term] = (1 + math.log2(len(df)/df_i)) if df_i != 0 else 1
print('Calculated!\n')

with open(BASE_DIR / 'words_score.txt', 'w', encoding='utf-8') as file:
    print('Writing words score to text file ...')
Code Example #27
    docs_x = []
    docs_y = []

    for intent in data["intents"]:
        #print(intent)
        for pattern in intent["patterns"]:
            wrds = nltk.word_tokenize(pattern)
            print(wrds)
            words.extend(wrds)
            docs_x.append(wrds)
            docs_y.append(intent["tag"])

        if intent["tag"] not in labels:
            labels.append(intent["tag"])

    words = [stemmer.stem(w.lower()) for w in words if w != "?"]
    words = sorted(list(set(words)))
    print(words)

    labels = sorted(labels)

    training = []
    output = []

    out_empty = [0 for _ in range(len(labels))]

    for x, doc in enumerate(docs_x):
        bag = []

        wrds = [stemmer.stem(w) for w in doc]
Code Example #28
class Normalizer:
    id_mapper = None
    en_mapper = None
    id_words = None
    en_words = None

    def __init__(self, id_mapper, en_mapper, id_words, \
                 en_words, contracted_words_mapper):
        self.id_mapper = id_mapper
        self.en_mapper = en_mapper
        self.id_words = id_words
        self.en_words = en_words
        self.contracted_words_mapper = contracted_words_mapper
        self.en_stemmer = PorterStemmer()
        self.id_stemmer = StemmerFactory().create_stemmer()
    
    def _lookup_id(self, token):
        stemmed_token = self.id_stemmer.stem(token)
        if self.id_words and (stemmed_token in \
                              self.id_words or token in self.id_words):
            return token
        if self.id_mapper and token in self.id_mapper:
            return self.id_mapper[token]
        return None
    
    def _lookup_en(self, token):
        stemmed_token = self.en_stemmer.stem(token)
        if self.en_words and (stemmed_token in \
                                self.en_words or token in self.en_words):
            return token
        if self.en_mapper and token in self.en_mapper:
            return self.en_mapper[token]
        return None
    
    def _normalize_id(self, token):
        # handle case #6: reduplication written with '2' (e.g. 'jalan2' -> 'jalan-jalan')
        if '2' in token and token.index('2') != 0:
            unnormalized_singulars = token.split('2')
            norm_sing_1 = self._lookup_id(unnormalized_singulars[0])
            norm_sing_2 = self._lookup_id(unnormalized_singulars[1])
            if not norm_sing_1:
                norm_sing_1 = unnormalized_singulars[0]
            if not norm_sing_2:
                norm_sing_2 = unnormalized_singulars[1]
            res = f'{norm_sing_1}-{norm_sing_1}'
            if norm_sing_2:
                res += norm_sing_2
            return res

        return self._lookup_id(token)
    
    def _normalize_en(self, token):
        # Handle case #9, affix nge-
        if token.startswith('nge'):
            token = token[3:]
            return self._lookup_en(token)
        
        # Handle case #10, affix -nya
        elif token.endswith('nya'):
            token = token[:-3]
            norm_token = self._lookup_en(token)
            if norm_token is not None:
                return f'the {norm_token}'
            else:
                return f'the {token}'

        token = self._lookup_en(token)

        # Handle case #8, contracted words
        if token in self.contracted_words_mapper:
            token = self.contracted_words_mapper[token]
        
        return token
    
    def _normalize(self, word, lang):
        res = None
        if lang == 'id':
            res = self._normalize_id(word)
        if lang == 'en':
            res = self._normalize_en(word)
        if res:
            return res
        return None

    def normalize(self, token, lang='un'):
        if not is_word(token):
            return token

        token = remove_duplication(token).lower()

        # Handle multiple words
        if ' ' in token:
            unnormalized_singulars = token.split(' ')
            normalized_singulars = []
            for sing in unnormalized_singulars:
                norm = self.normalize(sing, lang)
                normalized_singulars.append(norm if norm else sing)
            
            if normalized_singulars[0] == normalized_singulars[1]:
                return '-'.join(normalized_singulars)
            else:
                return ' '.join(normalized_singulars)

        possible_words = [token]
        possible_words.extend(self._generate_all_words(token))

        for word in possible_words:
            res = self._normalize(word, lang)
            if res:
                return res
        
        return token

    
    def _generate_all_words(self, token):
        single_token = ''
        ids = []
        for idx, char in enumerate(token):
            if idx > 0 and token[idx-1] == char:
                ids.append(len(single_token) - 1)
            else:
                single_token += char
        
        possible_words = []
        max_iter = 1 << len(ids)
        len_ids = len(ids)

        for i in range(0, max_iter):
            bin_ids = '{:08b}'.format(i).lstrip('0')

            appeared_id = []

            for idx, bin_id in enumerate(bin_ids):
                if bin_id == '1':
                    appeared_id.append(ids[idx])
            
            word = ''

            for idx, char in enumerate(single_token):
                word += char
                if idx in appeared_id:
                    word += char
            
            possible_words.append(word)
        
        return possible_words
Code Example #29
    paper = pickle.load(f)
print("Preprocessing..")
factory = StopWordRemoverFactory()
stopword = factory.create_stop_word_remover()
stemmer = StemmerFactory().create_stemmer()
words = []
processed_paper = []
for num, x in enumerate(paper):
    text = x[2]
    text = text.lower()
    remove_punctuation_map = dict(
        (ord(char), None) for char in string.punctuation)
    text = text.translate(remove_punctuation_map)
    text = stopword.remove(text)
    text = text.split()
    text = [stemmer.stem(x) for x in text]
    processed_paper.append(' '.join(text))
    text = list(set(text))
    words += text
    print("Paper " + str(num + 1) + " done.")
print("Done processing.")

# save results to 'corpus/processed_paper.xlsx'

print("Saving data to corpus/processed_paper.xlsx..")
df = pd.DataFrame(processed_paper)
df.to_excel('corpus/processed_paper.xlsx', header=False, index=False)
print("Success.")

# save results to 'pickle/processed_paper.pkl'
Code Example #30
# Preprocess data
for intent in intents['intents']:
    for pattern in intent['input_patterns']:

        # Word tokenization
        pattern = nltk.word_tokenize(pattern)
        # Case folding
        pattern = [word.lower() for word in pattern]
        # Filtering
        pattern = [
            word for word in pattern if word not in stopwords
            and word.isalpha() and word not in string.punctuation
        ]
        # Stemming
        pattern = [stemmer.stem(word) for word in pattern]

        # insert to words list
        words.extend(pattern)

        # add doc in corpus
        documents.append((pattern, intent['tag']))

        # add tag to class list
        if intent['tag'] not in classes:
            classes.append(intent['tag'])

# Sort words and classes
words = sorted(list(set(words)))
classes = sorted(list(set(classes)))