Code example #1
    def getLemmatization(example_sent):
        # Tokenize the sentence, then strip Arabic suffixes from each token
        # with ISRIStemmer.suf32().
        word_tokens = Tokenization.tokenizationProcess(example_sent)
        stemmer = nltk.ISRIStemmer()
        filtered_sentence = []

        for w in word_tokens:
            filtered_sentence.append(stemmer.suf32(w))
        return filtered_sentence
Code example #2
def defineStemming(example_sent):
    # Tokenize the sentence, then reduce each token to its root with
    # ISRIStemmer.stem().
    word_tokens = Tokenization.tokenizationProcess(example_sent)
    filtered_sentence = []
    ps = nltk.ISRIStemmer()
    for w in word_tokens:
        filtered_sentence.append(ps.stem(w))
    return filtered_sentence
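Examples #1 and #2 differ only in which ISRIStemmer method they call: suf32() strips common length-two and length-three suffixes, while stem() runs the full root-extraction pipeline. A minimal standalone sketch of the difference (not taken from either project; the sample word is only an illustration):

import nltk

st = nltk.ISRIStemmer()
word = 'المعلمون'        # "the teachers" -- illustrative sample word
print(st.suf32(word))    # suffix stripping only
print(st.stem(word))     # full ISRI stemming down to the root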
Code example #3
import re
import string
import nltk
from nltk.corpus import wordnet as wn

class Normalizer:

    # One stopword per line in res/stop_words.txt; splitting into a list lets
    # remove_stopwords() test whole tokens rather than substrings.
    stopword_list = open('res/stop_words.txt', 'r').read().splitlines()
    stemmer = nltk.ISRIStemmer()

    def tokenize_text(self,text):
        tokens = nltk.word_tokenize(text)
        tokens = [token.strip() for token in tokens]
        return tokens

    def stemming_text(self,text):
        tokens = self.tokenize_text(text)
        filtered_tokens = [self.stemmer.stem(token) for token in tokens]
        filtered_text = ' '.join(filtered_tokens)
        return filtered_text

    def remove_special_characters(self,text):
        # Strip a leading Arabic conjunction "و" from long tokens, then remove
        # punctuation and digits from every token.
        def remove_conjunction(token):
            if token and token[0] == 'و' and len(token) > 4:
                return token[1:]
            else:
                return token
        tokens = self.tokenize_text(text)
        pattern = re.compile(r'[?.:;,"$&*%@()~\d]')
        filtered_tokens = filter(None, [pattern.sub('', remove_conjunction(token)) for token in tokens])
        filtered_text = ' '.join(filtered_tokens)
        return filtered_text

    def remove_stopwords(self,text):
        tokens = self.tokenize_text(text)
        filtered_tokens = [token for token in tokens if token not in self.stopword_list]
        filtered_text = ' '.join(filtered_tokens)
        return filtered_text

    def remove_repeated_characters(self,text):
        tokens = self.tokenize_text(text)
        repeat_pattern = re.compile(r'(\w*)(\w)\2(\w*)')
        match_substitution = r'\1\2\3'
        def replace(old_word):
            if wn.synsets(old_word):
                return old_word
            new_word = repeat_pattern.sub(match_substitution, old_word)
            return replace(new_word) if new_word != old_word else new_word

        correct_tokens = [replace(word) for word in tokens]
        filtered_text = ' '.join(correct_tokens)
        return filtered_text

    def normalize_corpus(self,corpus):
        normalized_corpus = []
        for text in corpus:
            text = self.remove_special_characters(text)
            text = self.remove_stopwords(text)
            text = self.remove_repeated_characters(text)
            text = self.stemming_text(text)
            text = self.remove_stopwords(text)
            normalized_corpus.append(text)
        return normalized_corpus
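For orientation, a minimal usage sketch of the Normalizer class above (a hypothetical call, assuming res/stop_words.txt exists and the NLTK punkt and wordnet data have been downloaded; the sample sentences are placeholders):

normalizer = Normalizer()
sample_corpus = ['هذا نص تجريبي قصير', 'وهذا نص آخر للتجربة']  # placeholder sentences
for cleaned in normalizer.normalize_corpus(sample_corpus):
    print(cleaned)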
Code example #4
    def getLemmatization(example_sent):
        # Despite the name, this applies the ISRI suffix stripper (suf32) to
        # each token; the WordNetLemmatizer instance below is never actually used.
        lemmatizer = WordNetLemmatizer()
        word_tokens = Tokenization.tokenizationProcess(example_sent)
        stemmer = nltk.ISRIStemmer()
        lemmatized = []
        for word in word_tokens:
            lemmatized.append(stemmer.suf32(word))

        return lemmatized
Code example #5
def resolve_txt(root, info):
    # Pull the text to process from the request context.
    text = info.context.get('txt')  #.encode('utf8')
    # Remove punctuation, digit-bearing words, quote characters, and a few
    # Arabic punctuation marks.
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    text = re.sub('[‘’“”«»…]', '', text)
    text = re.sub('[-_،؟]', '', text)
    lemmatizer = WordNetLemmatizer()  # instantiated but not used below
    word_tokens = word_tokenize(text)
    # Strip Arabic suffixes from each token with ISRIStemmer.suf32().
    stemmer = nltk.ISRIStemmer()
    lemmatized = []
    for word in word_tokens:
        lemmatized.append(stemmer.suf32(word))
    return {"result": lemmatized}
Code example #6
File: Arabycia.py  Project: tarzikorichi/Arabycia
    def __init__(self, raw_data=None):
        # Morphological analyzer, Arabic (ISRI) stemmer, English lemmatizer,
        # and Punkt sentence segmenter used by the pipeline below.
        self.analyzer = pyaramorph.Analyzer()
        self.stemmer = nltk.ISRIStemmer()
        self.lemmatizer = nltk.WordNetLemmatizer()
        self.segmenter = nltk.data.load("tokenizers/punkt/english.pickle")

        if raw_data is not None:
            self.raw_data = raw_data
            self.org_data = raw_data

        self.analyze_text()
        self.ambig()
        self.load_corpus('Tashkeela')
        self.select_cand()
        self.print_result()
Code example #7
def root(word):
    # Tokenize the input and return the ISRI root of every token.
    st = nltk.ISRIStemmer()
    tokens = nltk.word_tokenize(word)
    result = [st.stem(w) for w in tokens]
    return result
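A quick call sketch for the helper above (assuming nltk is imported and the punkt tokenizer data is installed; the sample phrase is only an illustration):

print(root('الكتاب على الطاولة'))  # prints a list with one ISRI root per token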
Code example #8
import re  # for pre-processing text
import string  # for pre-processing text
import nltk  # for processing texts
from nltk.corpus import stopwords

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# define Arabic punctuation marks
arabic_punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
english_punctuations = string.punctuation
punctuations_list = arabic_punctuations + english_punctuations
st = nltk.ISRIStemmer()


def clean_text(text):
    """
    DESCRIPTION:
        Clean the given text.
    INPUT:
        text: string
    OUTPUT:
        text: the cleaned string
    """
    # remove English letters
    text = re.sub("[a-zA-Z]", " ", str(text))
    # remove newlines
    text = re.sub('\n', ' ', text)
    # remove numbers
    text = re.sub(r'\d+', '', text)
Code example #9
File: Arabycia.py  Project: urantialife/Arabycia
	def __init__(self):
		self.analyzer = pyaramorph.Analyzer()
		self.stemmer = nltk.ISRIStemmer()
		self.lemmatizer = nltk.WordNetLemmatizer()
		self.segmenter = nltk.data.load("tokenizers/punkt/english.pickle")