Code Example #1
from nltk.stem.isri import ISRIStemmer

def lightStemAr(word_list):
    result = []
    arstemmer = ISRIStemmer()
    for word in word_list:
        word = arstemmer.norm(word, num=1)       # remove diacritics representing Arabic short vowels
        if word not in arstemmer.stop_words:     # exclude stop words from further processing
            word = arstemmer.pre32(word)         # remove length-three and length-two prefixes, in that order
            word = arstemmer.suf32(word)         # remove length-three and length-two suffixes, in that order
            word = arstemmer.waw(word)           # remove connective ‘و’ if it precedes a word beginning with ‘و’
            word = arstemmer.norm(word, num=2)   # normalize initial hamza to bare alif
        result.append(word)                      # stop words are kept, but only normalized
    return ' '.join(result)
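A minimal usage sketch for lightStemAr, assuming NLTK is installed; the Arabic sentence below is purely illustrative.

tokens = "ذهب الطلاب إلى المكتبة".split()  # lightStemAr expects a pre-tokenized word list
print(lightStemAr(tokens))                 # prints the normalized, light-stemmed tokens joined by spaces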
Code Example #2
from nltk.stem.isri import ISRIStemmer

def light_stem(text):
    words = text.split()                  # tokenize on whitespace (iterating the raw string would process single characters)
    result = list()
    stemmer = ISRIStemmer()
    for word in words:
        word = stemmer.norm(word, num=1)      # remove diacritics representing Arabic short vowels
        if word not in stemmer.stop_words:    # stop words are skipped entirely in this variant
            word = stemmer.pre32(word)        # remove length-three and length-two prefixes
            word = stemmer.suf32(word)        # remove length-three and length-two suffixes
            word = stemmer.waw(word)          # remove connective ‘و’ before a word beginning with ‘و’
            word = stemmer.norm(word, num=2)  # normalize initial hamza to bare alif
            result.append(word)               # note: unlike Code Example #1, stop words are dropped, not kept
    return ' '.join(result)
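One behavioral detail of this variant: because result.append(word) sits inside the stop-word check, any token found in stemmer.stop_words is dropped from the output instead of being kept as in Code Example #1. The built-in list can be inspected directly to see exactly which tokens are affected:

from nltk.stem.isri import ISRIStemmer
print(ISRIStemmer().stop_words)  # the tokens this variant silently drops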
Code Example #3
def read_data_from_file(filename, number_of_classes):
    """Read and prepare a training or testing dataset by separating tweets from their labels."""
    # number_of_classes is accepted but not used in this snippet
    tweets = []  # list of text samples
    labels = []  # list of per-tweet label vectors

    istemmer = ISRIStemmer()
    read_file = open(filename, "r")  # the file is only read, so read-only mode is enough

    for line in read_file:
        filtered_line = line.split()

        # the last 11 whitespace-separated fields are the integer label columns
        label = list(map(int, filtered_line[-11:]))

        # everything between the first field (skipped here) and the label columns is the tweet text
        tweet = " ".join(filtered_line[1:-11])
        tweet = clean_str(tweet)  # clean_str: project-specific cleaning helper defined elsewhere

        tweet = istemmer.norm(tweet)  # default norm() removes diacritics and normalizes initial hamza

        tweets.append(tweet)
        labels.append(label)

    read_file.close()

    return [tweets, labels]
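The slicing above implies a particular line layout: a leading field (presumably an id), the tweet tokens, then exactly 11 integer label columns, all whitespace-separated. A small standalone sketch with a hypothetical line, only to make that assumed layout concrete:

sample_line = "123 هذا مثال قصير 0 1 0 0 0 0 0 0 0 0 1"  # hypothetical: id, three tweet tokens, 11 label columns
parts = sample_line.split()
print(" ".join(parts[1:-11]))       # the tweet text
print(list(map(int, parts[-11:])))  # the 11-element label vector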
Code Example #4
def read_data_from_file(filename):
    """Read and prepare a training or testing dataset by separating tweets from their labels."""
    tweets = []  # list of text samples
    labels = []  # list of label ids
    labels_index = {}  # dictionary mapping label name to numeric id

    istemmer = ISRIStemmer()
    read_file = open(filename, "r")  # the file is only read, so read-only mode is enough

    index = 0
    for line in read_file:
        parts = line.split('\t')  # each line holds a label, a tab, then the quoted tweet

        label = parts[0]
        tweet = parts[1].strip(" \"")

        tweet = clean_str(tweet)  # clean_str: project-specific cleaning helper defined elsewhere
        tweet = istemmer.norm(tweet)  # default norm() removes diacritics and normalizes initial hamza

        if label not in labels_index:  # assign label ids in order of first appearance
            labels_index[label] = index
            index += 1

        tweets.append(tweet)
        labels.append(labels_index[label])

    read_file.close()

    return [tweets, labels]
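One detail worth illustrating: labels_index assigns numeric ids to label names in order of first appearance. A tiny standalone sketch with hypothetical label values:

labels_index = {}
for label in ["positive", "negative", "positive", "neutral"]:  # hypothetical label column values
    labels_index.setdefault(label, len(labels_index))
print(labels_index)  # {'positive': 0, 'negative': 1, 'neutral': 2}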
Code Example #5
File: filters.py  Project: mitll/LiLAC
def arabic_social_media_text_filter(txt, debug=0):
    """
    Filter Arabic text coming from social media.

    :param txt: utf-8 text, unicode
    :param debug: any value greater than 0 prints messages comparing normalized and original text
    :return: the filtered, normalized text
    """
    txt = social_media_text_filter(txt, debug=debug)
    # Remove diacritics (norm() with its default argument also normalizes an initial hamza)
    st = ISRIStemmer()
    txt = st.norm(txt)
    return txt
Code Example #6
from nltk.stem.isri import ISRIStemmer

def light_stem(text):
    words = text.split()
    result = list()
    stemmer = ISRIStemmer()
    for word in words:
        word = stemmer.norm(word, num=1)      # remove diacritics representing Arabic short vowels
        if word not in stemmer.stop_words:    # exclude stop words from further processing
            word = stemmer.pre32(word)        # remove length-three and length-two prefixes, in that order
            word = stemmer.suf32(word)        # remove length-three and length-two suffixes, in that order
            word = stemmer.waw(word)          # remove connective ‘و’ if it precedes a word beginning with ‘و’
            word = stemmer.norm(word, num=2)  # normalize initial hamza to bare alif
#             word = stemmer.pro_w4(word)     # process length-four patterns and extract length-three roots
#             word = stemmer.pro_w53(word)    # process length-five patterns and extract length-three roots
#             word = stemmer.pro_w54(word)    # process length-five patterns and extract length-four roots
#             word = stemmer.end_w5(word)     # ending step (word of length five)
#             word = stemmer.pro_w6(word)     # process length-six patterns and extract length-three roots
#             word = stemmer.pro_w64(word)    # process length-six patterns and extract length-four roots
#             word = stemmer.end_w6(word)     # ending step (word of length six)
#             word = stemmer.suf1(word)       # normalize short suffix
#             word = stemmer.pre1(word)       # normalize short prefix

        result.append(word)
    return ' '.join(result)
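For contrast, the commented-out calls above are the steps of full ISRI root extraction, which NLTK already packages as ISRIStemmer.stem(). A brief sketch comparing the light stem produced by light_stem with the full ISRI stem; the sample word is illustrative and NLTK is assumed to be installed:

from nltk.stem.isri import ISRIStemmer

stemmer = ISRIStemmer()
word = "المكتبات"          # illustrative word
print(light_stem(word))    # light stemming: strips affixes, keeps the word pattern
print(stemmer.stem(word))  # full ISRI stemming: reduces toward the (typically triliteral) root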
Code Example #7
class BasicStemmer(Stemmer):
    def __init__(self):
        self.stemmer = ISRIStemmer()
        self.stopWordsIndex = ArabicStopWordsIndex(self)
        self.stopWordsIndex.buildIndex()

    def getStems(self, tokens, flag=False):

        rootList = []

        for token in tokens:
            #token=stemmer.norm(token)
            root = self.stemmer.pre32(token)
            rootList.append(root)
            print(token, "  :  ", root)

        return rootList

    def stem(self, word):
        root = self.stemmer.pre32(word)
        root = self.stemmer.norm(root, 3)

        return root

    def loadStemsDictionnary(self, filePath="dictStems.txt"):
        # Parse a stem dictionary: lines starting with ';' are comments; data lines are
        # tab-separated into <root/stem key>, <stem>, <POS tag>, <';'-separated English glosses>.
        lines = open(filePath, "r", encoding="windows-1256").readlines()
        dictionary = nltk.defaultdict(list)
        for line in lines:
            if not re.match("^;.*", line):
                parts = line.split('\t')
                if len(parts) != 4:
                    break
                else:
                    [rootStem, stem, tag, enGloss] = parts
                    dictionary[rootStem].append(
                        [stem, tag, ' '.join(enGloss.split(';'))])

        return dictionary

    def verify(self, word):
        # True if the word is indexed as an Arabic stop word, False otherwise
        return bool(self.stopWordsIndex.access(word))

    def setStopWordsIndex(self, index: ArabicStopWordsIndex):
        self.stopWordsIndex = index
        self.stopWordsIndex.buildIndex()
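loadStemsDictionnary expects a windows-1256 dictionary file whose data lines split on tabs into exactly four fields, with ';'-prefixed comment lines. A hypothetical entry, parsed the same way as in the method above, just to make the assumed format visible:

line = "كتب\tكاتب\tNOUN\twriter;author"  # hypothetical entry: key, stem, POS tag, ';'-separated glosses
rootStem, stem, tag, enGloss = line.split('\t')
print(rootStem, stem, tag, ' '.join(enGloss.split(';')))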
Code Example #8
bigram_data = [bigrams_model[doc] for doc in no_stopwords_data]
print(bigram_data[0:2])

############# (5) Lemmatizing the data #############
# Produces a list of lists of lemmatized tokens. The ISRI stemmer is used as a light
# lemmatizer here; it does not always handle suffixes well.
stemmer = ISRIStemmer()

lemmatized_data = []
for items in bigram_data:
    lemmas = []
    for token in items:
        token = stemmer.pre32(token)        # removes length-three and length-two prefixes
        token = stemmer.suf32(token)        # removes length-three and length-two suffixes
        token = stemmer.norm(token, num=1)  # removes diacritics
        lemmas.append(token)
    lemmatized_data.append(lemmas)
print(lemmatized_data[0:2])

############# (6) Preparing the data with gensim for the model #############
# The gensim preprocessing involves building the dictionary, the corpus and the bigrams.
# The input is lemmatized_data, a list of lists of tokens.

# The dictionary
dictionary = corpora.Dictionary(lemmatized_data)

# The corpus
corpus = [dictionary.doc2bow(d) for d in lemmatized_data]
print(corpus[0:2])
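The dictionary and corpus built above are the usual inputs to a gensim topic model. A sketch of that next step, assuming the intended model is LDA; num_topics and the other hyperparameters are illustrative choices, not values from the original script:

from gensim import models

lda = models.LdaModel(corpus=corpus, id2word=dictionary, num_topics=10, passes=5, random_state=42)
for topic_id, topic in lda.print_topics(num_words=8):
    print(topic_id, topic)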
Code Example #9
from nltk.stem.isri import ISRIStemmer

def remove_diacritics(text):
    arstemmer = ISRIStemmer()
    result = arstemmer.norm(text, num=1)  # remove diacritics representing Arabic short vowels
    return result
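A small usage sketch for remove_diacritics, assuming NLTK is installed; the diacritized word is illustrative:

print(remove_diacritics("مُحَمَّدٌ"))  # prints the word with its diacritics stripped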
Code Example #10
def remove_diacritics(text):
    arstemmer = ISRIStemmer()
    result = arstemmer.norm(text, num=1)  # remove diacritics representing Arabic short vowels
    return result