Example #1
    def clean(self, strng):
        """
        Clean a string (or each string in a list) of redundant
        whitespace and tatweel characters.
        """
        if isinstance(strng, str):
            strng = araby.strip_tatweel(strng)
            return re.sub(r'\s+', ' ', strng).strip()
        elif isinstance(strng, list):
            cleaned = [re.sub(r'\s+', ' ', s).strip() for s in strng]
            return [araby.strip_tatweel(s) for s in cleaned]
        else:
            return strng
    def normalizeText(self, text):
        normalized_text = araby.strip_tatweel(text)
        normalized_text = araby.strip_tashkeel(normalized_text)
        normalized_text = araby.strip_harakat(normalized_text)
        normalized_text = araby.normalize_hamza(normalized_text)

        return normalized_text
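A minimal standalone sketch of the same normalization chain, assuming pyarabic is installed (the sample word is made up):

```python
# Minimal usage sketch of the helpers chained in normalizeText above;
# assumes pyarabic is installed, sample word is made up.
import pyarabic.araby as araby

sample = u"العَـــرَبِيَّةُ"
text = araby.strip_tatweel(sample)    # remove tatweel padding
text = araby.strip_tashkeel(text)     # remove all diacritics
text = araby.normalize_hamza(text)    # unify hamza forms
print(text)  # -> العربية
```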
Example #3
def grouping_letter_diacritics(sentance):
    """Grouping each letter with its diacritics.

        Args:
            sentance: str

        Returns:
            [str]: a list of _x_, where _x_ is the letter accompanied with its
            diacritics.

    Example:
    ```python
    q.grouping_letter_diacritics('إِنَّا أَعْطَيْنَكَ الْكَوْثَرَ')
    >>> ['إِ', 'نَّ', 'ا', ' ', 'أَ', 'عْ', 'طَ', 'يْ', 'نَ', 'كَ', ' ', 'ا', 'لْ', 'كَ', 'وْ', 'ثَ', 'رَ']
    ```
    """
    sentance_without_tatweel = strip_tatweel(sentance)
    hroof_with_tashkeel = []
    index = 0
    while index < len(sentance_without_tatweel):
        char = sentance_without_tatweel[index]
        if (char in alphabet or char in alefat or char in hamzat
                or char == ' '):
            k = index
            harf_with_taskeel = char
            # absorb every diacritic that follows the letter
            while ((k + 1) != len(sentance_without_tatweel)
                   and (sentance_without_tatweel[k + 1] in tashkeel
                        or sentance_without_tatweel[k + 1] in harakat
                        or sentance_without_tatweel[k + 1] in shortharakat
                        or sentance_without_tatweel[k + 1] in tanwin)):
                harf_with_taskeel += sentance_without_tatweel[k + 1]
                k += 1
            index = k
            hroof_with_tashkeel.append(harf_with_taskeel)
        index += 1
    return hroof_with_tashkeel
 def lookup(self, word):
     """
     Lookup if the word is correct or not
     @param word: input word
     @type  word: unicode
     @return: True if word exists else False
     @rtype: Boolean
     """
     if not word: 
         return True
     if word.isdigit():
         return True
     for c in word:
         if c in string.punctuation:
             return True
      # check whether the word has been looked up before
      # (the cached value may be True or False)
     if word in self.worddict:
         test = self.worddict.get(word, False)
     else:
          # the word has not been checked yet; analyze it
         word = araby.strip_tatweel(word)
         self.stemmer.segment(word)        
         # extract the affix 
         stem = self.stemmer.get_stem()
         affix = u"-".join([self.stemmer.get_prefix(), self.stemmer.get_suffix()])
         # lookup in the database
         test = self.database.lookup(word, stem, affix)
         self.worddict[word] = test
     return test
Example #5
def extract_poem():
    """[summary]
    might not be using it
    Returns:
        [type] -- [description]
        dict    --  postings lists of arabic words
    """

    postings_list = {}
    tokens = []
    with open("short story.txt", encoding='utf-8') as ofile:

        for loc, line in enumerate(ofile, 1):

            words = araby.tokenize(araby.strip_tashkeel(line))
            tokens.extend(words)
            for word in words:
                # strip_tatweel is a no-op for words without tatweel;
                # araby.is_tatweel() tests a single character, not a word
                word = araby.strip_tatweel(word)

                # if word not in postings_list:
                #     postings_list[word] = [loc]
                # else:
                #     postings_list[word].append(loc)

    # return postings_list
    return tokens
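For reference, a working version of the postings list that the commented-out lines sketch (an illustration using the snippet's own names, not the original author's final code):

```python
# Illustrative completion of the commented-out postings list above;
# maps each token to the line numbers where it occurs.
import pyarabic.araby as araby

postings_list = {}
with open("short story.txt", encoding='utf-8') as ofile:
    for loc, line in enumerate(ofile, 1):
        for word in araby.tokenize(araby.strip_tashkeel(line)):
            word = araby.strip_tatweel(word)
            postings_list.setdefault(word, []).append(loc)
```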
Example #6
    def lookup(self, word):
        """
        Lookup if the word is correct or not
        @param word: input word
        @type  word: unicode
        @return: True if word exists else False
        @rtype: Boolean
        """
        if not word:
            return True
        # check whether the word has been looked up before
        # (the cached value may be True or False)
        if word in self.worddict:
            test = self.worddict.get(word, False)
        else:
            # the word has not been checked yet; analyze it
            word = araby.strip_tatweel(word)
            if word.startswith(araby.WAW) or word.startswith(araby.FEH):
                conjonction = word[0]
                word = word[1:]
            else:
                conjonction = u""

            self.stemmer.segment(word)
            # extract the affix
            stem = self.stemmer.get_stem()
            affix = u"-".join(
                [self.stemmer.get_prefix(),
                 self.stemmer.get_suffix()])
            # lookup in the database
            test = self.database.lookup(word, stem, affix, conjonction)
            self.worddict[word] = test
        return test
Example #7
 def lookup(self, word):
     """
     Lookup if the word is correct or not
     @param word: input word
     @type  word: unicode
     @return: True if word exists else False
     @rtype: Boolean
     """
     if not word: 
         return True
      # check whether the word has been looked up before
      # (the cached value may be True or False)
     if word in self.worddict:
         test = self.worddict.get(word, False)
     else:
          # the word has not been checked yet; analyze it
         word = araby.strip_tatweel(word)
         self.stemmer.segment(word)        
         # extract the affix 
         stem = self.stemmer.get_stem()
         affix = u"-".join([self.stemmer.get_prefix(), self.stemmer.get_suffix()])
         # lookup in the database
         test = self.database.lookup(word, stem, affix)
         self.worddict[word] = test
     return test
Example #8
    def normalize_arabic(self, text):
        text = re.sub(r'\bال(\w\w+)', r'\1', text)  # remove the definite article (ال التعريف)
        text = re.sub("[إأآاٱ]", "ا", text)  # unify alef variants
        text = re.sub("ى", "ي", text)  # alef maksura -> yeh
        text = re.sub("ة", "ه", text)  # replace teh marbuta with heh
        text = re.sub("گ", "ك", text)  # Persian gaf -> kaf
        text = strip_tatweel(text)  # remove tatweel
        text = strip_tashkeel(text)  # remove tashkeel

        return text
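A quick standalone check of the definite-article rule above: `ال` is removed only when at least two word characters follow it (the sample text is made up):

```python
# Standalone check of the definite-article regex used above.
import re

print(re.sub(r'\bال(\w\w+)', r'\1', u"الكتاب في البيت"))  # -> كتاب في بيت
```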
 def preprocess(sentence):
     sentence = araby.strip_tatweel(sentence)
     sentence = sentence.replace(
         araby.SMALL_ALEF+araby.ALEF_MAKSURA, araby.ALEF_MAKSURA)
     sentence = sentence.replace(
         araby.ALEF_MAKSURA+araby.SMALL_ALEF, araby.ALEF_MAKSURA)
     sentence = re.sub(ALEFAT_PATTERN, araby.ALEF, sentence)
     sentence = araby.normalize_ligature(sentence)
     sentence = araby.normalize_teh(sentence)
     sentence = araby.strip_tashkeel(sentence)
     sentence = re.sub(r'[^\d\w]', r' ', sentence)  # replace punctuation/symbols with spaces
     sentence = re.sub(r'( ){2,}', r'\1', sentence)  # collapse runs of spaces
     return sentence
Example #10
 def word_tagging(self, word_list):
     """
      Guess word classification into verb, noun, or stop word,
      and return a list of guessed tags.
      @param word_list: the given word list.
      @type word_list: unicode list.
      @return: a tag list: 't': tool, 'v': verb,
      'n': noun, 'nv' or 'vn': unidentified.
     @rtype: unicode list 
     """
     if len(word_list) == 0:
         return []
     else:
         list_result = []
         previous = u""
         second_previous = u""  # the second previous
         #~ previous_tag  = ""
         for word in word_list:
             word_nm = araby.strip_tashkeel(word)
             word_nm = araby.strip_tatweel(word_nm)
             tag = ''
              if word in self.cache:
                 tag = self.cache.get(word, '')
             else:
                 if self.is_stopword(word_nm):
                     tag = 't'
                 else:
                     if self.is_noun(word):
                         tag += 'n'
                     if self.is_verb(word):
                         tag += 'v'
                 # add the found tag to Cache.
                 self.cache[word] = tag
              # if the tagging gives an ambiguous tag,
              # we can do a contextual analysis;
              # the contextual tag is not saved in the cache,
              # because it can be ambiguous.
             # for example
             # في ضرب : is a noun
             # قد ضرب : is a verb
             if tag in ("", "vn", "nv"):
                 tag = self.context_analyse(previous, word) + "3"
                 if tag in ("", "1", "vn1", "nv1"):
                     tag = self.context_analyse(
                         u" ".join([second_previous, previous]), word) + "2"
             list_result.append(tag)
             second_previous = previous
             previous = word_nm
             #~ previous_tag  = tag
         return list_result
Example #11
def normalize(word, wordtype="affix"):
    """
    Normalize the word, by unifoming hamzat, Alef madda, shadda, and lamalefs.
    @param word: given word.
    @type word: unicode.
    @param type: if the word is an affix
    @type type: unicode.
    @return: converted word.
    @rtype: unicode.
    """
    # تحويل الكلمة إلى شكلها النظري.
    # الشكل اللإملائي للكلمة هو طريقة كتابتها حسب قواعد الإملاء
    # الشكل النظري هو الشكل المتخيل للكلمة دون تطبيق قواعد اللغة
    # ويخص عادة الأشكال المتعددة للهمزة، و التي تكتب همزة على السطر
    # أمثلة
    # إملائي        نظري
    #إِمْلَائِي        ءِمْلَاءِي
    #سَاَلَ        سَءَلَ
    # الهدف : تحويل الكلمة إلى شكل نظري،
    #ومن ثم إمكانية تصريفها بعيدا عن قواعد الإملاء،
    #وبعد التصريف يتم تطبيق قواعد الإملاء من جديد.
    #الفرضية: الكلمات المدخلة مشكولة شكلا تاما.
    #الطريقة:
    # 1-تحويل جميع أنواع الهمزات إلى همزة على السطر
    # 1-فك الإدغام
    i = 0
    #   strip tatweel
    # the tatweel is used to uniformate the affix
    # when the Haraka is used separetely
    if wordtype != "affix":
        word = araby.strip_tatweel(word)


## تستبدل الألف الممدودة في , ل الكلمة بهمزة قطع بعدها همزة أخرى
    if word.startswith(ALEF_MADDA):
        word = normalize_alef_madda(word)

    # ignore harakat at the begin of the word
    len_word = len(word)
    while i < len_word and araby.is_shortharaka(word[i]):  # in HARAKAT:
        i += 1
    word = word[i:]
    # convert all Hamza from into one form
    word = araby.normalize_hamza(word)
    #Convert All LAM ALEF Ligature into separate letters
    word = word.replace(LAM_ALEF, SIMPLE_LAM_ALEF)
    word = word.replace(LAM_ALEF_HAMZA_ABOVE, SIMPLE_LAM_ALEF_HAMZA_ABOVE)
    word = word.replace(LAM_ALEF_MADDA_ABOVE, SIMPLE_LAM_ALEF_MADDA_ABOVE)
    return word
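The hamza-unification step can be demonstrated on the first example pair from the comments above (a minimal sketch, assuming pyarabic is installed):

```python
# The orthographic example from the comments above, run through
# araby.normalize_hamza; assumes pyarabic is installed.
import pyarabic.araby as araby

print(araby.normalize_hamza(u"إِمْلَائِي"))  # expected: ءِمْلَاءِي
```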
Example #13
    def create_arabic_node(self, cluster_name, label, **kwargs):
        """
        Checks that label is an arabic string, removes tatweel and normalizes 
        ligatures. Adds unvocalized_label.
        
        """
        label = araby.normalize_ligature(araby.strip_tatweel(label))
        label = label.replace(araby.SMALL_ALEF, "")
        if not araby.is_arabicstring(label):
            raise RuntimeError("'%s' is not an Arabic string" % label)

        if "unvocalized_label" not in kwargs:
            kwargs["unvocalized_label"] = araby.strip_tashkeel(label)

        return self.create_node(cluster_name, label, **kwargs)
Example #14
    def test_strip(self):

        # strip_harakat(text):
        assert Araby.strip_harakat(u"الْعَرَبِيّةُ") == u'العربيّة'

        # strip_lastharaka(text)
        assert Araby.strip_lastharaka(u"الْعَرَبِيّةُ") == u'الْعَرَبِيّة'

        # strip_tashkeel(text)
        assert Araby.strip_tashkeel(u"الْعَرَبِيّةُ") == u'العربية'

        # strip_tatweel(text):
        assert Araby.strip_tatweel(u"العـــــربية") == u'العربية'

        # strip_shadda(text):
        assert Araby.strip_shadda(u"الشّمسيّة") == u'الشمسية'
def preprocess(sentences, stopwords, isStopword=False):
  """
    Takes an array of complete Arabic sentences and performs the following
    operations on all of them:
        1.) strip tashkeel
        2.) strip harakat
        3.) strip lastharaka
        4.) strip tatweel
        5.) strip shadda
        6.) normalize lam-alef ligatures
        7.) normalize hamza
        8.) tokenize

    Returns a 2D matrix where each row holds the normalized tokens of one
    sentence.
  """
  #print("SENTENCE INDEX!!!", sentences[0])
  output = []
  for sentence in sentences:
    #print("Before Preprocessing:"+ sentence)
    #print(sentence)
    text = araby.strip_harakat(sentence)
    #print("TEXT!!!!", text)
    text = araby.strip_tashkeel(text)
    text = araby.strip_lastharaka(text)
    text = araby.strip_tatweel(text)
    text = araby.strip_shadda(text)
    text = araby.normalize_ligature(text)
    text = araby.normalize_hamza(text)
    text = clean_str(text)
    #print("After Preprocessing:"+ text)
    #print("----")
    #print(text)
    try:
      text = re.match(r'[^\\n\\s\\p{Latin}]+', text).group()
      tokens = araby.tokenize(text)
      if not isStopword:
        tokens = remove_stopwords(stopwords, tokens)
      tokens = [t for t in tokens if t != '\n']
      output.append(tokens)
    except:
      pass
  
  return output
Example #17
def soundex(sentence):
    '''
    - blanks and spaces are deleted,
    - long vowels are deleted,
    - if two adjacent letters are identical, only one of the two is kept,
    - each letter of the word is associated with two numbers:

        1. the first corresponds to the letter's main-category code,
        represented by an integer N of two bits, N in {0, 1, 2};

        2. the second corresponds to the letter's sub-category code,
        represented by an integer n of four bits, n in {0, ..., 10}.

        Thus, given a word w = w_1 ... w_n:
            w' = w - {blanks and long vowels} = w'_1 ... w'_n
            f(w) = f(w') = f(w'_1 ... w'_n) = N_1 n_1 ... N_n n_n = X
        The generated phonetic code X can be used as a hash key for
        classifying and indexing purposes.

    Returns:
        None -- the cleaned token list is printed
    '''
    words = araby.tokenize(sentence)
    cleaned_words = []
    for word in words:
        # remove any tatweel
        word = araby.strip_tatweel(word)
        # delete long vowels (alef, yeh, waw) except in the first position
        word = word[:1] + re.sub(u"[\u0627\u064a\u0648]", "", word[1:])
        cleaned_words.append(word)
    print(cleaned_words)
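The adjacent-duplicate rule from the docstring is not implemented in the snippet above; a minimal sketch of that step (an illustration, not the original author's code):

```python
# Sketch of the "keep only one of two identical adjacent letters" rule
# described in the docstring; not part of the original snippet.
from itertools import groupby

def dedup_adjacent(word):
    return "".join(ch for ch, _ in groupby(word))

print(dedup_adjacent(u"مرر"))  # -> مر
```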
Example #18
                   for key, group in groupby(aa5irHarf)]
print(freqOfAa5irHarf)
import collections
counter = collections.Counter(aa5irHarf)
print(counter)
# Counter({1: 4, 2: 4, 3: 2, 5: 2, 4: 1})
print(counter.values())
# [4, 4, 2, 1, 2]
print(counter.keys())
# [1, 2, 3, 4, 5]
print(counter.most_common(3))
# [(1, 4), (2, 4), (3, 2)]
print(counter.most_common(1))
kkey = counter.most_common(1)
# we should write this to a file or persist it somewhere,
# and generalize it to all poems of each poet

# The qafiya (rhyme): the last sakin letter, going back to the sakin before it, together with the voweled letter preceding the next-to-last sakin
print('********** Al Qafiya ************')
for line in f:
    line1 = araby.strip_tatweel(line)
    letters, hrkat = araby.separate(line1)
    #print(letters.encode('utf8'))
    for m in hrkat:
        # adjustments still needed here
        if not araby.is_tatweel(m):
            print(araby.name(m))
            print(''.join(m))

# Most common words: computed over all poems of each poet
    if araby.is_weak(c): print('weak', end=' ')
    if araby.is_moon(c): print('moon', end=' ')
    if araby.is_sun(c): print('sun', end=' ')
    print(araby.order(c), end=' ')
    print()
word=u"الْعَرَيِيّةُ"
word_list=[
u"الْعَرَيِيّةُ",
u"العربية",
u"الْعَرَيِيّةُ الفصحى",
u"غير مشكول",
"Taha",
u"سئل لأنه يؤم الإمام"
]
word1=u""
for word in word_list:
    print (word)
    if araby.is_vocalized(word): print (' is vocalized')
    if araby.is_vocalizedtext(word): print (' is vocalized text')
    if araby.is_arabicword(word): print (' is valid word')
    else: print ("invalid arabic word")
    print (' strip harakat', araby.strip_harakat(word))
    print (' strip tashkeel', araby.strip_tashkeel(word))
    print (' strip tatweel',araby.strip_tatweel(word))
    print (' normalize ligature ', araby.normalize_ligature(word))
    print (' normalize hamza', araby.normalize_hamza(word))
    if araby.vocalizedlike(word, word1): print ("vocalized_like")
    word1 = word
if araby.vocalizedlike(u"العربية",u"العرَبية"): print ("vocalized_like")

Example #20
                 tokens.extend(araby.tokenize(line))
     for token in tokens:
         if len(token) > 1:
             soundex(token)    """
 with io.open("nouns.masdarv2.txt", encoding="utf-8") as doc:
     for line in doc:
         temptokens.extend(araby.tokenize(line))
     for token in temptokens:
         if len(token) >= 2 and araby.is_arabicrange(token):
             token = token.replace("\u0627", '')
             token = token.replace('\u0621', '\u0627')
             token = token.replace('\u0621\u0621', '\u0627')
             token = token.replace("\u0648", '')
             token = token.replace("\u064a", '')
             if len(token) >= 2:
                 tokens.append(
                     araby.strip_tashkeel(araby.strip_tatweel(token)))
     del temptokens
     for token in tokens:
         soundex(token[::-1])
     del tokens
 soundex(araby.MEEM + araby.DAL +
         araby.KAF)  #sample word, backwards because it is processed LTR
 soundex(araby.BEH + araby.TEH +
         araby.KAF)  #sample word, backwards because it is processed LTR
 soundex(araby.FEH + araby.TEH +
         araby.KAF)  #sample word, backwards because it is processed LTR
 soundex(araby.FEH + araby.NOON + araby.ALEF_HAMZA_ABOVE
         )  #sample word, backwards because it is processed LTR
 with open("db.json", 'w') as database:
     database.write(json.dumps(word_db))
Example #21
 def removeTatweel(self, t):
     return strip_tatweel(t)
Example #22
 def word_tagging(self, word_list):
     """
      Guess word classification into verb, noun, or stop word,
      and return a list of guessed tags.
     
     Example:
         >>> import naftawayh.wordtag 
         >>> word_list=(u'بالبلاد', u'بينما', u'أو', u'انسحاب', u'انعدام', 
         u'انفجار', u'البرنامج', u'بانفعالاتها', u'العربي', u'الصرفي', 
         u'التطرف', u'اقتصادي', )
         >>> tagger = naftawayh.wordtag.WordTagger();
         >>> # test all words
         >>> list_tags = tagger.word_tagging(word_list)
         >>> for word, tag in zip(word_list, list_tags):
          >>>     print(word, tag)
         بالبلاد n
         بينما vn3
         أو t
         انسحاب n
         انعدام n
         انفجار n
         البرنامج n
         بانفعالاتها n
         العربي n
         الصرفي n
         التطرف n
         اقتصادي n
     
     @param word_list: the given word lists.
     @type word_list: unicode list.
      @return: a tag list: 't': tool, 'v': verb,
      'n': noun, 'nv' or 'vn': unidentified.
     @rtype: unicode list
     """
     if len(word_list) == 0:
         return []
     else:
         list_result = []
         previous = u""
         second_previous = u""  # the second previous
         #~ previous_tag  = ""
         for word in word_list:
             word_nm = araby.strip_tashkeel(word)
             word_nm = araby.strip_tatweel(word_nm)
             tag = ''
              if word in self.cache:
                 tag = self.cache.get(word, '')
             else:
                 if self.is_stopword(word_nm):
                     tag = 't'
                 else:
                     if self.is_noun(word):
                         tag += 'n'
                     if self.is_verb(word):
                         tag += 'v'
                 # add the found tag to Cache.
                 self.cache[word] = tag
              # if the tagging gives an ambiguous tag,
              # we can do a contextual analysis;
              # the contextual tag is not saved in the cache,
              # because it can be ambiguous.
             # for example
             # في ضرب : is a noun
             # قد ضرب : is a verb
             if tag in ("", "vn", "nv"):
                 tag = self.context_analyse(previous, word) + "3"
                 if tag in ("", "1", "vn1", "nv1"):
                     tag = self.context_analyse(
                         u" ".join([second_previous, previous]), word) + "2"
             list_result.append(tag)
             second_previous = previous
             previous = word_nm
             #~ previous_tag  = tag
         return list_result
Example #23
    def _preprocess_v2(self, text: str) -> str:
        text = str(text)
        text = html.unescape(text)
        if self.strip_tashkeel:
            text = araby.strip_tashkeel(text)
        if self.strip_tatweel:
            text = araby.strip_tatweel(text)

        if self.replace_urls_emails_mentions:
            # replace all possible URLs
            for reg in URL_REGEXES:
                text = re.sub(reg, " [رابط] ", text)
            # replace emails with [بريد]
            for reg in EMAIL_REGEXES:
                text = re.sub(reg, " [بريد] ", text)
            # replace mentions with [مستخدم]
            text = re.sub(USER_MENTION_REGEX, " [مستخدم] ", text)

        if self.remove_html_markup:
            # remove html line breaks
            text = re.sub("<br />", " ", text)
            # remove html markup
            text = re.sub("</?[^>]+>", " ", text)

        if self.map_hindi_numbers_to_arabic:
            text = text.translate(HINDI_TO_ARABIC_MAP)

        # remove repeated characters >2
        if self.remove_non_digit_repetition:
            text = self._remove_non_digit_repetition(text)

        # insert whitespace around any character that is not an Arabic or
        # ASCII digit, an Arabic or Latin letter, or a square bracket
        if self.insert_white_spaces:
            text = re.sub(
                "([^0-9\u0621-\u063A\u0641-\u064A\u0660-\u0669a-zA-Z\[\]])",
                r" \1 ",
                text,
            )

            # insert whitespace between words and numbers or numbers and words
            text = re.sub("(\d+)([\u0621-\u063A\u0641-\u064A\u0660-\u066C]+)",
                          r" \1 \2 ", text)
            text = re.sub("([\u0621-\u063A\u0641-\u064A\u0660-\u066C]+)(\d+)",
                          r" \1 \2 ", text)

        if self.replace_slash_with_dash:
            text = text.replace("/", "-")

        # remove unwanted characters
        text = re.sub(self.REJECTED_CHARS_REGEX, " ", text)

        # remove extra spaces
        text = " ".join(text.replace("\uFE0F", "").split())

        if (self.model_name == "bert-base-arabertv2"
                or self.model_name == "bert-large-arabertv2"):
            if self.keep_emojis:
                new_text = []
                for word in text.split():
                    if word in list(self.emoji.UNICODE_EMOJI["en"].keys()):
                        new_text.append(word)
                    else:
                        new_text.append(self.farasa_segmenter.segment(word))
                text = " ".join(new_text)
            else:
                text = self.farasa_segmenter.segment(text)
            return self._farasa_segment(text)

        # All the other models don't require Farasa segmentation
        return text
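The digit/letter separation above can be checked in isolation; a minimal sketch with a made-up sample string:

```python
# Standalone check of the two whitespace-insertion regexes used above:
# digits glued to Arabic letters get split apart (sample string is made up).
import re

text = u"عام2021بدأ"
text = re.sub("(\d+)([\u0621-\u063A\u0641-\u064A\u0660-\u066C]+)", r" \1 \2 ", text)
text = re.sub("([\u0621-\u063A\u0641-\u064A\u0660-\u066C]+)(\d+)", r" \1 \2 ", text)
print(" ".join(text.split()))  # -> عام 2021 بدأ
```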
Example #24
    u"الْعَرَيِيّةُ الفصحى",
    u"غير مشكول",
    "Taha",
]
word1 = u""
for word in word_list:
    print(word, '\t', end=" ")
    if araby.is_vocalized(word): print(' is vocalized', end=" ")
    ##    if araby.isArabicstring(word): print(' is Arabic string', end=" ")
    ##    else: print(' invalid Arabic string', end=" ")
    if araby.is_vocalizedtext(word): print(' is vocalized text', end=" ")
    if araby.is_arabicword(word): print(' is valid word', end=" ")
    else: print("invalid arabic word", end=" ")
    print(' strip harakat', araby.strip_harakat(word), end=" ")
    print(' strip tashkeel', araby.strip_tashkeel(word), end=" ")
    print(' strip tatweel', araby.strip_tatweel(word), end=" ")
    print(' normalize ligature ', araby.normalize_ligature(word), end=" ")
    if araby.vocalizedlike(word, word1): print("vocalized_like", end=" ")
    print()
    word1 = word
if araby.vocalizedlike(u"العربية", u"العرَبية"): print("vocalized_like", end=" ")
word = u"الْعَرَيِيّةُ"
word_list = [
    u"الْعَرَيِيّةُ",
    u"العربية",
    u"الْعَرَيِيّةُ الفصحى",
    u"غير مشكول",
    "Taha",
]
word1 = u""
Example #25
def strip_tatweel(text):
    reduced = araby.strip_tatweel(text)
    return reduced
Example #26
    def check_word(self, word, guessedtag = ""):
        """
        Analyze one word morphologically (as punctuation, stop word, verb, or noun).
        @param word: the input word.
        @type word: unicode.
        @return: list of dictionaries of analyzed words with tags.
        @rtype: list.
        """
        
        word = araby.strip_tatweel(word)
        word_vocalised = word
        word_nm = araby.strip_tashkeel(word)
        # get analysed details from cache if used
        if self.allow_cache_use and self.cache.isAlreadyChecked(word_nm):
            #~ print (u"'%s'"%word).encode('utf8'), 'found'
            resulted_data = self.cache.getChecked(word_nm)
        else:
            resulted_data = []
            # if the word is punctuation
            resulted_data += self.check_word_as_pounct(word_nm)
            # Done: if the word is a stop word we have some problems;
            # a stop word can also be another normal word (verb or noun),
            # which must be considered in future work.
            # if the word is a stop word, allow stop-word analysis
            resulted_data += self.check_word_as_stopword(word_nm)

            # if the word is a verb
            # problem: some excluded words can be either verbs or nouns
            #~ if self.tagger.has_verb_tag(guessedtag) or \
            #~ self.tagger.is_stopword_tag(guessedtag):
            #~     resulted_data += self.check_word_as_verb(word_nm)
            resulted_data += self.check_word_as_verb(word_nm)
            #~ print("is verb", rabti, len(resulted_data))
            # if the word is a noun
            #~ if self.tagger.has_noun_tag(guessedtag) or \
            #~ self.tagger.is_stopword_tag(guessedtag):
            #~     resulted_data += self.check_word_as_noun(word_nm)
            resulted_data += self.check_word_as_noun(word_nm)
            if len(resulted_data) == 0:
                print(u"1 _unknown %s-%s" % (word, word_nm))
                # check the word as unknown
                resulted_data += self.check_word_as_unknown(word_nm)
                # check if the word is normalized and solutions are equivalent
            resulted_data = check_normalized(word_vocalised, resulted_data)
            #check if the word is shadda like
            resulted_data = check_shadda(word_vocalised, resulted_data)


            # add word frequency information in tags
            resulted_data = self.add_word_frequency(resulted_data)

            # add the stemmed words details into Cache
            data_list_to_serialize = [w.__dict__ for w in resulted_data]
            if self.allow_cache_use:
                self.cache.addChecked(word_nm, data_list_to_serialize)

        # check whether the word is vocalized like the results
        if self.partial_vocalization_support:
            resulted_data = check_partial_vocalized(word_vocalised, resulted_data)

        if len(resulted_data) == 0:
            resulted_data.append(wordcase.WordCase({
            'word':word, 
            'affix': ('' , '', '', ''),     
            'stem':word, 
            'original':word, 
            'vocalized':word, 
            'semivocalized':word, 
            'tags':u'', 
            'type':'unknown', 
            'root':'', 
            'template':'', 
            'freq':self.wordfreq.get_freq(word, 'unknown'), 
            'syntax':'', 
            })
            )
        return resulted_data
Example #27

# remove stop word
with open('clean_tweet.txt', 'r') as inFile, open('removeStopWordFile.txt', 'w') as outFile:
    for line in inFile.readlines():
        print(" ".join([word for word in line.split()
                        # if the word not in stop word write to file
                        if word not in stopwords.words('arabic', 'UTF-8')]), file=outFile)
inFile.close()
outFile.close()

with open('removeStopWordFile.txt', 'r') as f_input, open('RemoveT&T.txt', 'w') as f_output:
    for line in f_input:
        st = line
        # مكــــــــــــــه becomes مكه (tatweel removed)
        x = araby.strip_tatweel(st)
        # remove diacritics such as  َ ً ُ ٌ
        y = araby.strip_tashkeel(x)
        # write to file
        f_output.write(y)

# stemming each word
with open('RemoveT&T.txt', 'r') as f_input, open('steam.txt', 'w') as f_output:
    for line in f_input.read().split("\n"):

        sentence = u"" + line
        for word in sentence.split(" "):
            if word[-1:] == 'ة':
Example #28
    def check_word(self, word, guessedtag = ""):
        """
        Analyze one word morphologically (as punctuation, stop word, verb, or noun).
        @param word: the input word.
        @type word: unicode.
        @return: list of dictionaries of analyzed words with tags.
        @rtype: list.
        """
        
        word = araby.strip_tatweel(word)
        word_vocalised = word
        word_nm = araby.strip_tashkeel(word)
        # get analysed details from cache if used
        if self.allow_cache_use and self.cache.isAlreadyChecked(word_nm):
            #~ print (u"'%s'"%word).encode('utf8'), 'found'
            resulted_data = self.cache.getChecked(word_nm)
        else:
            resulted_data = []
            # if the word is punctuation
            resulted_data += self.check_word_as_pounct(word_nm)
            # Done: if the word is a stop word we have some problems;
            # a stop word can also be another normal word (verb or noun),
            # which must be considered in future work.
            # if the word is a stop word, allow stop-word analysis
            resulted_data += self.check_word_as_stopword(word_nm)

            # if the word is a verb
            # problem: some excluded words can be either verbs or nouns
            #~ if self.tagger.has_verb_tag(guessedtag) or \
            #~ self.tagger.is_stopword_tag(guessedtag):
            #~     resulted_data += self.check_word_as_verb(word_nm)
            resulted_data += self.check_word_as_verb(word_nm)
            #~ print("is verb", rabti, len(resulted_data))
            # if the word is a noun
            #~ if self.tagger.has_noun_tag(guessedtag) or \
            #~ self.tagger.is_stopword_tag(guessedtag):
            #~     resulted_data += self.check_word_as_noun(word_nm)
            resulted_data += self.check_word_as_noun(word_nm)
            if len(resulted_data) == 0:
                print(u"1 _unknown %s-%s" % (word, word_nm))
                # check the word as unknown
                resulted_data += self.check_word_as_unknown(word_nm)
                # check if the word is normalized and solutions are equivalent
            resulted_data = check_normalized(word_vocalised, resulted_data)
            #check if the word is shadda like
            resulted_data = check_shadda(word_vocalised, resulted_data)


            # add word frequency information in tags
            resulted_data = self.add_word_frequency(resulted_data)

            # add the stemmed words details into Cache
            data_list_to_serialize = [w.__dict__ for w in resulted_data]
            if self.allow_cache_use:
                self.cache.addChecked(word_nm, data_list_to_serialize)

        # check whether the word is vocalized like the results
        if self.partial_vocalization_support:
            resulted_data = check_partial_vocalized(word_vocalised, resulted_data)

        if len(resulted_data) == 0:
            resulted_data.append(wordcase.WordCase({
            'word':word, 
            'affix': ('' , '', '', ''),     
            'stem':'', 
            'original':word, 
            'vocalized':word, 
            'tags':u'', 
            'type':'unknown', 
            'root':'', 
            'template':'', 
            'freq':self.wordfreq.get_freq(word, 'unknown'), 
            'syntax':'', 
            })
            )
        return resulted_data
Example #29
u"الْعَرَيِيّةُ الفصحى",
u"غير مشكول",
"Taha",
]
word1=u""
for word in word_list:
    print(word, '\t', end=" ")
    if araby.is_vocalized(word): print(' is vocalized', end=" ")
##    if araby.isArabicstring(word): print(' is Arabic string', end=" ")
##    else: print(' invalid Arabic string', end=" ")
    if araby.is_vocalizedtext(word): print(' is vocalized text', end=" ")
    if araby.is_arabicword(word): print(' is valid word', end=" ")
    else: print("invalid arabic word", end=" ")
    print(' strip harakat', araby.strip_harakat(word), end=" ")
    print(' strip tashkeel', araby.strip_tashkeel(word), end=" ")
    print(' strip tatweel', araby.strip_tatweel(word), end=" ")
    print(' normalize ligature ', araby.normalize_ligature(word), end=" ")
    if araby.vocalizedlike(word, word1): print("vocalized_like", end=" ")
    print()
    word1 = word
if araby.vocalizedlike(u"العربية", u"العرَبية"): print("vocalized_like", end=" ")
word=u"الْعَرَيِيّةُ"
word_list=[
u"الْعَرَيِيّةُ",
u"العربية",
u"الْعَرَيِيّةُ الفصحى",
u"غير مشكول",
"Taha",
]
word1=u""
for word in word_list:
 def strip_text(self, text):
     return araby.strip_tatweel(araby.strip_tashkeel(text))
Example #31
def run_diac(gomla, dialect):
    sos = 'بدايةجملة' if dialect == 'ca' else 'بداية'
    eos = 'نهايةجملة' if dialect == 'ca' else 'نهاية'
    token_list_7 = LastNTokens(7, sos)

    fname = randint(0, 100000)
    with codecs.open(f'diacritizer/userdata/{dialect}/{fname}.fmt',
                     mode='w',
                     encoding='utf-8') as infile:
        gomla = strip_tatweel(araby.normalize_ligature(gomla))
        # gomla_list = araby.tokenize(gomla.replace('_', '-'), conditions=araby.is_arabicrange, morphs=araby.strip_tashkeel)
        gomla_list = araby.tokenize(gomla.replace('_', '-'),
                                    morphs=araby.strip_tashkeel)
        # gomla_list = gomla.strip().split()

        for token in gomla_list:
            t = ' '.join(token)
            token_list_7.add_tokens_list(t, 0)
            infile.write(token_list_7.get_n_tokens() + '\n')
        # pad the 7-token window with six end-of-sentence markers
        for _ in range(6):
            token_list_7.add_tokens_list(eos, 0)
            infile.write(token_list_7.get_n_tokens() + '\n')

    if dialect == 'ca':
        ca_runner.infer(
            f"diacritizer/userdata/ca/{fname}.fmt",
            predictions_file=f"diacritizer/userdata/ca/{fname}.rlt",
            checkpoint_path=None,
            log_time=False)
    elif dialect == 'msa':
        msa_runner.infer(
            f"diacritizer/userdata/msa/{fname}.fmt",
            predictions_file=f"diacritizer/userdata/msa/{fname}.rlt",
            checkpoint_path=None,
            log_time=False)
    elif dialect == 'tun':
        tn_runner.infer(
            f"diacritizer/userdata/tun/{fname}.fmt",
            predictions_file=f"diacritizer/userdata/tun/{fname}.rlt",
            checkpoint_path=None,
            log_time=False)
    elif dialect == 'mor':
        ma_runner.infer(
            f"diacritizer/userdata/mor/{fname}.fmt",
            predictions_file=f"diacritizer/userdata/mor/{fname}.rlt",
            checkpoint_path=None,
            log_time=False)

    with codecs.open(f'diacritizer/userdata/{dialect}/{fname}.rlt',
                     mode='r',
                     encoding='utf-8') as outfile:
        diacritized_tokens = list()
        counters = defaultdict(Counter)
        for i, line in enumerate(outfile):
            dtokens = line.strip().split(' _ ')
            # print(len(dtokens), dtokens)
            for j, _ in enumerate(dtokens):
                tk = dtokens[j - 1 - i % 7]

                if tk not in [eos, sos]:
                    counters[j].update([tk])

                if sum(counters[j].values()) >= 7:
                    diacritized_tokens.append(
                        counters[j].most_common(1)[0][0].replace(' ', ''))
                    counters[j].clear()
        return ' '.join(diacritized_tokens)
Example #32
    def preprocess(self, text):
        """
        Preprocess takes an input text line an applies the same preprocessing used in AraBERT
                            pretraining

        Args:

            text (:obj:`str`): inout text string

        Returns:

            string: A preprocessed string depending on which model was selected
        """
        if self.model_name == "bert-base-arabert":
            return self._old_preprocess(
                text,
                do_farasa_tokenization=True,
            )

        if self.model_name == "bert-base-arabertv01":
            return self._old_preprocess(text, do_farasa_tokenization=False)

        text = str(text)
        text = html.unescape(text)
        if self.strip_tashkeel:
            text = araby.strip_tashkeel(text)
        if self.strip_tatweel:
            text = araby.strip_tatweel(text)

        if self.replace_urls_emails_mentions:
            # replace all possible URLs
            for reg in url_regexes:
                text = re.sub(reg, " [رابط] ", text)
            # replace emails with [بريد]
            for reg in email_regexes:
                text = re.sub(reg, " [بريد] ", text)
            # replace mentions with [مستخدم]
            text = re.sub(user_mention_regex, " [مستخدم] ", text)

        if self.remove_html_markup:
            # remove html line breaks
            text = re.sub("<br />", " ", text)
            # remove html markup
            text = re.sub("</?[^>]+>", " ", text)

        # remove repeated characters >2
        if self.remove_elongation:
            text = self._remove_elongation(text)

        # insert whitespace around any character that is not an Arabic or
        # ASCII digit, an Arabic or Latin letter, or a square bracket
        if self.insert_white_spaces:
            text = re.sub(
                "([^0-9\u0621-\u063A\u0641-\u064A\u0660-\u0669a-zA-Z\[\]])",
                r" \1 ",
                text,
            )

            # insert whitespace between words and numbers or numbers and words
            text = re.sub("(\d+)([\u0621-\u063A\u0641-\u064A\u0660-\u066C]+)",
                          r" \1 \2 ", text)
            text = re.sub("([\u0621-\u063A\u0641-\u064A\u0660-\u066C]+)(\d+)",
                          r" \1 \2 ", text)

        # remove unwanted characters
        if self.keep_emojis:
            emoji_regex = "".join(list(self.emoji.UNICODE_EMOJI["en"].keys()))
            rejected_chars_regex2 = "[^%s%s]" % (chars_regex, emoji_regex)
            text = re.sub(rejected_chars_regex2, " ", text)
        else:
            text = re.sub(rejected_chars_regex, " ", text)

        # remove extra spaces
        text = " ".join(text.replace("\uFE0F", "").split())

        if (self.model_name == "bert-base-arabertv2"
                or self.model_name == "bert-large-arabertv2"):
            if self.keep_emojis:
                new_text = []
                for word in text.split():
                    if word in list(self.emoji.UNICODE_EMOJI["en"].keys()):
                        new_text.append(word)
                    else:
                        new_text.append(self.farasa_segmenter.segment(word))
                text = " ".join(new_text)
            else:
                text = self.farasa_segmenter.segment(text)
            return self._farasa_segment(text)

        # All the other models don't require Farasa segmentation
        return text
Example #33
    def from_url(url):
        """ Initialize a Question (written or oral) from its public URL
        e.g.: https://www.chambredesrepresentants.ma/ar/%D9%85%D8%B1%D8%A7%D9%82%D8%A8%D8%A9-%D8%A7%D9%84%D8%B9%D9%85%D9%84-%D8%A7%D9%84%D8%AD%D9%83%D9%88%D9%85%D9%8A/%D8%A7%D9%84%D8%A3%D8%B3%D9%80%D8%A6%D9%84%D8%A9-%D8%A7%D9%84%D9%83%D8%AA%D8%A7%D8%A8%D9%8A%D8%A9/%D8%A7%D9%86%D8%AA%D8%B4%D8%A7%D8%B1-%D8%A7%D9%84%D8%AD%D8%B4%D8%B1%D8%A9-%D8%A7%D9%84%D9%82%D8%B1%D9%85%D8%B2%D9%8A%D8%A9-%D8%A8%D9%86%D8%A8%D8%A7%D8%AA-%D8%A7%D9%84%D8%B5%D8%A8%D8%A7%D8%B1
        Note: only the arabic version is currently supported."""

        r = requests.get(url)
        s = BeautifulSoup(r.text, 'html.parser')

        try:
            # Note: strip_tatweel below is mandatory; it took me several
            # hours of head-scratching to figure that out
            title = araby.strip_tatweel(
                s.find_all(
                    'h1', class_='section-title title-global lang_ar')[0].text)
            title = title.strip()

            qtype = 'undefined'
            if arabic_string_eq(title, "الأسئلة الكتابية"):
                qtype = 'written'
            elif arabic_string_eq(title, "الأسئلة الشفوية"):
                qtype = 'oral'

            spans = s.find_all(class_='q-b1-1')[0].find_all('span')
            status = 'unknown'
            if 'q-st-red' in spans[4]['class']:
                status = 'unanswered'
            elif 'q-st-green' in spans[4]['class']:
                status = 'answered'

            if status == 'unanswered':
                content = s.find_all(class_='q-block1-na')[0]
            elif status == 'answered':
                content = s.find_all(class_='q-block1')[0]
            else:
                raise ValueError("Malformed HTML")

            qb11 = content.find_all(class_='q-b1-1')[0].find_all('span')

            # XXX How do we handle cases where the regex fails to match?
            p = re.compile('رقم السؤال : ([0-9]+)')
            res = p.match(qb11[0].text.lstrip().rstrip())
            id = int(res.group(1))

            p = re.compile('الموضوع : (.*)')
            res = p.match(qb11[1].text.lstrip().rstrip())
            topic = res.group(1)

            answer_date_raw = ''
            if arabic_string_eq(qb11[2].text.split(':')[0].lstrip().rstrip(),
                                'تاريخ الجواب'):
                answer_date_raw = qb11[3].text.lstrip().rstrip()
            answer_date = format_raw_date_to_isoformat(answer_date_raw)

            qb12 = content.find_all(class_='q-b1-2 row')[0]
            team = qb12.find_all(class_='col-md-5')[0].find_all('a')[0].text
            questioners = []
            for q in qb12.find_all(class_='col-md-7')[0].find_all(
                    class_='q-b1-2-s-item'):
                questioners.append(
                    q.find_all(
                        class_='q-name')[0].find('a').text.lstrip().rstrip())

            qb13 = content.find_all(class_='q-b1-3')[0]

            offset = 0
            designated_ministry = ''
            # XXX : are there cases where there are multiple ministries involved?
            if arabic_string_eq(
                    qb13.find_all('div')[0].text.split(':')
                [0].lstrip().rstrip(), 'الوزارة المختصة'):
                designated_ministry = qb13.find_all('div')[0].text.split(
                    ':')[1].replace('\n', '').lstrip().rstrip()
                offset = 1

            question_date_raw = ''
            question_text = ''
            if qtype == "written":
                if arabic_string_eq(
                        qb13.find_all('div')[offset].text.split(':')
                    [0].lstrip().rstrip(), 'تاريخ السؤال'):
                    question_date_raw = qb13.find_all(
                        'div')[offset].text.split(':')[1].replace(
                            '\n', '').lstrip().rstrip()

                if arabic_string_eq(
                        qb13.find_all('div')[offset + 1].text.split(':')
                    [0].lstrip().rstrip(), 'السؤال'):
                    question_text = qb13.find_all('div')[offset + 1].find(
                        'p').text.lstrip().rstrip()
            elif qtype == "oral":
                # TODO: check if  there are cases where the date is given
                if arabic_string_eq(
                        qb13.find_all('div')[offset].text.split(':')
                    [0].lstrip().rstrip(), 'السؤال'):
                    question_text = qb13.find_all('div')[offset].find(
                        'p').text.lstrip().rstrip()

            question_date = format_raw_date_to_isoformat(question_date_raw)

            answer_doc = ''
            answer_content = s.find_all(class_='q-block2')
            if answer_content != []:
                answer_doc = answer_content[0].find_all('a',
                                                        href=True)[0]['href']

            q = Question(id, url, qtype, topic, question_date, question_text,
                         questioners, status, answer_date, answer_doc,
                         designated_ministry)
            return q

        except Exception as e:
            print('There was an exception: %s' % e)
            return None
    u"الْعَرَيِيّةُ",
    u"العربية",
    u"الْعَرَيِيّةُ الفصحى",
    u"غير مشكول",
    "Taha",
]
word1 = u""
for word in word_list:
    print(word, '\t', end=" ")
    if araby.is_vocalized(word): print(' is vocalized', end=" ")
    if araby.is_vocalizedtext(word): print(' is vocalized text', end=" ")
    if araby.is_arabicword(word): print(' is valid word', end=" ")
    else: print("invalid arabic word", end=" ")
    print(' strip harakat', araby.strip_harakat(word), end=" ")
    print(' strip tashkeel', araby.strip_tashkeel(word), end=" ")
    print(' strip tatweel', araby.strip_tatweel(word), end=" ")
    print(' normalize ligature ', araby.normalize_ligature(word), end=" ")
    if araby.vocalizedlike(word, word1): print("vocalized_like", end=" ")
    print()
    word1 = word
if araby.vocalizedlike(u"العربية", u"العرَبية"):
    print("vocalized_like", end=" ")
word = u"الْعَرَيِيّةُ"
word_list = [
    u"الْعَرَيِيّةُ",
    u"العربية",
    u"الْعَرَيِيّةُ الفصحى",
    u"غير مشكول",
    "Taha",
]
word1 = u""
Example #35
def strip_text(text):  # removes Arabic diacritics (tashkeel) and tatweel
    return araby.strip_tatweel(araby.strip_tashkeel(text))