def clean(self, strng):
    """ Clean a string (or list of strings) from unnecessary whitespace and tatweel. """
    # if type(strng) == str or type(strng) == unicode:  # Python 2
    if isinstance(strng, str):  # Python 3
        strng = araby.strip_tatweel(strng)
        return re.sub(r'\s+', ' ', strng).strip()
    if isinstance(strng, list):
        cleaned = [re.sub(r'\s+', ' ', s).strip() for s in strng]
        return [araby.strip_tatweel(s) for s in cleaned]
    return strng
def normalizeText(self, text):
    normalized_text = araby.strip_tatweel(text)
    normalized_text = araby.strip_tashkeel(normalized_text)
    # strip_tashkeel already removes the harakat, so this call is a harmless no-op
    normalized_text = araby.strip_harakat(normalized_text)
    normalized_text = araby.normalize_hamza(normalized_text)
    return normalized_text
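# A minimal standalone sketch of the same normalization chain, assuming pyarabic
# is installed (pip install pyarabic); the sample word is illustrative only.
from pyarabic import araby

def normalize_text(text):
    text = araby.strip_tatweel(text)    # remove kashida/elongation
    text = araby.strip_tashkeel(text)   # remove all diacritic marks (includes harakat)
    text = araby.normalize_hamza(text)  # unify hamza variants
    return text

print(normalize_text(u"القُـــرْآنُ الكَرِيـــم"))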
def grouping_letter_diacritics(sentance):
    """Group each letter with its diacritics.

    Args:
        sentance: str
    Returns:
        [str]: a list where each element is a letter together with its diacritics.
    Example:
        ```python
        q.grouping_letter_diacritics('إِنَّا أَعْطَيْنَكَ الْكَوْثَرَ')
        >>> ['إِ', 'نَّ', 'ا', ' ', 'أَ', 'عْ', 'طَ', 'يْ', 'نَ', 'كَ', ' ', 'ا', 'لْ', 'كَ', 'وْ', 'ثَ', 'رَ']
        ```
    """
    # work on the tatweel-free copy (the original looped over the raw input and
    # only printed the stripped version)
    sentance = strip_tatweel(sentance)
    # `x in (a or b or c)` only ever tested the first collection; build real unions
    letters = set(alphabet) | set(alefat) | set(hamzat)
    diacritics = set(tashkeel) | set(harakat) | set(shortharakat) | set(tanwin)
    hroof_with_tashkeel = []
    for index, char in enumerate(sentance):
        if char in letters or char == ' ':
            harf_with_taskeel = char
            k = index
            while k + 1 < len(sentance) and sentance[k + 1] in diacritics:
                harf_with_taskeel += sentance[k + 1]
                k += 1
            hroof_with_tashkeel.append(harf_with_taskeel)
    return hroof_with_tashkeel
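# An alternative, dependency-light sketch of the same grouping idea: Arabic
# diacritics are Unicode combining marks, so each mark can simply be attached
# to the preceding base character. This is not the function above, only an
# illustration using the standard library.
import unicodedata

def group_letter_diacritics(text):
    groups = []
    for ch in text:
        if unicodedata.combining(ch) and groups:
            # combining marks (harakat, shadda, tanwin) join the previous letter
            groups[-1] += ch
        else:
            groups.append(ch)
    return groups

print(group_letter_diacritics('إِنَّا أَعْطَيْنَكَ الْكَوْثَرَ'))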
def lookup(self, word):
    """
    Look up whether the word is correct or not
    @param word: input word
    @type word: unicode
    @return: True if word exists else False
    @rtype: Boolean
    """
    if not word:
        return True
    if word.isdigit():
        return True
    for c in word:
        if c in string.punctuation:
            return True
    # test if the word was previously spell-checked;
    # the cached value can be True or False
    if word in self.worddict:
        test = self.worddict.get(word, False)
    else:
        # the word has not been spell-checked yet
        word = araby.strip_tatweel(word)
        self.stemmer.segment(word)
        # extract the affix
        stem = self.stemmer.get_stem()
        affix = u"-".join([self.stemmer.get_prefix(), self.stemmer.get_suffix()])
        # lookup in the database
        test = self.database.lookup(word, stem, affix)
        self.worddict[word] = test
    return test
def extract_poem():
    """Tokenize the story file; a postings list of Arabic words can also be built
    from the commented-out block below.

    Returns:
        list -- the tashkeel- and tatweel-free tokens of the file
    """
    postings_list = {}
    tokens = []
    with open("short story.txt", encoding='utf-8') as ofile:
        for loc, line in enumerate(ofile, 1):
            words = araby.tokenize(araby.strip_tashkeel(line))
            tokens.extend(words)
            for word in words:
                # strip_tatweel is safe on any token; the original is_tatweel()
                # guard only matched single-character tatweel tokens
                word = araby.strip_tatweel(word)
                # if word not in postings_list:
                #     postings_list[word] = [loc]
                # else:
                #     postings_list[word].append(loc)
    # return postings_list
    return tokens
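# Standalone sketch of the postings-list idea from the commented-out block:
# map each tatweel- and tashkeel-free token to the line numbers where it occurs.
# The file name mirrors the one assumed above.
from pyarabic import araby

def build_postings(path="short story.txt"):
    postings = {}
    with open(path, encoding="utf-8") as fh:
        for loc, line in enumerate(fh, 1):
            for word in araby.tokenize(araby.strip_tashkeel(line)):
                word = araby.strip_tatweel(word)
                postings.setdefault(word, []).append(loc)
    return postings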
def lookup(self, word):
    """
    Look up whether the word is correct or not
    @param word: input word
    @type word: unicode
    @return: True if word exists else False
    @rtype: Boolean
    """
    if not word:
        return True
    # test if the word was previously spell-checked;
    # the cached value can be True or False
    if word in self.worddict:
        test = self.worddict.get(word, False)
    else:
        # the word has not been spell-checked yet
        word = araby.strip_tatweel(word)
        # split off a leading conjunction (waw or feh) before stemming
        if word.startswith(araby.WAW) or word.startswith(araby.FEH):
            conjonction = word[0]
            word = word[1:]
        else:
            conjonction = u""
        self.stemmer.segment(word)
        # extract the affix
        stem = self.stemmer.get_stem()
        affix = u"-".join(
            [self.stemmer.get_prefix(), self.stemmer.get_suffix()])
        # lookup in the database
        test = self.database.lookup(word, stem, affix, conjonction)
        self.worddict[word] = test
    return test
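# The lookup() variants above share one pattern: normalize the word, then
# memoize the (expensive) dictionary answer. A minimal self-contained sketch of
# that memoization; KNOWN_WORDS is a tiny hypothetical stand-in for the real
# stem database, not part of the original code.
from pyarabic import araby

KNOWN_WORDS = {u"كتاب", u"علم", u"درس"}
_cache = {}

def lookup(word):
    if not word:
        return True
    if word in _cache:
        return _cache[word]
    normalized = araby.strip_tashkeel(araby.strip_tatweel(word))
    found = normalized in KNOWN_WORDS
    _cache[word] = found
    return found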
def lookup(self, word):
    """
    Look up whether the word is correct or not
    @param word: input word
    @type word: unicode
    @return: True if word exists else False
    @rtype: Boolean
    """
    if not word:
        return True
    # test if the word was previously spell-checked;
    # the cached value can be True or False
    if word in self.worddict:
        test = self.worddict.get(word, False)
    else:
        # the word has not been spell-checked yet
        word = araby.strip_tatweel(word)
        self.stemmer.segment(word)
        # extract the affix
        stem = self.stemmer.get_stem()
        affix = u"-".join([self.stemmer.get_prefix(), self.stemmer.get_suffix()])
        # lookup in the database
        test = self.database.lookup(word, stem, affix)
        self.worddict[word] = test
    return test
def normalize_arabic(self, text):
    text = re.sub(r'\bال(\w\w+)', r'\1', text)  # remove the definite article (al ta3reef)
    text = re.sub("[إأآاٱ]", "ا", text)          # unify alef variants
    text = re.sub("ى", "ي", text)                # alef maqsura -> yeh
    text = re.sub("ة", "ه", text)                # replace teh marbuta with heh
    text = re.sub("گ", "ك", text)                # Persian gaf -> kaf
    text = strip_tatweel(text)                   # remove tatweel
    text = strip_tashkeel(text)                  # remove tashkeel
    return text
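# Standalone sketch of the same regex-based normalization chain; the sample
# input and the expected output shown in the comment are illustrative only.
import re
from pyarabic.araby import strip_tatweel, strip_tashkeel

def normalize_arabic(text):
    text = re.sub(r'\bال(\w\w+)', r'\1', text)  # drop the definite article
    text = re.sub("[إأآاٱ]", "ا", text)          # unify alef forms
    text = re.sub("ى", "ي", text)                # alef maqsura -> yeh
    text = re.sub("ة", "ه", text)                # teh marbuta -> heh
    text = strip_tatweel(text)
    text = strip_tashkeel(text)
    return text

print(normalize_arabic(u"المدرســـة الابتدائية"))  # expected: something like "مدرسه ابتدائيه"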
def preprocess(sentence):
    sentence = araby.strip_tatweel(sentence)
    sentence = sentence.replace(
        araby.SMALL_ALEF + araby.ALEF_MAKSURA, araby.ALEF_MAKSURA)
    sentence = sentence.replace(
        araby.ALEF_MAKSURA + araby.SMALL_ALEF, araby.ALEF_MAKSURA)
    sentence = re.sub(ALEFAT_PATTERN, araby.ALEF, sentence)
    sentence = araby.normalize_ligature(sentence)
    sentence = araby.normalize_teh(sentence)
    sentence = araby.strip_tashkeel(sentence)
    sentence = re.sub(r'[^\d\w]', r' ', sentence)
    sentence = re.sub(r'( ){2,}', r'\1', sentence)
    return sentence
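# ALEFAT_PATTERN is not defined in the snippet above; a plausible definition is
# a character class over the alef variants. The exact pattern below is an
# assumption, not taken from the original module.
import re
from pyarabic import araby

ALEFAT_PATTERN = re.compile(u"[\u0622\u0623\u0625\u0671]")  # آ أ إ ٱ

def preprocess(sentence):
    sentence = araby.strip_tatweel(sentence)
    sentence = re.sub(ALEFAT_PATTERN, araby.ALEF, sentence)  # unify alef forms
    sentence = araby.normalize_ligature(sentence)
    sentence = araby.strip_tashkeel(sentence)
    sentence = re.sub(r'[^\d\w]', ' ', sentence)    # drop punctuation
    return re.sub(r' {2,}', ' ', sentence).strip()  # collapse repeated spaces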
def word_tagging(self, word_list):
    """
    Guess the word classification as verb, noun or stop word and return
    a list of guessed tags.
    @param word_list: the given word list.
    @type word_list: unicode list.
    @return: a tag list: 't': tool, 'v': verb, 'n': noun, 'nv' or 'vn' unidentified.
    @rtype: unicode list
    """
    if len(word_list) == 0:
        return []
    list_result = []
    previous = u""
    second_previous = u""  # the second previous word
    #~ previous_tag = ""
    for word in word_list:
        word_nm = araby.strip_tashkeel(word)
        word_nm = araby.strip_tatweel(word_nm)
        tag = ''
        if word in self.cache:  # dict.has_key() no longer exists in Python 3
            tag = self.cache.get(word, '')
        else:
            if self.is_stopword(word_nm):
                tag = 't'
            else:
                if self.is_noun(word):
                    tag += 'n'
                if self.is_verb(word):
                    tag += 'v'
            # add the found tag to the cache
            self.cache[word] = tag
        # If the tagging gives an ambiguous tag, we can do a contextual
        # analysis. The contextual tag is not saved in the cache because
        # it can itself be ambiguous, for example:
        #   في ضرب : is a noun
        #   قد ضرب : is a verb
        if tag in ("", "vn", "nv"):
            tag = self.context_analyse(previous, word) + "3"
            if tag in ("", "1", "vn1", "nv1"):
                tag = self.context_analyse(
                    u" ".join([second_previous, previous]), word) + "2"
        list_result.append(tag)
        second_previous = previous
        previous = word_nm
        #~ previous_tag = tag
    return list_result
def normalize(word, wordtype="affix"):
    """
    Normalize the word by unifying hamzat, alef madda, shadda and lam-alefs.
    @param word: given word.
    @type word: unicode.
    @param wordtype: if the word is an affix.
    @type wordtype: unicode.
    @return: converted word.
    @rtype: unicode.
    """
    # Convert the word into its theoretical form.
    # The orthographic form is how the word is written according to spelling rules;
    # the theoretical form is the imagined form of the word before those rules are
    # applied, which mainly concerns the various hamza shapes (all written as a
    # hamza on the line).
    # Examples (orthographic -> theoretical):
    #   إِمْلَائِي -> ءِمْلَاءِي
    #   سَاَلَ -> سَءَلَ
    # Goal: move the word to its theoretical form so it can be conjugated away
    # from spelling rules; after conjugation the spelling rules are applied again.
    # Assumption: the input words are fully vocalized.
    # Method:
    #   1- convert every kind of hamza to a hamza on the line
    #   2- split the shadda (undo gemination)
    i = 0
    # Strip tatweel: the tatweel is used to uniformize the affix
    # when the haraka is written separately.
    if wordtype != "affix":
        word = araby.strip_tatweel(word)
    # An alef madda at the beginning of the word is replaced by a hamza
    # followed by another hamza.
    if word.startswith(ALEF_MADDA):
        word = normalize_alef_madda(word)
    # ignore harakat at the beginning of the word
    len_word = len(word)
    while i < len_word and araby.is_shortharaka(word[i]):  # in HARAKAT
        i += 1
    word = word[i:]
    # convert all hamza forms into one form
    word = araby.normalize_hamza(word)
    # convert all lam-alef ligatures into separate letters
    word = word.replace(LAM_ALEF, SIMPLE_LAM_ALEF)
    word = word.replace(LAM_ALEF_HAMZA_ABOVE, SIMPLE_LAM_ALEF_HAMZA_ABOVE)
    word = word.replace(LAM_ALEF_MADDA_ABOVE, SIMPLE_LAM_ALEF_MADDA_ABOVE)
    return word
def create_arabic_node(self, cluster_name, label, **kwargs):
    """
    Checks that label is an Arabic string, removes tatweel and normalizes
    ligatures. Adds unvocalized_label.
    """
    label = araby.normalize_ligature(araby.strip_tatweel(label))
    label = label.replace(araby.SMALL_ALEF, "")
    if not araby.is_arabicstring(label):
        raise RuntimeError("'%s' is not an Arabic string" % label)
    if "unvocalized_label" not in kwargs:
        kwargs["unvocalized_label"] = araby.strip_tashkeel(label)
    return self.create_node(cluster_name, label, **kwargs)
def test_strip(self):
    # strip_harakat(text)
    assert Araby.strip_harakat(u"الْعَرَبِيّةُ") == u'العربيّة'
    # strip_lastharaka(text)
    assert Araby.strip_lastharaka(u"الْعَرَبِيّةُ") == u'الْعَرَبِيّة'
    # strip_tashkeel(text)
    assert Araby.strip_tashkeel(u"الْعَرَبِيّةُ") == u'العربية'
    # strip_tatweel(text)
    assert Araby.strip_tatweel(u"العـــــربية") == u'العربية'
    # strip_shadda(text)
    assert Araby.strip_shadda(u"الشّمسيّة") == u'الشمسية'
def normalize(word, wordtype = "affix"): """ Normalize the word, by unifoming hamzat, Alef madda, shadda, and lamalefs. @param word: given word. @type word: unicode. @param type: if the word is an affix @type type: unicode. @return: converted word. @rtype: unicode. """ # تحويل الكلمة إلى شكلها النظري. # الشكل اللإملائي للكلمة هو طريقة كتابتها حسب قواعد الإملاء # الشكل النظري هو الشكل المتخيل للكلمة دون تطبيق قواعد اللغة # ويخص عادة الأشكال المتعددة للهمزة، و التي تكتب همزة على السطر # أمثلة # إملائي نظري #إِمْلَائِي ءِمْلَاءِي #سَاَلَ سَءَلَ # الهدف : تحويل الكلمة إلى شكل نظري، #ومن ثم إمكانية تصريفها بعيدا عن قواعد الإملاء، #وبعد التصريف يتم تطبيق قواعد الإملاء من جديد. #الفرضية: الكلمات المدخلة مشكولة شكلا تاما. #الطريقة: # 1-تحويل جميع أنواع الهمزات إلى همزة على السطر # 1-فك الإدغام i = 0 # strip tatweel # the tatweel is used to uniformate the affix # when the Haraka is used separetely if wordtype != "affix": word = araby.strip_tatweel(word) ## تستبدل الألف الممدودة في , ل الكلمة بهمزة قطع بعدها همزة أخرى if word.startswith(ALEF_MADDA): word = normalize_alef_madda(word) # ignore harakat at the begin of the word len_word = len(word) while i < len_word and araby.is_shortharaka(word[i]): # in HARAKAT: i += 1 word = word[i:] # convert all Hamza from into one form word = araby.normalize_hamza(word) #Convert All LAM ALEF Ligature into separate letters word = word.replace(LAM_ALEF, SIMPLE_LAM_ALEF) word = word.replace(LAM_ALEF_HAMZA_ABOVE, SIMPLE_LAM_ALEF_HAMZA_ABOVE) word = word.replace(LAM_ALEF_MADDA_ABOVE, SIMPLE_LAM_ALEF_MADDA_ABOVE) return word
def preprocess(sentences, stopwords, isStopword=False):
    """
    Takes an array of complete Arabic sentences and performs the following
    operations on all of them:
    1.) strips tashkeel
    2.) strips harakat
    3.) strips lastharaka
    4.) strips tatweel
    5.) strips shadda
    6.) normalizes lam-alef ligatures
    7.) normalizes hamza
    8.) tokenizes
    Returns a 2D matrix where each row holds the normalized tokens of one sentence.
    """
    output = []
    for sentence in sentences:
        text = araby.strip_harakat(sentence)
        text = araby.strip_tashkeel(text)
        text = araby.strip_lastharaka(text)
        text = araby.strip_tatweel(text)
        text = araby.strip_shadda(text)
        text = araby.normalize_ligature(text)
        text = araby.normalize_hamza(text)
        text = clean_str(text)
        try:
            # pattern kept as in the original; note the stdlib re module does not
            # support \p{Latin} (that needs the third-party `regex` module)
            text = re.match(r'[^\\n\\s\\p{Latin}]+', text).group()
            tokens = araby.tokenize(text)
            if not isStopword:
                tokens = remove_stopwords(stopwords, tokens)
            tokens = [t for t in tokens if t != '\n']
            output.append(tokens)
        except Exception:
            # sentences that do not match are silently skipped
            pass
    return output
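# Compact standalone sketch of the eight-step pipeline described in the
# docstring above; STOPWORDS here is a tiny hypothetical list, not the real
# stop-word resource used by the snippet.
from pyarabic import araby

STOPWORDS = {u"في", u"من", u"على"}

def normalize_and_tokenize(sentence, stopwords=STOPWORDS):
    text = araby.strip_harakat(sentence)
    text = araby.strip_tashkeel(text)
    text = araby.strip_lastharaka(text)
    text = araby.strip_tatweel(text)
    text = araby.strip_shadda(text)
    text = araby.normalize_ligature(text)
    text = araby.normalize_hamza(text)
    return [t for t in araby.tokenize(text) if t not in stopwords]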
def soundex(sentence):
    '''
    - Blanks and spaces are deleted.
    - Long vowels are deleted.
    - If two adjacent letters are identical, only one of the two is kept.
    - Each letter of the word is mapped to two numbers:
      1. the code of its main category, an integer N on two bits, with N in {0, 1, 2};
      2. the code of its sub-category, an integer n on four bits, with n in {0, ..., 10}.
    Thus, given a word w = w1...wn, removing blanks and long vowels gives
    w' = w'1...w'k, and f(w) = f(w') = f(w'1...w'k) = N1 n1 ... Nk nk = X.
    The generated phonetic code X can be used as a hash key for classifying
    and indexing purposes.
    '''
    words = araby.tokenize(sentence)
    cleaned = []
    for word in words:
        # Python 3: tokens are already str; only decode if bytes slipped through
        if isinstance(word, bytes):
            word = word.decode('utf-8')
        word = araby.strip_tatweel(word)
        loc = 0
        for _ in word[:]:
            # drop long vowels (alef, yeh, waw) except in first position
            if loc < len(word) and loc != 0 and re.match("[\u0627\u064a\u0648]", word[loc]):
                word = word[:loc] + word[loc + 1:]
            # drop any remaining tatweel
            if loc < len(word) and re.match("[\u0640]", word[loc]):
                word = word[:loc] + word[loc + 1:]
            loc += 1
        cleaned.append(word)
    print(cleaned)
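# Illustrative sketch of the phonetic-code idea described in the docstring:
# drop long vowels and tatweel, collapse doubled letters, then map each
# remaining letter to a (category, sub-category) code. The CODES table below is
# a tiny hypothetical sample, not the full mapping from the original work.
from pyarabic import araby

LONG_VOWELS = u"اوي" + araby.TATWEEL
CODES = {u"ب": "01", u"ت": "02", u"ك": "10", u"د": "11", u"ف": "12", u"ن": "20", u"م": "21"}

def arabic_soundex(word):
    word = araby.strip_tashkeel(araby.strip_tatweel(word))
    word = "".join(c for c in word if c not in LONG_VOWELS)
    collapsed = []
    for c in word:
        if not collapsed or collapsed[-1] != c:  # keep one of two identical neighbours
            collapsed.append(c)
    return "".join(CODES.get(c, "??") for c in collapsed)

print(arabic_soundex(u"كتاب"))  # -> "100201" with the sample table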
for key, group in groupby(aa5irHarf)] print(freqOfAa5irHarf) import collections counter = collections.Counter(aa5irHarf) print(counter) # Counter({1: 4, 2: 4, 3: 2, 5: 2, 4: 1}) print(counter.values()) # [4, 4, 2, 1, 2] print(counter.keys()) # [1, 2, 3, 4, 5] print(counter.most_common(3)) # [(1, 4), (2, 4), (3, 2)] print(counter.most_common(1)) kkey = counter.most_common(1) #we should write to file or save it anywhere #and also we should generalize it to all poems for each poet #القافية :آخر ساكن وبدور عالساكن اللي قبله مع الحرف المتحرك اللي قبل الساكن ال ما قبل الاخير print('********** Al Qafiya ************') for line in f: line1 = araby.strip_tatweel(line) letters, hrkat = araby.separate(line1) #print(letters.encode('utf8')) for m in hrkat: #لازم نعمل تعديلات if not araby.is_tatweel(m): print(araby.name(m)) print(''.join(m)) #Most Common Words بنعملهم بكل قصائد الشاعر
if araby.is_weak(c): print ('weak'), if araby.is_moon(c): print ('moon'), if araby.is_sun(c):print ('sun'), print (araby.order(c)), print (); word=u"الْعَرَيِيّةُ" word_list=[ u"الْعَرَيِيّةُ", u"العربية", u"الْعَرَيِيّةُ الفصحى", u"غير مشكول", "Taha", u"سئل لأنه يؤم الإمام" ] word1=u"" for word in word_list: print (word) if araby.is_vocalized(word): print (' is vocalized') if araby.is_vocalizedtext(word): print (' is vocalized text') if araby.is_arabicword(word): print (' is valid word') else: print ("invalid arabic word") print (' strip harakat', araby.strip_harakat(word)) print (' strip tashkeel', araby.strip_tashkeel(word)) print (' strip tatweel',araby.strip_tatweel(word)) print (' normalize ligature ', araby.normalize_ligature(word)) print (' normalize hamza', araby.normalize_hamza(word)) if araby.vocalizedlike(word, word1): print ("vocalized_like") word1=word; if araby.vocalizedlike(u"العربية",u"العرَبية"): print ("vocalized_like")
tokens.extend(araby.tokenize(line)) for token in tokens: if len(token) > 1: soundex(token) """ with io.open("nouns.masdarv2.txt", encoding="utf-8") as doc: for line in doc: temptokens.extend(araby.tokenize(line)) for token in temptokens: if len(token) >= 2 and araby.is_arabicrange(token): token = token.replace("\u0627", '') token = token.replace('\u0621', '\u0627') token = token.replace('\u0621\u0621', '\u0627') token = token.replace("\u0648", '') token = token.replace("\u064a", '') if len(token) >= 2: tokens.append( araby.strip_tashkeel(araby.strip_tatweel(token))) del temptokens for token in tokens: soundex(token[::-1]) del tokens soundex(araby.MEEM + araby.DAL + araby.KAF) #sample word, backwards because it is processed LTR soundex(araby.BEH + araby.TEH + araby.KAF) #sample word, backwards because it is processed LTR soundex(araby.FEH + araby.TEH + araby.KAF) #sample word, backwards because it is processed LTR soundex(araby.FEH + araby.NOON + araby.ALEF_HAMZA_ABOVE ) #sample word, backwards because it is processed LTR with open("db.json", 'w') as database: database.write(json.dumps(word_db))
def removeTatweel(self, t): return strip_tatweel(t)
def word_tagging(self, word_list): """ Guess word classification, into verb, noun, stopwords. return al list of guessed tags Example: >>> import naftawayh.wordtag >>> word_list=(u'بالبلاد', u'بينما', u'أو', u'انسحاب', u'انعدام', u'انفجار', u'البرنامج', u'بانفعالاتها', u'العربي', u'الصرفي', u'التطرف', u'اقتصادي', ) >>> tagger = naftawayh.wordtag.WordTagger(); >>> # test all words >>> list_tags = tagger.word_tagging(word_list) >>> for word, tag in zip(word_list, list_tags): >>> print word, tag بالبلاد n بينما vn3 أو t انسحاب n انعدام n انفجار n البرنامج n بانفعالاتها n العربي n الصرفي n التطرف n اقتصادي n @param word_list: the given word lists. @type word_list: unicode list. @return: a tag list : 't': tool, 'v': verb, 'n' :noun, 'nv' or 'vn' unidentifed. @rtype: unicode list """ if len(word_list) == 0: return [] else: list_result = [] previous = u"" second_previous = u"" # the second previous #~ previous_tag = "" for word in word_list: word_nm = araby.strip_tashkeel(word) word_nm = araby.strip_tatweel(word_nm) tag = '' if self.cache.has_key(word): tag = self.cache.get(word, '') else: if self.is_stopword(word_nm): tag = 't' else: if self.is_noun(word): tag += 'n' if self.is_verb(word): tag += 'v' # add the found tag to Cache. self.cache[word] = tag # if the tagging give an ambigous tag, # we can do an contextual analysis # the contextual tag is not saved in Cache, # because it can be ambigous. # for example # في ضرب : is a noun # قد ضرب : is a verb if tag in ("", "vn", "nv"): tag = self.context_analyse(previous, word) + "3" if tag in ("", "1", "vn1", "nv1"): tag = self.context_analyse( u" ".join([second_previous, previous]), word) + "2" list_result.append(tag) second_previous = previous previous = word_nm #~ previous_tag = tag return list_result
def _preprocess_v2(self, text: str) -> str: text = str(text) text = html.unescape(text) if self.strip_tashkeel: text = araby.strip_tashkeel(text) if self.strip_tatweel: text = araby.strip_tatweel(text) if self.replace_urls_emails_mentions: # replace all possible URLs for reg in URL_REGEXES: text = re.sub(reg, " [رابط] ", text) # REplace Emails with [بريد] for reg in EMAIL_REGEXES: text = re.sub(reg, " [بريد] ", text) # replace mentions with [مستخدم] text = re.sub(USER_MENTION_REGEX, " [مستخدم] ", text) if self.remove_html_markup: # remove html line breaks text = re.sub("<br />", " ", text) # remove html markup text = re.sub("</?[^>]+>", " ", text) if self.map_hindi_numbers_to_arabic: text = text.translate(HINDI_TO_ARABIC_MAP) # remove repeated characters >2 if self.remove_non_digit_repetition: text = self._remove_non_digit_repetition(text) # insert whitespace before and after all non Arabic digits or English Digits and Alphabet and the 2 brackets if self.insert_white_spaces: text = re.sub( "([^0-9\u0621-\u063A\u0641-\u064A\u0660-\u0669a-zA-Z\[\]])", r" \1 ", text, ) # insert whitespace between words and numbers or numbers and words text = re.sub("(\d+)([\u0621-\u063A\u0641-\u064A\u0660-\u066C]+)", r" \1 \2 ", text) text = re.sub("([\u0621-\u063A\u0641-\u064A\u0660-\u066C]+)(\d+)", r" \1 \2 ", text) if self.replace_slash_with_dash: text = text.replace("/", "-") # remove unwanted characters text = re.sub(self.REJECTED_CHARS_REGEX, " ", text) # remove extra spaces text = " ".join(text.replace("\uFE0F", "").split()) if (self.model_name == "bert-base-arabertv2" or self.model_name == "bert-large-arabertv2"): if self.keep_emojis: new_text = [] for word in text.split(): if word in list(self.emoji.UNICODE_EMOJI["en"].keys()): new_text.append(word) else: new_text.append(self.farasa_segmenter.segment(word)) text = " ".join(new_text) else: text = self.farasa_segmenter.segment(text) return self._farasa_segment(text) # ALl the other models dont require Farasa Segmentation return text
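# Rough usage sketch, assuming this method belongs to the arabert package's
# ArabertPreprocessor (pip install arabert); the import path, model name and
# sample text follow the package README and should be treated as assumptions.
from arabert.preprocess import ArabertPreprocessor

prep = ArabertPreprocessor(model_name="aubmindlab/bert-base-arabertv2")
print(prep.preprocess(u"زيارة الموقع https://example.com غداً الساعة 10"))
# URLs become [رابط], tatweel and tashkeel are stripped, and digits get spacing.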
u"الْعَرَيِيّةُ الفصحى", u"غير مشكول", "Taha", ] word1 = u"" for word in word_list: print word.encode('utf8'), '\t', if araby.is_vocalized(word): print ' is vocalized', ## if araby.isArabicstring(word): print ' iisArabicstring', ## else:print ' invalid arabicstring', if araby.is_vocalizedtext(word): print ' is vocalized text', if araby.is_arabicword(word): print ' is valid word', else: print "invalid arabic word", print ' strip harakat', araby.strip_harakat(word).encode('utf8'), print ' strip tashkeel', araby.strip_tashkeel(word).encode('utf8'), print ' strip tatweel', araby.strip_tatweel(word).encode('utf8'), print ' normalize ligature ', araby.normalize_ligature(word).encode( 'utf8'), if araby.vocalizedlike(word, word1): print "vocalized_like", print word1 = word if araby.vocalizedlike(u"العربية", u"العرَبية"): print "vocalized_like", word = u"الْعَرَيِيّةُ" word_list = [ u"الْعَرَيِيّةُ", u"العربية", u"الْعَرَيِيّةُ الفصحى", u"غير مشكول", "Taha", ] word1 = u""
def strip_tatweel(text):
    reduced = araby.strip_tatweel(text)
    return reduced
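# Quick sanity check for the wrapper above; the expected value mirrors the
# strip_tatweel assertion in the test_strip example earlier in this section.
from pyarabic import araby
assert araby.strip_tatweel(u"العـــــربية") == u"العربية"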
def check_word(self, word, guessedtag = ""): """ Analyze one word morphologically as verbs @param word: the input word. @type word: unicode. @return: list of dictionaries of analyzed words with tags. @rtype: list. """ word = araby.strip_tatweel(word) word_vocalised = word word_nm = araby.strip_tashkeel(word) # get analysed details from cache if used if self.allow_cache_use and self.cache.isAlreadyChecked(word_nm): #~ print (u"'%s'"%word).encode('utf8'), 'found' resulted_data = self.cache.getChecked(word_nm) else: resulted_data = [] # if word is a pounctuation resulted_data += self.check_word_as_pounct(word_nm) # Done: if the word is a stop word we have some problems, # the stop word can also be another normal word (verb or noun), # we must consider it in future works # if word is stopword allow stop words analysis resulted_data += self.check_word_as_stopword(word_nm) #if word is verb # مشكلة بعض الكلمات المستبعدة تعتبر أفعلا أو اسماء #~if self.tagger.has_verb_tag(guessedtag) or \ #~self.tagger.is_stopword_tag(guessedtag): #~resulted_data += self.check_word_as_verb(word_nm) resulted_data += self.check_word_as_verb(word_nm) #print "is verb", rabti, len(resulted_data) #if word is noun #~if self.tagger.has_noun_tag(guessedtag) or \ #~self.tagger.is_stopword_tag(guessedtag): #~resulted_data += self.check_word_as_noun(word_nm) resulted_data += self.check_word_as_noun(word_nm) if len(resulted_data) == 0: print (u"1 _unknown %s-%s"%(word, word_nm)).encode('utf8') #check the word as unkonwn resulted_data += self.check_word_as_unknown(word_nm) #check if the word is nomralized and solution are equivalent resulted_data = check_normalized(word_vocalised, resulted_data) #check if the word is shadda like resulted_data = check_shadda(word_vocalised, resulted_data) # add word frequency information in tags resulted_data = self.add_word_frequency(resulted_data) # add the stemmed words details into Cache data_list_to_serialize = [w.__dict__ for w in resulted_data] if self.allow_cache_use: self.cache.addChecked(word_nm, data_list_to_serialize) #check if the word is vocalized like results if self.partial_vocalization_support: resulted_data = check_partial_vocalized(word_vocalised, resulted_data) if len(resulted_data) == 0: resulted_data.append(wordcase.WordCase({ 'word':word, 'affix': ('' , '', '', ''), 'stem':word, 'original':word, 'vocalized':word, 'semivocalized':word, 'tags':u'', 'type':'unknown', 'root':'', 'template':'', 'freq':self.wordfreq.get_freq(word, 'unknown'), 'syntax':'', }) ) return resulted_data
# remove stop words
with open('clean_tweet.txt', 'r') as inFile, open('removeStopWordFile.txt', 'w') as outFile:
    for line in inFile.readlines():
        # write only the words that are not stop words
        print(" ".join([word for word in line.split()
                        if word not in stopwords.words('arabic', 'UTF-8')]), file=outFile)
inFile.close()
outFile.close()

# remove tatweel and tashkeel
with open('removeStopWordFile.txt', 'r') as f_input, open('RemoveT&T.txt', 'w') as f_output:
    for line in f_input:
        st = line
        # e.g. مكــــــــــــــه becomes مكه once the tatweel is stripped
        x = araby.strip_tatweel(st)
        # remove the diacritics ( َ ً ُ ٌ )
        y = araby.strip_tashkeel(x)
        # write to file
        f_output.write(y)
f_input.close()
f_output.close()

# stemming the words
with open('RemoveT&T.txt', 'r') as f_input, open('steam.txt', 'w') as f_output:
    for line in f_input.read().split("\n"):
        sentence = u"" + line
        for word in sentence.split(" "):
            if word[-1:] == 'ة':
def check_word(self, word, guessedtag = ""): """ Analyze one word morphologically as verbs @param word: the input word. @type word: unicode. @return: list of dictionaries of analyzed words with tags. @rtype: list. """ word = araby.strip_tatweel(word) word_vocalised = word word_nm = araby.strip_tashkeel(word) # get analysed details from cache if used if self.allow_cache_use and self.cache.isAlreadyChecked(word_nm): #~ print (u"'%s'"%word).encode('utf8'), 'found' resulted_data = self.cache.getChecked(word_nm) else: resulted_data = [] # if word is a pounctuation resulted_data += self.check_word_as_pounct(word_nm) # Done: if the word is a stop word we have some problems, # the stop word can also be another normal word (verb or noun), # we must consider it in future works # if word is stopword allow stop words analysis resulted_data += self.check_word_as_stopword(word_nm) #if word is verb # مشكلة بعض الكلمات المستبعدة تعتبر أفعلا أو اسماء #~if self.tagger.has_verb_tag(guessedtag) or \ #~self.tagger.is_stopword_tag(guessedtag): #~resulted_data += self.check_word_as_verb(word_nm) resulted_data += self.check_word_as_verb(word_nm) #print "is verb", rabti, len(resulted_data) #if word is noun #~if self.tagger.has_noun_tag(guessedtag) or \ #~self.tagger.is_stopword_tag(guessedtag): #~resulted_data += self.check_word_as_noun(word_nm) resulted_data += self.check_word_as_noun(word_nm) if len(resulted_data) == 0: print (u"1 _unknown %s-%s"%(word, word_nm)).encode('utf8') #check the word as unkonwn resulted_data += self.check_word_as_unknown(word_nm) #check if the word is nomralized and solution are equivalent resulted_data = check_normalized(word_vocalised, resulted_data) #check if the word is shadda like resulted_data = check_shadda(word_vocalised, resulted_data) # add word frequency information in tags resulted_data = self.add_word_frequency(resulted_data) # add the stemmed words details into Cache data_list_to_serialize = [w.__dict__ for w in resulted_data] if self.allow_cache_use: self.cache.addChecked(word_nm, data_list_to_serialize) #check if the word is vocalized like results if self.partial_vocalization_support: resulted_data = check_partial_vocalized(word_vocalised, resulted_data) if len(resulted_data) == 0: resulted_data.append(wordcase.WordCase({ 'word':word, 'affix': ('' , '', '', ''), 'stem':'', 'original':word, 'vocalized':word, 'tags':u'', 'type':'unknown', 'root':'', 'template':'', 'freq':self.wordfreq.get_freq(word, 'unknown'), 'syntax':'', }) ) return resulted_data
u"الْعَرَيِيّةُ الفصحى", u"غير مشكول", "Taha", ] word1=u"" for word in word_list: print word.encode('utf8'),'\t', if araby.is_vocalized(word): print ' is vocalized', ## if araby.isArabicstring(word): print ' iisArabicstring', ## else:print ' invalid arabicstring', if araby.is_vocalizedtext(word): print ' is vocalized text', if araby.is_arabicword(word): print ' is valid word', else: print "invalid arabic word", print ' strip harakat', araby.strip_harakat(word).encode('utf8'), print ' strip tashkeel', araby.strip_tashkeel(word).encode('utf8'), print ' strip tatweel',araby.strip_tatweel(word).encode('utf8'), print ' normalize ligature ', araby.normalize_ligature(word).encode('utf8'), if araby.vocalizedlike(word, word1): print "vocalized_like", print; word1=word; if araby.vocalizedlike(u"العربية",u"العرَبية"): print "vocalized_like", word=u"الْعَرَيِيّةُ" word_list=[ u"الْعَرَيِيّةُ", u"العربية", u"الْعَرَيِيّةُ الفصحى", u"غير مشكول", "Taha", ] word1=u"" for word in word_list:
def strip_text(self, text): return araby.strip_tatweel(araby.strip_tashkeel(text))
def run_diac(gomla, dialect): sos = 'بدايةجملة' if dialect == 'ca' else 'بداية' eos = 'نهايةجملة' if dialect == 'ca' else 'نهاية' token_list_7 = LastNTokens(7, sos) fname = randint(0, 100000) with codecs.open(f'diacritizer/userdata/{dialect}/{fname}.fmt', mode='w', encoding='utf-8') as infile: gomla = strip_tatweel(araby.normalize_ligature(gomla)) # gomla_list = araby.tokenize(gomla.replace('_', '-'), conditions=araby.is_arabicrange, morphs=araby.strip_tashkeel) gomla_list = araby.tokenize(gomla.replace('_', '-'), morphs=araby.strip_tashkeel) # gomla_list = gomla.strip().split() for token in gomla_list: t = ' '.join(token) token_list_7.add_tokens_list(t, 0) infile.write(token_list_7.get_n_tokens() + '\n') else: token_list_7.add_tokens_list(eos, 0) infile.write(token_list_7.get_n_tokens() + '\n') token_list_7.add_tokens_list(eos, 0) infile.write(token_list_7.get_n_tokens() + '\n') token_list_7.add_tokens_list(eos, 0) infile.write(token_list_7.get_n_tokens() + '\n') token_list_7.add_tokens_list(eos, 0) infile.write(token_list_7.get_n_tokens() + '\n') token_list_7.add_tokens_list(eos, 0) infile.write(token_list_7.get_n_tokens() + '\n') token_list_7.add_tokens_list(eos, 0) infile.write(token_list_7.get_n_tokens() + '\n') if dialect == 'ca': ca_runner.infer( f"diacritizer/userdata/ca/{fname}.fmt", predictions_file=f"diacritizer/userdata/ca/{fname}.rlt", checkpoint_path=None, log_time=False) elif dialect == 'msa': msa_runner.infer( f"diacritizer/userdata/msa/{fname}.fmt", predictions_file=f"diacritizer/userdata/msa/{fname}.rlt", checkpoint_path=None, log_time=False) elif dialect == 'tun': tn_runner.infer( f"diacritizer/userdata/tun/{fname}.fmt", predictions_file=f"diacritizer/userdata/tun/{fname}.rlt", checkpoint_path=None, log_time=False) elif dialect == 'mor': ma_runner.infer( f"diacritizer/userdata/mor/{fname}.fmt", predictions_file=f"diacritizer/userdata/mor/{fname}.rlt", checkpoint_path=None, log_time=False) with codecs.open(f'diacritizer/userdata/{dialect}/{fname}.rlt', mode='r', encoding='utf-8') as outfile: diacritized_tokens = list() counters = defaultdict(Counter) for i, line in enumerate(outfile): dtokens = line.strip().split(' _ ') # print(len(dtokens), dtokens) for j, _ in enumerate(dtokens): tk = dtokens[j - 1 - i % 7] if tk not in [eos, sos]: counters[j].update([tk]) if sum(counters[j].values()) >= 7: diacritized_tokens.append( counters[j].most_common(1)[0][0].replace(' ', '')) counters[j].clear() else: return ' '.join(diacritized_tokens)
def preprocess(self, text): """ Preprocess takes an input text line an applies the same preprocessing used in AraBERT pretraining Args: text (:obj:`str`): inout text string Returns: string: A preprocessed string depending on which model was selected """ if self.model_name == "bert-base-arabert": return self._old_preprocess( text, do_farasa_tokenization=True, ) if self.model_name == "bert-base-arabertv01": return self._old_preprocess(text, do_farasa_tokenization=False) text = str(text) text = html.unescape(text) if self.strip_tashkeel: text = araby.strip_tashkeel(text) if self.strip_tatweel: text = araby.strip_tatweel(text) if self.replace_urls_emails_mentions: # replace all possible URLs for reg in url_regexes: text = re.sub(reg, " [رابط] ", text) # REplace Emails with [بريد] for reg in email_regexes: text = re.sub(reg, " [بريد] ", text) # replace mentions with [مستخدم] text = re.sub(user_mention_regex, " [مستخدم] ", text) if self.remove_html_markup: # remove html line breaks text = re.sub("<br />", " ", text) # remove html markup text = re.sub("</?[^>]+>", " ", text) # remove repeated characters >2 if self.remove_elongation: text = self._remove_elongation(text) # insert whitespace before and after all non Arabic digits or English Digits and Alphabet and the 2 brackets if self.insert_white_spaces: text = re.sub( "([^0-9\u0621-\u063A\u0641-\u064A\u0660-\u0669a-zA-Z\[\]])", r" \1 ", text, ) # insert whitespace between words and numbers or numbers and words text = re.sub("(\d+)([\u0621-\u063A\u0641-\u064A\u0660-\u066C]+)", r" \1 \2 ", text) text = re.sub("([\u0621-\u063A\u0641-\u064A\u0660-\u066C]+)(\d+)", r" \1 \2 ", text) # remove unwanted characters if self.keep_emojis: emoji_regex = "".join(list(self.emoji.UNICODE_EMOJI["en"].keys())) rejected_chars_regex2 = "[^%s%s]" % (chars_regex, emoji_regex) text = re.sub(rejected_chars_regex2, " ", text) else: text = re.sub(rejected_chars_regex, " ", text) # remove extra spaces text = " ".join(text.replace("\uFE0F", "").split()) if (self.model_name == "bert-base-arabertv2" or self.model_name == "bert-large-arabertv2"): if self.keep_emojis: new_text = [] for word in text.split(): if word in list(self.emoji.UNICODE_EMOJI["en"].keys()): new_text.append(word) else: new_text.append(self.farasa_segmenter.segment(word)) text = " ".join(new_text) else: text = self.farasa_segmenter.segment(text) return self._farasa_segment(text) # ALl the other models dont require Farasa Segmentation return text
def from_url(url): """ Initialize a Question (written or oral) from its public URL e.g.: https://www.chambredesrepresentants.ma/ar/%D9%85%D8%B1%D8%A7%D9%82%D8%A8%D8%A9-%D8%A7%D9%84%D8%B9%D9%85%D9%84-%D8%A7%D9%84%D8%AD%D9%83%D9%88%D9%85%D9%8A/%D8%A7%D9%84%D8%A3%D8%B3%D9%80%D8%A6%D9%84%D8%A9-%D8%A7%D9%84%D9%83%D8%AA%D8%A7%D8%A8%D9%8A%D8%A9/%D8%A7%D9%86%D8%AA%D8%B4%D8%A7%D8%B1-%D8%A7%D9%84%D8%AD%D8%B4%D8%B1%D8%A9-%D8%A7%D9%84%D9%82%D8%B1%D9%85%D8%B2%D9%8A%D8%A9-%D8%A8%D9%86%D8%A8%D8%A7%D8%AA-%D8%A7%D9%84%D8%B5%D8%A8%D8%A7%D8%B1 Note: only the arabic version is currently supported.""" r = requests.get(url) s = BeautifulSoup(r.text, 'html.parser') try: # Note : strip_tatweel below is mandatory, it took me several hours of scratching title = araby.strip_tatweel( s.find_all( 'h1', class_='section-title title-global lang_ar')[0].text) title.lstrip().rstrip() qtype = 'undefined' if arabic_string_eq(title, "الأسئلة الكتابية"): qtype = 'written' elif arabic_string_eq(title, "الأسئلة الشفوية"): qtype = 'oral' spans = s.find_all(class_='q-b1-1')[0].find_all('span') status = 'unknown' if 'q-st-red' in spans[4]['class']: status = 'unanswered' elif 'q-st-green' in spans[4]['class']: status = 'answered' if status == 'unanswered': content = s.find_all(class_='q-block1-na')[0] elif status == 'answered': content = s.find_all(class_='q-block1')[0] else: raise ValueError("Malformed HTML") qb11 = content.find_all(class_='q-b1-1')[0].find_all('span') # XXX How do we handle RE faulty cases ? p = re.compile('رقم السؤال : ([0-9]+)') res = p.match(qb11[0].text.lstrip().rstrip()) id = int(res.group(1)) p = re.compile('الموضوع : (.*)') res = p.match(qb11[1].text.lstrip().rstrip()) topic = res.group(1) answer_date_raw = '' if arabic_string_eq(qb11[2].text.split(':')[0].lstrip().rstrip(), 'تاريخ الجواب'): answer_date_raw = qb11[3].text.lstrip().rstrip() answer_date = format_raw_date_to_isoformat(answer_date_raw) qb12 = content.find_all(class_='q-b1-2 row')[0] team = qb12.find_all(class_='col-md-5')[0].find_all('a')[0].text questioners = [] for q in qb12.find_all(class_='col-md-7')[0].find_all( class_='q-b1-2-s-item'): questioners.append( q.find_all( class_='q-name')[0].find('a').text.lstrip().rstrip()) qb13 = content.find_all(class_='q-b1-3')[0] offset = 0 designated_ministry = '' # XXX : are there cases where there are multiple ministries involved? 
if arabic_string_eq( qb13.find_all('div')[0].text.split(':') [0].lstrip().rstrip(), 'الوزارة المختصة'): designated_ministry = qb13.find_all('div')[0].text.split( ':')[1].replace('\n', '').lstrip().rstrip() offset = 1 question_date_raw = '' question_text = '' if qtype == "written": if arabic_string_eq( qb13.find_all('div')[offset].text.split(':') [0].lstrip().rstrip(), 'تاريخ السؤال'): question_date_raw = qb13.find_all( 'div')[offset].text.split(':')[1].replace( '\n', '').lstrip().rstrip() if arabic_string_eq( qb13.find_all('div')[offset + 1].text.split(':') [0].lstrip().rstrip(), 'السؤال'): question_text = qb13.find_all('div')[offset + 1].find( 'p').text.lstrip().rstrip() elif qtype == "oral": # TODO: check if there are cases where the date is given if arabic_string_eq( qb13.find_all('div')[offset].text.split(':') [0].lstrip().rstrip(), 'السؤال'): question_text = qb13.find_all('div')[offset].find( 'p').text.lstrip().rstrip() question_date = format_raw_date_to_isoformat(question_date_raw) answer_doc = '' answer_content = s.find_all(class_='q-block2') if answer_content != []: answer_doc = answer_content[0].find_all('a', href=True)[0]['href'] q = Question(id, url, qtype, topic, question_date, question_text, questioners, status, answer_date, answer_doc, designated_ministry) return q except Exception as e: print('There was an exception: %s' % e) return None
u"الْعَرَيِيّةُ", u"العربية", u"الْعَرَيِيّةُ الفصحى", u"غير مشكول", "Taha", ] word1 = u"" for word in word_list: print(word, '\t', end=" ") if araby.is_vocalized(word): print(' is vocalized', end=" ") if araby.is_vocalizedtext(word): print(' is vocalized text', end=" ") if araby.is_arabicword(word): print(' is valid word', end=" ") else: print("invalid arabic word", end=" ") print(' strip harakat', araby.strip_harakat(word), end=" ") print(' strip tashkeel', araby.strip_tashkeel(word), end=" ") print(' strip tatweel', araby.strip_tatweel(word), end=" ") print(' normalize ligature ', araby.normalize_ligature(word), end=" ") if araby.vocalizedlike(word, word1): print("vocalized_like", end=" ") print() word1 = word if araby.vocalizedlike(u"العربية", u"العرَبية"): print("vocalized_like", end=" ") word = u"الْعَرَيِيّةُ" word_list = [ u"الْعَرَيِيّةُ", u"العربية", u"الْعَرَيِيّةُ الفصحى", u"غير مشكول", "Taha", ] word1 = u""
def strip_text(text):
    # removes arabic vowels and stylistic spacing
    return araby.strip_tatweel(araby.strip_tashkeel(text))