def test_seperate(self):
    """Verify that separate() and joint() are inverses of each other."""
    letters = u"العربية"
    marks = u'\u064e\u0652\u064e\u064e\u064e\u064e\u064f'
    word = u"اَلْعَرَبَيَةُ"
    # split the vocalized word, then rebuild it
    bare, harakat = ar.separate(word)
    self.assertEqual(ar.joint(bare, harakat), word)
    # the round trip in the other direction must also be lossless
    self.assertEqual(ar.separate(ar.joint(letters, marks)), (letters, marks))
def test_seperate(self):
    """Round-trip test: joint(separate(w)) == w and separate(joint(l, m)) == (l, m)."""
    letters = u"العربية"
    marks = u'\u064e\u0652\u064e\u064e\u064e\u064e\u064f'
    word = u"اَلْعَرَبَيَةُ"
    word_letters, word_marks = ar.separate(word)
    rebuilt = ar.joint(word_letters, word_marks)
    self.assertEqual(rebuilt, word)
    joined = ar.joint(letters, marks)
    self.assertEqual(ar.separate(joined), (letters, marks))
def convert_to_two_hot(word):
    """Encode *word* as concatenated letter / diacritic one-hot matrices.

    Returns a matrix whose columns are the letter one-hot block followed
    by the diacritic one-hot block, concatenated along axis 1.
    """
    # A lone diacritic has no letter part: pair an all-zero letter block
    # (1 x 37) with the diacritic's own encoding.
    if len(word) == 1 and word[0] in diacritics:
        letter_block = np.zeros((1, 37))
        diacritic_block = convert_word_to_one_hot(word, is_letter=False)
        return np.concatenate([letter_block, diacritic_block], axis=1)

    bare_letters, bare_marks = separate(word)
    # separate() marks an undiacritized slot with tatweel; substitute sukun
    # so every slot maps to a known diacritic class.
    bare_marks = bare_marks.replace("ـ", "ْ")
    letter_block = convert_word_to_one_hot(bare_letters, is_letter=True)
    diacritic_block = convert_word_to_one_hot(bare_marks, is_letter=False)
    return np.concatenate([letter_block, diacritic_block], axis=1)
def encode_tashkeel(word, method="ascii"):
    """
    Encode word marks into decimal or Ascii string to be saved as integer

    Example:
        >>> import pyarabic.trans
        >>> word1 = u"هَارِبًا"
        >>> pyarabic.trans.encode_tashkeel(word1)
        ('هاربا', 'a0iA0')
        >>> pyarabic.trans.encode_tashkeel(word1, "decimal")
        ('هاربا', 40610)
        >>> letters = u"هاربا"
        >>> encoded_marks = u"a0iA0"
        >>> pyarabic.trans.decode_tashkeel(letters, encoded_marks)
        'هَارِبًا'
        >>> letters = u"هاربا"
        >>> encoded_marks = 40610
        >>> pyarabic.trans.decode_tashkeel(letters, encoded_marks, "decimal")
        'هَارِبًا'

    @input word: diacritized arabic diacritcs
    @type word: unicode
    @param method: "decimal" or "ascii" (default); any other value falls
        back to the ascii table
    @return: (letters, encoded); (word, "") if decimal conversion fails
    @rtype: (letters, encoded) string / integer
    """
    # split the word into bare letters and their diacritic marks
    letters, marks = ar.separate(word)
    if method == "decimal":
        transed = translate(marks, T2D_TRANS)
        # the decimal encoding is only useful as an integer; marks with no
        # decimal mapping leave non-digit characters behind
        try:
            transed = int(transed)
        except (TypeError, ValueError):
            return word, ""
    else:
        # "ascii" and any unrecognized method use the ascii translation table
        transed = translate(marks, T2A_TRANS)
    return letters, transed
def prepare_dataset(self):
    """Read ``self.file`` line by line and build, for each line, tensors of
    letter ids, diacritic ids and (undiacritized) word ids.

    Returns:
        dict: line index -> (letter_ids, diacritic_ids, word_ids) as
        ``torch.long`` tensors with ``requires_grad=False``.
    """
    data = {}
    counter = 0
    with open(self.file, encoding='utf8') as file:
        for line in file:
            letter_ids = []
            diacritic_ids = []
            word_ids = []
            # split the line into bare letters and their diacritic marks
            letters, diacritics = araby.separate(line)
            # drop the trailing element of each sequence (the line's
            # newline position)
            letters = letters[0:-1]
            words = araby.tokenize(line)[0:-1]
            diacritics = diacritics[0:-1]
            for letter in letters:
                # skip newline and right-to-left mark characters
                if (letter == '\n') or (letter == '\u200f'):
                    continue
                letter_ids.append(self.letter_to_id[letter])
            # NOTE(review): letters skipped above are still indexed in the
            # loop below, so letter_ids and diacritic_ids can differ in
            # length whenever '\n' or '\u200f' occurs mid-line — confirm
            # this is intended.
            for index, diacritic in enumerate(diacritics):
                if letters[index] == " ":
                    # spaces get a dedicated 'space' diacritic class
                    diacritic_ids.append(self.diacritic_to_id['space'])
                else:
                    diacritic_ids.append(self.diacritic_to_id[diacritic])
            for word in words:
                # word vocabulary is keyed on undiacritized forms
                word_ids.append(
                    self.word_to_id[araby.strip_tashkeel(word)])
            instance = (torch.tensor(letter_ids, dtype=torch.long,
                                     requires_grad=False),
                        torch.tensor(diacritic_ids, dtype=torch.long,
                                     requires_grad=False),
                        torch.tensor(word_ids, dtype=torch.long,
                                     requires_grad=False))
            data[counter] = instance
            counter += 1
    return data
def encode_tashkeel(word, method="ascii"):
    """
    encode word marks into decimal or ascii string to be saved as integer

    @input word: diacritized arabic diacritcs
    @type word: unicode
    @param method: "decimal" or "ascii" (default); any other value falls
        back to the ascii table
    @return: (letters, encoded); (word, "") if decimal conversion fails
    @rtype: (letters, encoded) string / integer
    """
    # split the word into bare letters and their diacritic marks
    letters, marks = ar.separate(word)
    if method == "decimal":
        transed = translate(marks, T2D_TRANS)
        # decimal encoding must be an integer; marks with no decimal
        # mapping leave non-digit characters behind
        try:
            transed = int(transed)
        except (TypeError, ValueError):
            return word, ""
    else:
        # "ascii" and any unrecognized method use the ascii translation table
        transed = translate(marks, T2A_TRANS)
    return letters, transed
def verify_tashkeel(word):
    """Return True when separating and re-joining *word* is lossless."""
    bare, harakat = ar.separate(word)
    return ar.joint(bare, harakat) == word
# Report, for each pattern length, the number of distinct pattern keys and
# the average number of vocalized forms stored per pattern (Python 2 code:
# bare print statements and dict.has_key below).
for ln in statTable.keys():
    partialPatternCount = 0
    for patternkey in statTable[ln].keys():
        partialPatternCount += len(statTable[ln][patternkey].keys())
    # guard against empty buckets to avoid division by zero
    if statTable[ln].keys():
        average = partialPatternCount / len(statTable[ln].keys())
    else:
        average = 0
    print "\t".join([str(ln), str(len(statTable[ln].keys())), str(average), "pw"])
# test vocalize a word
text = u"يأكل الولد التفاح بالعشاء "
words = araby.tokenize(text)
for word in words:
    # look up the word's harakat pattern in the statistics table by length
    patternKey = harakatpattern.extractPattern(word)
    ln = len(patternKey)
    if statTable.has_key(ln) and statTable[ln].has_key(patternKey):
        print u"\t".join(statTable[ln][patternKey].keys()).encode("utf8")
        for vocalizedPattern in statTable[ln][patternKey].keys():
            # vocalizedPattern2=araby.stripShadda(vocalizedPattern)
            # letters,harakat = araby.separate(vocalizedPattern2)
            # vocalizedForm =araby.joint(word,harakat)
            # separate(..., True) also returns shadda positions
            letters, harakat, ShaddaPlaces = araby.separate(vocalizedPattern, True)
            # re-apply shadda marks first, then the harakat, onto the word
            newWord_nm = araby.joint(word, ShaddaPlaces)
            vocWord = araby.joint(newWord_nm, harakat)
            print u"\t".join([word, patternKey, vocalizedPattern, harakat, vocWord]).encode("utf8")
    else:
        print patternKey.encode("utf8"), "pattern non found"
# print wordCount/patternCount;
for key, group in groupby(aa5irHarf)]
# NOTE(review): the line above is the tail of a comprehension whose opening
# (and the assignment it feeds, presumably freqOfAa5irHarf) lies outside
# this chunk — kept verbatim.
print(freqOfAa5irHarf)
import collections
# Counter gives the frequency of each last letter directly
counter = collections.Counter(aa5irHarf)
print(counter)  # Counter({1: 4, 2: 4, 3: 2, 5: 2, 4: 1})
print(counter.values())  # [4, 4, 2, 1, 2]
print(counter.keys())  # [1, 2, 3, 4, 5]
print(counter.most_common(3))  # [(1, 4), (2, 4), (3, 2)]
print(counter.most_common(1))
kkey = counter.most_common(1)
# we should write to file or save it anywhere
# and also we should generalize it to all poems for each poet
# The qafiya (rhyme): the last unvowelized letter, going back to the
# previous unvowelized letter together with the vowelized letter that
# precedes the second-to-last unvowelized one.
print('********** Al Qafiya ************')
for line in f:
    # normalize: remove tatweel before separating letters from marks
    line1 = araby.strip_tatweel(line)
    letters, hrkat = araby.separate(line1)
    # print(letters.encode('utf8'))
    for m in hrkat:
        # we still need to make adjustments here
        if not araby.is_tatweel(m):
            print(araby.name(m))
            print(''.join(m))
# Most common words: to be computed over all of the poet's poems
# NOTE(review): this chunk starts mid-statement — the `if` guarding the two
# `average` assignments below lies outside the visible span (kept verbatim).
        average = partialPatternCount / len(statTable[ln].keys())
    else:
        average = 0
    # Python 2 print statement: length, distinct patterns, average forms
    print "\t".join(
        [str(ln), str(len(statTable[ln].keys())), str(average), 'pw'])
# test vocalize a word
text = u"يأكل الولد التفاح بالعشاء "
words = araby.tokenize(text)
for word in words:
    # look up the word's harakat pattern in the statistics table by length
    patternKey = harakatpattern.extractPattern(word)
    ln = len(patternKey)
    if statTable.has_key(ln) and statTable[ln].has_key(patternKey):
        print u"\t".join(statTable[ln][patternKey].keys()).encode('utf8')
        for vocalizedPattern in statTable[ln][patternKey].keys():
            # vocalizedPattern2=araby.stripShadda(vocalizedPattern)
            # letters,harakat = araby.separate(vocalizedPattern2)
            # vocalizedForm =araby.joint(word,harakat)
            # separate(..., True) also returns shadda positions
            letters, harakat, ShaddaPlaces = araby.separate(
                vocalizedPattern, True)
            # re-apply shadda marks first, then the harakat, onto the word
            newWord_nm = araby.joint(word, ShaddaPlaces)
            vocWord = araby.joint(newWord_nm, harakat)
            print u"\t".join(
                [word, patternKey, vocalizedPattern, harakat,
                 vocWord]).encode('utf8')
    else:
        print patternKey.encode('utf8'), "pattern non found"
# print wordCount/patternCount;
def get_tashkeel_binary(ayah):
    '''Build a binary (0/1) tashkeel pattern for an ayah or token.

    Each character maps to 1 when it carries a haraka, shadda or tanwin
    and to 0 when it carries a sukun or no mark at all.  A space that
    carries no mark is kept as a space, so tokens stay separated in the
    output pattern.

    Args:
        param1 (str): a string or list

    Returns:
        tuple: (pattern string of zeros/ones/spaces, list of marks)
    '''
    marksDictionary = {
        'ْ': 0,
        '': 0,
        'ُ': 1,
        'َ': 1,
        'ِ': 1,
        'ّ': 1,
        'ٌ': 1,
        'ً': 1,
        'ٍ': 1
    }
    noMark = ''
    # normalize the input: join (handles list input), trim, drop tatweel,
    # and unpack alef mad before separating marks
    text = ''.join(ayah.strip())
    if tatweel in text:
        text = strip_tatweel(text)
    if alef_mad in text:
        text = unpack_alef_mad(text)
    # split the letters from their diacritic marks
    bareText, rawMarks = separate(text)
    marksList = []
    bitList = []
    for mark in rawMarks:
        # pyarabic reports an unmarked slot with 'ـ' (tatweel); record it
        # as "no mark", which the dictionary maps to 0
        normalized = noMark if mark == 'ـ' else mark
        marksList.append(normalized)
        bitList.append(marksDictionary[normalized])
    # render the bits; an unmarked space stays a space so that tokens in
    # the pattern remain separated
    pattern = ''
    for pos in range(0, len(bareText)):
        if bareText[pos] == ' ' and bitList[pos] == 0:
            pattern += ' '
        else:
            pattern += str(bitList[pos])
    return pattern, marksList
def prepare_dataset(self):
    """Parse ``self.file`` into per-line training tensors.

    For each line builds: letter ids, diacritic ids (shaddah merged into
    the preceding letter's diacritic), shaddah-free diacritic ids, a 0/1
    shaddah flag per diacritic, and word ids for both the diacritized and
    undiacritized vocabularies.

    Returns:
        dict: line index -> 6-tuple of ``torch.long`` tensors.
    """
    data = {}
    counter = 0
    with open(self.file, encoding='utf-8') as file:
        for line in file:
            letter_ids = []
            diacritic_ids = []
            word_ids_nodiacs = []
            word_ids_diacs = []
            # split the line into bare letters and their diacritic marks
            letters, diacritics = araby.separate(line)
            # drop the trailing element (the line's newline position)
            letters = letters[0:-1]
            words = araby.tokenize(line)[0:-1]
            diacritics = diacritics[0:-1]
            diacritic_ids_nosh = []
            index = 0
            shaddahs = []
            for letter in letters:
                # skip newline and right-to-left mark characters
                # NOTE(review): `continue` here also skips the index
                # increment at the bottom of the loop, desyncing letters
                # and diacritics afterwards — confirm intended.
                if (letter == '\n') or (letter == '\u200f'):
                    continue
                if (letter == 'ّ'):
                    # shaddah: fold it into the previous letter's diacritic
                    # id, combined with its own mark when one is present
                    # ('ـ' means no mark — see araby.separate)
                    if diacritics[index] != 'ـ':
                        diacritic_ids[-1] = self.diacritic_to_id[
                            letter + diacritics[index]]
                    else:
                        diacritic_ids[-1] = self.diacritic_to_id[letter]
                    # shaddah-free stream keeps only the accompanying mark
                    diacritic_ids_nosh[-1] = self.diacritic_to_id_nosh[
                        diacritics[index]]
                else:
                    letter_ids.append(self.letter_to_id[letter])
                    if letter == " ":
                        # spaces get a dedicated 'space' diacritic class
                        diacritic_ids.append(self.diacritic_to_id['space'])
                        diacritic_ids_nosh.append(
                            self.diacritic_to_id_nosh['space'])
                    else:
                        diacritic_ids.append(
                            self.diacritic_to_id[diacritics[index]])
                        diacritic_ids_nosh.append(
                            self.diacritic_to_id_nosh[diacritics[index]])
                index += 1
            # binary shaddah flag per (merged) diacritic id
            for diacritic_id in diacritic_ids:
                if 'ّ' in self.id_to_diacritic[diacritic_id]:
                    shaddahs.append(1)
                else:
                    shaddahs.append(0)
            for word in words:
                # ids in both the diacritized and bare word vocabularies
                word_ids_diacs.append(self.word_to_id_diacs[word])
                word_ids_nodiacs.append(
                    self.word_to_id_nodiacs[araby.strip_tashkeel(word)])
            instance = (torch.tensor(letter_ids, dtype=torch.long,
                                     requires_grad=False),
                        torch.tensor(diacritic_ids, dtype=torch.long,
                                     requires_grad=False),
                        torch.tensor(diacritic_ids_nosh, dtype=torch.long,
                                     requires_grad=False),
                        torch.tensor(shaddahs, dtype=torch.long,
                                     requires_grad=False),
                        torch.tensor(word_ids_diacs, dtype=torch.long,
                                     requires_grad=False),
                        torch.tensor(word_ids_nodiacs, dtype=torch.long,
                                     requires_grad=False))
            data[counter] = instance
            counter += 1
    return data