def translit_string(script_src, script_trg, line, perm):
    """Transliterate *line* from script_src to script_trg.

    Supported pairs: latin<->ar (Buckwalter), ka/ru/el->latin,
    latin->ru, ja->latin (Hepburn via pykakasi), and the special
    target "permute" which applies the character map *perm*.

    Raises:
        Exception: for any unsupported script pair.
        AssertionError: if script_trg is "permute" and perm is None.
    """
    if script_src == "latin" and script_trg == "ar":
        new_line = buckwalter.untransliterate(line)
    elif script_trg == "latin" and script_src == "ar":
        new_line = buckwalter.transliterate(line)
    # ka is Georgian. These three sources shared byte-identical branches
    # in the original; collapsed into a single membership test.
    elif script_src in ("ka", "ru", "el") and script_trg == "latin":
        new_line = translit(line, script_src, reversed=True)
    elif script_trg == "ru" and script_src == "latin":
        new_line = translit(line, script_trg)
    elif script_trg == "latin" and script_src == "ja":
        kks = pykakasi.kakasi()
        new_line = " ".join([word["hepburn"] for word in kks.convert(line)])
    elif script_trg == "permute":
        # assert script_src == "latin"
        # Character-level substitution; unmapped characters pass through.
        assert perm is not None
        new_line = "".join([perm.get(c, c) for c in line])
    else:
        raise Exception(f"not supported {script_src}-->{script_trg}")
    return new_line
def buckwalter_to_arabic(buckwalter_sentence):
    """Convert a Buckwalter-romanized sentence back to Arabic script."""
    arabic_sentence = buckwalter.untransliterate(buckwalter_sentence)
    return arabic_sentence
# NOTE(review): fragment — these string literals are the tail of an
# emoji_pattern = re.compile(...) character class whose opening bracket
# lies before this chunk; they concatenate implicitly inside that call.
u"\U00002702-\U000027B0"
u"\U000024C2-\U0001F251"
u"\U0001f926-\U0001f937"
u'\U00010000-\U0010ffff'
u"\u200d"
u"\u2640-\u2642"
u"\u2600-\u2B55"
u"\u23cf"
u"\u23e9"
u"\u231a"
u"\u3030"
u"\ufe0f"
"]+", flags=re.UNICODE)
# Cleaning pipeline for a single line of text ("ligne").
ligne = re.sub(emoji_pattern, '', ligne)  # strip emoji
ligne = buckwalter.untransliterate(ligne)  # transliterate the Arabizi
ligne = re.sub(
    r"(http|https|ftp)://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+",
    "", ligne)  # remove URLs
ligne = re.sub(r"[أ-ي]#", "", ligne)
ligne = re.sub(r"@[^\s]+", "", ligne)  # remove hashtags and user names
ligne = re.sub(r"#[أ-ي]+", "", ligne)
ligne = re.sub('[%s]' % re.escape(string.punctuation), '', ligne)  # strip punctuation
ligne = re.sub(
    r"(\d|[\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669])+",
    '', ligne)  # remove numbers (ASCII and Arabic-Indic digits)
ligne = re.sub(
    r'[^\s\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062A\u062B\u062C\u062D\u062E\u062F\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063A\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064A]',
    '', ligne)  # remove non-Arabic words
ligne = re.sub(r'\s+', ' ', ligne)  # collapse repeated whitespace
def post(self):
    """Handle a FindSurah POST: given "surahNumber" in the request JSON,
    return the surah's Arabic text, its word-by-word English meaning, the
    number of sub-word documents, and the total ayat count.
    """
    # STEP 1: GET POSTED DATA AND CHECK THE DATA
    postedData = request.get_json()
    statusCode = checkPostedData(postedData, "FindSurah")
    if statusCode != 200:
        response = {
            "responseMessage": "An Error Happened",
            "responseCode": statusCode
        }
        return jsonify(response)

    surahNumber = str(postedData["surahNumber"])
    # _id embeds a "(<surah>:<ayat>:<word>" location; match this surah only.
    # e.g. for surah 112 the pattern is ".*\(112\:.*"
    corpusCondition = ".*\\(" + surahNumber + "\\:.*"
    surahFromDB = corpusAQ.find({"_id": {"$regex": corpusCondition}})
    subWordSurah = corpusAQ.find({
        "_id": {
            "$regex": corpusCondition
        }
    }).count()

    lastWord = "0"
    currentWord = "0"
    surah = ""
    currentAyat = "0"
    totalAyat = 0
    lastAyat = "0"
    for doc in surahFromDB:
        splittedLocation = str(doc["_id"]).split(":")
        currentWord = splittedLocation[2]
        if currentWord != lastWord:
            # BUG FIX: the original tested `lastWord == 0` (int) against the
            # string sentinel "0", which is always False — the very first
            # word was therefore prefixed with a stray space.
            if lastWord == "0":
                surah = surah + str(doc["buckwalter"])
            else:
                surah = surah + " " + str(doc["buckwalter"])
            lastWord = currentWord
        else:
            # Same word index: sub-word fragment, concatenate without space.
            surah = surah + str(doc["buckwalter"])
        # Count each distinct ayat index exactly once.
        currentAyat = splittedLocation[1]
        if currentAyat != lastAyat:
            totalAyat = totalAyat + 1
            lastAyat = currentAyat

    arabicSurah = buckwalter.untransliterate(surah)

    # Collect the per-word translations, space-separated.
    meaningFromDB = wordbyword.find({"suratnumber": int(surahNumber)})
    meaningResponse = " ".join(
        str(doc["translation"]) for doc in meaningFromDB)

    response = {
        "responseCode": 200,
        "responseMessageArabic": arabicSurah,
        "responseMessageMeaning": meaningResponse,
        "responseMessageSubWordSurah": subWordSurah,
        "totalAyat": totalAyat
    }
    return jsonify(response)
# Lay out the output directories for the three PADIC splits.
for subdir in ('eval', 'train', 'test'):
    ensure_dir(os.path.join(padic_dir, subdir))

# Each <sentence> element of PADIC.xml carries one child node per dialect.
tree = ET.parse(os.path.join(padic_dir, 'PADIC.xml'))
root = tree.getroot()

node_label_pairs = [('MOROCCAN', 'MOR'), ('ANNABA', 'ANN'),
                    ('MODERN-STANDARD-ARABIC', 'MSA'), ('SYRIAN', 'SYR'),
                    ('PALESTINIAN', 'PAL'), ('ALGIERS', 'ALG')]

# Flatten to [label, text, label, text, ...]; the leading three characters
# of each node's text are dropped before Buckwalter -> Arabic conversion.
padic_list = []
for sentence in root:
    for node, label in node_label_pairs:
        padic_list.append(label)
        padic_list.append(
            buckwalter.untransliterate(sentence.find(node).text[3:]))

# Reshape the flat list into (label, text) rows.
df = pd.DataFrame(np.array(padic_list).reshape(-1, 2),
                  columns=['label', 'text'])
df.index.name = 'id'

# SUG: random sampling is better
train = df.iloc[:25968, :]
evaluate = df.iloc[25968:25968 + 8652, :]
test = df.iloc[25968 + 8652:25968 + 8652 + 8652, :]
train.to_csv(os.path.join(padic_dir, 'train', 'train.csv'))
evaluate.to_csv(os.path.join(padic_dir, 'eval', 'eval.csv'))
test.to_csv(os.path.join(padic_dir, 'test', 'test.csv'))
from lang_trans.arabic import buckwalter

# start timer
startTime = time.time()

# Read the Buckwalter-romanized corpus into a dataframe.
print('Read CSV Files Start')
corpusAqDataFrame = pd.read_csv('corpusaq-full.csv',
                                delimiter=',',
                                index_col=None)
print('Read CSV Files Finish')

# Add an "arabic" column by untransliterating the Buckwalter text.
print('Adding Arabic To Dataframe Start')


def _row_to_arabic(row):
    # Convert one row, echoing progress like the original per-row loop.
    arabic = buckwalter.untransliterate(row["buckwalter"])
    print('location ' + row['location'] + ' done')
    return arabic


# PERF: single column-wise apply instead of iterrows() with a per-cell
# .loc write, which re-indexes the frame on every assignment.
corpusAqDataFrame["arabic"] = corpusAqDataFrame.apply(_row_to_arabic, axis=1)
print('Adding Arabic To Dataframe Finish')

# Write the augmented dataframe back to CSV.
print('Write Dataframe to CSV Start')
corpusAqDataFrame.to_csv('corpusaqWithArabic.csv', index=False)
print('Write Dataframe to CSV Finish')

# stop timer, print execution time
print("Execution Time : %s Seconds" % (time.time() - startTime))