コード例 #1
0
def translit_string(script_src, script_trg, line, perm):
    """Transliterate ``line`` from script ``script_src`` into ``script_trg``.

    Supported pairs: latin<->ar (Buckwalter), ka/ru/el -> latin, latin -> ru,
    ja -> latin (Hepburn romanization), and the special target "permute",
    which applies a per-character substitution given by ``perm``.

    Parameters
    ----------
    script_src : str
        Source script code ("latin", "ar", "ka", "ru", "el", "ja").
    script_trg : str
        Target script code, or "permute".
    line : str
        The text to convert.
    perm : dict or None
        Character->character mapping; required when ``script_trg == "permute"``.
        Characters missing from the mapping are passed through unchanged.

    Returns
    -------
    str
        The converted line.

    Raises
    ------
    Exception
        If the (script_src, script_trg) pair is not supported.
    """
    if script_src == "latin" and script_trg == "ar":
        new_line = buckwalter.untransliterate(line)
    elif script_src == "ar" and script_trg == "latin":
        new_line = buckwalter.transliterate(line)
    # ka is Georgian; ka/ru/el all use the same reverse-transliteration call,
    # so the three previously-duplicated branches are merged into one.
    elif script_trg == "latin" and script_src in ("ka", "ru", "el"):
        new_line = translit(line, script_src, reversed=True)
    elif script_src == "latin" and script_trg == "ru":
        new_line = translit(line, script_trg)
    elif script_src == "ja" and script_trg == "latin":
        # Romanize Japanese word-by-word via pykakasi's Hepburn output.
        kks = pykakasi.kakasi()
        new_line = " ".join(word["hepburn"] for word in kks.convert(line))
    elif script_trg == "permute":
        # assert script_src == "latin"
        assert perm is not None
        # Apply the character permutation directly; iterating the string
        # makes the intermediate list(line) copy unnecessary.
        new_line = "".join(perm.get(c, c) for c in line)
    else:
        raise Exception(f"not supported {script_src}-->{script_trg}")
    return new_line
コード例 #2
0
def buckwalter_to_arabic(buckwalter_sentence):
    """Convert a Buckwalter-transliterated sentence back to Arabic script."""
    arabic_sentence = buckwalter.untransliterate(buckwalter_sentence)
    return arabic_sentence
コード例 #3
0
     u"\U00002702-\U000027B0"
     u"\U000024C2-\U0001F251"
     u"\U0001f926-\U0001f937"
     u'\U00010000-\U0010ffff'
     u"\u200d"
     u"\u2640-\u2642"
     u"\u2600-\u2B55"
     u"\u23cf"
     u"\u23e9"
     u"\u231a"
     u"\u3030"
     u"\ufe0f"
     "]+",
     flags=re.UNICODE)
 # NOTE(review): the opening `re.compile("[` of emoji_pattern is above this
 # chunk — the ranges above are the tail of its character class.
 ligne = re.sub(emoji_pattern, '', ligne)
 ligne = buckwalter.untransliterate(ligne)  # transliterate the Arabizi (Buckwalter) text back to Arabic script
 ligne = re.sub(
     r"(http|https|ftp)://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+",
     "", ligne)  # remove URLs
 # Strip a '#' that directly follows an Arabic letter.
 ligne = re.sub(r"[أ-ي]#", "", ligne)
 ligne = re.sub(r"@[^\s]+", "",
                ligne)  # remove hashtags and user names (@mentions)
 # Remove '#' followed by an Arabic-letter hashtag body.
 ligne = re.sub(r"#[أ-ي]+", "", ligne)
 ligne = re.sub('[%s]' % re.escape(string.punctuation), '', ligne)
 ligne = re.sub(
     r"(\d|[\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669])+",
     '', ligne)  # remove numbers (Western and Arabic-Indic digits)
 ligne = re.sub(
     r'[^\s\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062A\u062B\u062C\u062D\u062E\u062F\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063A\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064A]',
     '', ligne)  # keep only Arabic letters and whitespace (drop non-Arabic text)
 ligne = re.sub(r'\s+', ' ', ligne)  # collapse repeated whitespace into single spaces
コード例 #4
0
    def post(self):
        """Return the requested surah as JSON.

        Reads a JSON body containing "surahNumber", collects the Buckwalter
        word segments for that surah from the ``corpusAQ`` collection
        (documents keyed by "(surah:ayah:word:segment"-style ids), converts
        the assembled text to Arabic script, and joins the per-word English
        translations from the ``wordbyword`` collection.
        """
        # STEP 1: get the posted data and validate it.
        postedData = request.get_json()
        statusCode = checkPostedData(postedData, "FindSurah")

        if statusCode != 200:
            response = {
                "responseMessage": "An Error Happened",
                "responseCode": statusCode
            }
            return jsonify(response)

        surahNumber = str(postedData["surahNumber"])
        # Match ids such as "...(112:..." for the requested surah.
        corpusCondition = ".*\\(" + surahNumber + "\\:.*"

        surahFromDB = corpusAQ.find({"_id": {"$regex": corpusCondition}})
        # NOTE(review): Cursor.count() was removed in PyMongo 4;
        # collection.count_documents(filter) is the replacement — confirm
        # which driver version this project pins before upgrading.
        subWordSurah = corpusAQ.find({
            "_id": {
                "$regex": corpusCondition
            }
        }).count()

        # "0" is a sentinel meaning "no word/ayat seen yet".
        lastWord = "0"
        lastAyat = "0"
        surah = ""
        totalAyat = 0

        for doc in surahFromDB:
            splittedLocation = str(doc["_id"]).split(":")
            currentWord = splittedLocation[2]
            if currentWord != lastWord:
                # BUGFIX: the original tested `lastWord == 0`, comparing the
                # string sentinel to the int 0 — always False — so a stray
                # leading space was prepended before the very first word.
                if lastWord == "0":
                    surah = surah + str(doc["buckwalter"])
                else:
                    surah = surah + " " + str(doc["buckwalter"])
                lastWord = currentWord
            else:
                # Same word index: append sub-word segment with no separator.
                surah = surah + str(doc["buckwalter"])

            # Count each distinct ayat exactly once.
            currentAyat = splittedLocation[1]
            if currentAyat != lastAyat:
                totalAyat = totalAyat + 1
                lastAyat = currentAyat

        arabicSurah = buckwalter.untransliterate(surah)

        # Join the per-word English translations with single spaces.
        meaningFromDB = wordbyword.find({"suratnumber": int(surahNumber)})
        meaningResponse = " ".join(
            str(doc["translation"]) for doc in meaningFromDB)

        response = {
            "responseCode": 200,
            "responseMessageArabic": arabicSurah,
            "responseMessageMeaning": meaningResponse,
            "responseMessageSubWordSurah": subWordSurah,
            "totalAyat": totalAyat
        }
        return jsonify(response)
コード例 #5
0
ファイル: padic.py プロジェクト: iAhmedMaher/arabic-did
# Create the output directory for each dataset split.
for split in ('eval', 'train', 'test'):
    ensure_dir(os.path.join(padic_dir, split))

# Parse the PADIC corpus XML; each child of the root holds one sentence
# with a sub-node per dialect.
tree = ET.parse(os.path.join(padic_dir, 'PADIC.xml'))
root = tree.getroot()

node_label_pairs = [('MOROCCAN', 'MOR'), ('ANNABA', 'ANN'),
                    ('MODERN-STANDARD-ARABIC', 'MSA'), ('SYRIAN', 'SYR'),
                    ('PALESTINIAN', 'PAL'), ('ALGIERS', 'ALG')]

# Flatten into [label, text, label, text, ...]; the first three characters
# of each node's text are skipped, and the Buckwalter transliteration is
# converted back to Arabic script.
padic_list = []
for sentence in root:
    for node, label in node_label_pairs:
        arabic_text = buckwalter.untransliterate(sentence.find(node).text[3:])
        padic_list.extend([label, arabic_text])

df = pd.DataFrame(np.array(padic_list).reshape(-1, 2),
                  columns=['label', 'text'])
df.index.name = 'id'

# SUG: random sampling is better
train_end = 25968
eval_end = train_end + 8652
test_end = eval_end + 8652
train = df.iloc[:train_end, :]
evaluate = df.iloc[train_end:eval_end, :]
test = df.iloc[eval_end:test_end, :]

train.to_csv(os.path.join(padic_dir, 'train', 'train.csv'))
evaluate.to_csv(os.path.join(padic_dir, 'eval', 'eval.csv'))
test.to_csv(os.path.join(padic_dir, 'test', 'test.csv'))
コード例 #6
0
from lang_trans.arabic import buckwalter

# Start the wall-clock timer.
startTime = time.time()

# Load the full corpus CSV into a dataframe.
print('Read CSV Files Start')
corpusAqDataFrame = pd.read_csv('corpusaq-full.csv',
                                delimiter=',',
                                index_col=None)
print('Read CSV Files Finish')

# Derive an "arabic" column from each row's Buckwalter transliteration,
# printing per-row progress as we go.
print('Adding Arabic To Dataframe Start')
for index, row in corpusAqDataFrame.iterrows():
    buckwalter_text = corpusAqDataFrame.loc[index, "buckwalter"]
    corpusAqDataFrame.loc[index, "arabic"] = buckwalter.untransliterate(
        buckwalter_text)
    print('location ' + corpusAqDataFrame.loc[index, 'location'] + ' done')
print('Adding Arabic To Dataframe Finish')

# Persist the augmented dataframe.
print('Write Dataframe to CSV Start')
corpusAqDataFrame.to_csv('corpusaqWithArabic.csv', index=False)
print('Write Dataframe to CSV Finish')

# Report the total execution time.
print("Execution Time : %s Seconds" % (time.time() - startTime))

# # testing arabic csv
# corpusAqWithArabicDataFrame = pd.read_csv('corpusaq_plus_arabic.csv', delimiter=',')
# print(corpusAqWithArabicDataFrame.iloc[0])
# print(buckwalter.trans(corpusAqWithArabicDataFrame.iloc[0].loc['arabic']))