def aya_tokens(aya):
    # word lists of the cleaned aya text and of the raw (query) text
    parts = simple_aya(aya['text']).replace(' ', ' ').split(' ')
    raw_ayas = aya['raw'].split(' ')

    # unify hamza/alif variants so near-identical spellings compare equal
    normalize_token = lambda s: s.replace('آ', 'ا').replace('ء', '').replace(
        'ئ', '').replace('أ', 'ا').replace('إ', 'ا').replace('ؤ', 'و')

    # words that appear verbatim in the cleaned text get its 1-based position
    tokens = [{
        'word': word,
        'stem': isri.stem(word),
        'id': parts.index(word) + 1
    } for word in raw_ayas if word in parts]

    # align the remaining words with the remaining parts by fuzzy comparison
    not_found_words = [word for word in raw_ayas if word not in parts]
    not_found_parts = [part for part in parts if part not in raw_ayas]
    start = -1
    for word in not_found_words:
        for part in not_found_parts:
            if not_found_parts.index(part) > start:
                word_n, part_n = normalize_token(word), normalize_token(part)
                if (word_n.replace('ا', '') == part_n.replace('ا', '')
                        or word_n.replace('و', 'ا') == part_n.replace('و', 'ا')
                        or word_n.replace('ی', 'ا') == part_n.replace('ی', 'ا')):
                    found_ids = [token['id'] for token in tokens]
                    # if this position is already taken, search for the part
                    # again starting right after the taken position
                    k = parts.index(part) + 1 if parts.index(part) + 1 in found_ids else 0
                    tokens.append({
                        'word': word,
                        'stem': isri.stem(word),
                        'id': parts.index(part, k) + 1
                    })
                    start = not_found_parts.index(part)
                    break

    # if two tokens ended up with the same id, move the later one forward
    for i in range(len(tokens)):
        for j in range(i + 1, len(tokens)):
            if tokens[i]['id'] == tokens[j]['id']:
                try:
                    tokens[j]['id'] = parts.index(tokens[j]['word'], tokens[i]['id']) + 1
                except ValueError as e:
                    print(str(e))
    return tokens
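# Usage sketch for aya_tokens (illustrative): `simple_aya` and `isri` are
# assumed to be defined earlier in this module, with `isri` an NLTK ISRI
# stemmer, e.g.:
#
#     from nltk.stem.isri import ISRIStemmer
#     isri = ISRIStemmer()
#
# `aya` is assumed to be a dict carrying the marked-up aya text under 'text'
# and its plain word sequence under 'raw'; each returned token maps a raw word
# to its 1-based position ('id') in the cleaned text plus its ISRI stem:
#
#     aya = {'text': marked_up_text, 'raw': raw_text}  # hypothetical strings
#     for token in aya_tokens(aya):
#         print(token['id'], token['word'], token['stem'])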
def refine_section(section): # ayas for item in section.find(".aya").items(): text = simple_aya(item.text()) if text.startswith("(") and text.startswith("("): text = text[1:-1] item.text(text) # structure refine_translation(section) for item in section.children().items(): if item[0].tag == "p": if len(item.text().strip()) <= 1: item.remove() else: if len(item.find(".trans")) >= 1: for span in section.find(".trans").items(): item.append(span.outerHtml()) span.remove()
def aya_tokens(aya): parts = simple_aya(aya["text"]).replace(" ", " ").split(" ") raw_ayas = aya["raw"].split(" ") normalize_token = ( lambda s: s.replace("آ", "ا") .replace("ء", "") .replace("ئ", "") .replace("أ", "ا") .replace("إ", "ا") .replace("ؤ", "و") ) tokens = [ {"word": word, "stem": isri.stem(word), "id": parts.index(word) + 1} for word in raw_ayas if word in parts ] not_found_words = [word for word in raw_ayas if word not in parts] not_found_parts = [part for part in parts if not part in raw_ayas] start = -1 for word in not_found_words: for part in not_found_parts: if not_found_parts.index(part) > start: if ( normalize_token(word).replace("ا", "") == normalize_token(part).replace("ا", "") or normalize_token(word).replace("و", "ا") == normalize_token(part).replace("و", "ا") or normalize_token(word).replace("ی", "ا") == normalize_token(part).replace("ی", "ا") ): found_ids = [token["id"] for token in tokens] k = found_ids[found_ids.index(parts.index(part) + 1)] if parts.index(part) + 1 in found_ids else 0 tokens.append({"word": word, "stem": isri.stem(word), "id": parts.index(part, k) + 1}) start = not_found_parts.index(part) break for i in range(len(tokens)): for j in range(i + 1, len(tokens)): if tokens[i]["id"] == tokens[j]["id"]: try: tokens[j]["id"] = parts.index(tokens[j]["word"], tokens[i]["id"]) + 1 except ValueError as e: print(str(e)) return tokens
def resolve_phrase(phrase, tokens, book):
    # clean the phrase and drop the shadda diacritic before matching
    phrase = simple_aya(phrase.strip()).replace("", "").replace("ّ", "")
    if len(phrase) < 3:
        return None

    # normalizers shared by the fuzzy matchers below
    normalize_Alif_lam = lambda s: s[2:] if (s[:2] == "ال") else s
    normalize_arabic_letter = lambda s: s.replace("ة", "ه").replace("ؤ", "و").replace("إ", "ا").replace("أ", "ا")
    normalize_LBKF = lambda s: s[1:] if (s[:1] in "لبکف") else s

    # single-word matchers, ordered from strict to loose
    matchings = [
        lambda token: phrase == token["word"],  # exact
        lambda token: normalize_arabic_letter(phrase) == normalize_arabic_letter(token["word"]),  # without arabic letters
        lambda token: normalize_Alif_lam(phrase) == normalize_Alif_lam(token["word"]),  # without Alif-lam
        lambda token: normalize_arabic_letter(normalize_Alif_lam(phrase)) == normalize_arabic_letter(normalize_Alif_lam(token["word"])),  # without arabic letters and Alif-lam
        lambda token: normalize_arabic_letter(normalize_LBKF(phrase)) == normalize_arabic_letter(normalize_LBKF(token["word"])),  # without the ل/ب/ک/ف prefix
        lambda token: isri.stem(phrase) == token["stem"],  # stemmed
    ]

    # the same matchers applied to the i-th word of a two-word phrase
    matchings2 = [
        lambda token, i: phrase.split()[i] == token["word"],  # exact
        lambda token, i: normalize_arabic_letter(phrase.split()[i]) == normalize_arabic_letter(token["word"]),  # without arabic letters
        lambda token, i: normalize_Alif_lam(phrase.split()[i]) == normalize_Alif_lam(token["word"]),  # without Alif-lam
        lambda token, i: normalize_arabic_letter(normalize_Alif_lam(phrase.split()[i])) == normalize_arabic_letter(normalize_Alif_lam(token["word"])),  # without arabic letters and Alif-lam
        lambda token, i: normalize_arabic_letter(normalize_LBKF(phrase.split()[i])) == normalize_arabic_letter(normalize_LBKF(token["word"])),  # without the ل/ب/ک/ف prefix
        lambda token, i: isri.stem(phrase.split()[i]) == token["stem"],  # stemmed
    ]

    # single-word phrase: return only an unambiguous match
    matched = []
    for aya, token_list in tokens.items():
        for token in token_list:
            for match in matchings:
                if match(token):
                    matched.append(("{0}_{1}_{2}-{2}".format(book, aya, token["id"]), token["word"]))
                    break
    if len(matched) == 1:
        return matched[0]

    # two-word phrase: look for two consecutive tokens matching both words
    matched = []
    if len(phrase.split()) == 2:
        for aya, token_list in tokens.items():
            for token1 in token_list:
                for token2 in token_list:
                    if token2["id"] == token1["id"] + 1:
                        for match1 in matchings2:
                            if match1(token1, 0):
                                for match2 in matchings2:
                                    if match2(token2, 1):
                                        matched.append((
                                            "{0}_{1}_{2}-{3}".format(book, aya, token1["id"], token2["id"]),
                                            "{0} {1}".format(token1["word"], token2["word"]),
                                        ))
                                        break
                                break
    if len(matched) == 1:
        return matched[0]
    return None
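# Usage sketch for resolve_phrase (illustrative): `tokens` is assumed to map an
# aya key to the token list produced by aya_tokens, and `book` to be whatever
# identifier the caller uses in the "{book}_{aya}_{id}-{id}" address format.
# An unambiguous one- or two-word match returns an (address, matched words)
# tuple; anything else returns None:
#
#     tokens = {key: aya_tokens(aya) for key, aya in ayas.items()}  # `ayas` is hypothetical
#     result = resolve_phrase(phrase_text, tokens, book)
#     if result is not None:
#         address, words = result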