def extract_narrators(hadith: Verse) -> List[str]:
    """Extract the narrator chain from the first line of a hadith.

    Splits ``hadith.text[0]`` at the end of the narrator section
    (everything up to the first "qaal"), stores that section on
    ``hadith.narrator_chain``, rewrites ``hadith.text[0]`` to hold only
    the remaining hadith body, and returns the individual narrator names.

    Args:
        hadith: Verse whose ``text[0]`` is expected to start with a
            narrator chain.

    Returns:
        List of narrator name strings; empty if no chain was found.
    """
    narrators: List[str] = []
    first_line = hadith.text[0]

    # Step 1: Extract from beginning to the first qaal
    narrators_text_match = NARRATORS_TEXT_PATTERN.match(first_line)
    if narrators_text_match:
        # Keep extending the match while a continuation marker follows,
        # so multi-link chains ("... from ... from ...") are consumed.
        while narrators_text_match:
            end_index = narrators_text_match.end(0)
            if NARRATORS_TEXT_CONTINUE_PATTERN.match(first_line, end_index):
                narrators_text_match = NARRATORS_TEXT_PATTERN.match(first_line, end_index)
            else:
                break
        narrators_text = first_line[:end_index]
        hadith_text = first_line[end_index:]
        hadith.text[0] = hadith_text
        if not hadith.narrator_chain:
            hadith.narrator_chain = NarratorChain()
            hadith.narrator_chain.parts = []
        hadith.narrator_chain.text = narrators_text

        # Step 2: trim unwanted prefixes
        # NOTE(review): the [:-6] slice presumably drops the trailing
        # "qaal" token matched above -- confirm against NARRATORS_TEXT_PATTERN.
        narrators_with_prefix = narrators_text[:-6]
        if narrators_with_prefix:
            narrators_without_prefix = SKIP_PREFIX_PATTERN.sub('', narrators_with_prefix)
            # Step 3: split the text to get narrators
            narrators = NARRATOR_SPLIT_PATTERN.split(narrators_without_prefix)
    else:
        # logger.warn is a deprecated alias; logger.warning is the supported API.
        logger.warning("Could not find narrators for %s", hadith.path)

    return narrators
def assert_text_narrators(text, narrators):
    """Test helper: extract narrators from *text* and assert they equal *narrators*."""
    verse = Verse()
    verse.text = [text]
    extracted = extract_narrators(verse)
    print('####################### EXPECTED')
    pprint(narrators)
    print('####################### ACTUAL')
    pprint(extracted)
    assert extracted == narrators
# Esempio n. 3  (stray scrape artifact -- commented out so the file parses)
# 0
def build_verses(file):
    """Parse a Quran text file into a list of Verse objects.

    Blank lines and lines starting with '#' (comment lines) are skipped.

    Args:
        file: Path to a UTF-8 text file containing one verse per line.

    Returns:
        List of Verse objects with part_type Verse, text set, and an
        empty translations dict.
    """
    logger.info("Adding Quran file %s", file)

    verses = []
    with open(file, 'r', encoding='utf8') as qfile:
        # Iterate the file lazily instead of materializing readlines();
        # the old running `index` counter was dead code (never consumed).
        for line in qfile:
            text = line.strip()
            if not text or text.startswith('#'):
                continue
            verse = Verse()
            verse.part_type = PartType.Verse
            verse.text = [text]
            verse.translations = {}

            verses.append(verse)

    return verses
def update_refs(quran: Chapter, hadith: Verse, quran_refs: Set):
    """Cross-link a hadith with the Quran verses it references.

    For each ``(sura_no, verse_no)`` pair (1-based), adds the hadith's
    path to the verse's "Mentioned In" relations, and records the
    corresponding Quran path under the hadith's "Mentions" relations.
    Out-of-range references are logged and skipped.

    Args:
        quran: Root chapter whose ``chapters`` list holds the suras.
        hadith: The verse doing the referencing.
        quran_refs: Set of ``(sura_no, verse_no)`` tuples.
    """
    qrefs = set()
    for (sura_no, verse_no) in quran_refs:
        # Keep the try body minimal: only the indexing can raise IndexError.
        try:
            sura = quran.chapters[sura_no - 1]
            verse = sura.verses[verse_no - 1]
        except IndexError:
            # logger.warn is deprecated; use logger.warning with lazy %-args.
            logger.warning(
                "Quran ref does not exist. Hadith %s ref %s:%s",
                hadith.path, sura_no, verse_no
            )
            continue
        if not verse.relations:
            verse.relations = {"Mentioned In": set()}
        verse.relations["Mentioned In"].add(hadith.path)
        qrefs.add(f"/books/quran:{sura_no}:{verse_no}")
    if qrefs:
        hadith.relations = {'Mentions': qrefs}
# Esempio n. 5  (stray scrape artifact -- commented out so the file parses)
# 0
def add_hadith(chapter: Chapter, hadith_ar: List[str], hadith_en: List[str], part_type: PartType = PartType.Hadith):
    """Build a hadith Verse from Arabic/English text and append it to *chapter*.

    End-of-hadith markers are stripped from each English paragraph before
    it is stored under the HubeAli translation id.
    """
    cleaned_en = [END_OF_HADITH_CLEANUP_PATTERN.sub('', paragraph) for paragraph in hadith_en]

    hadith = Verse()
    hadith.part_type = part_type
    hadith.text = hadith_ar
    hadith.translations = {HUBEALI_TRANSLATION_ID: cleaned_en}

    chapter.verses.append(hadith)
# Esempio n. 6  (stray scrape artifact -- commented out so the file parses)
# 0
def build_alhassanain_baabs(file) -> List[Chapter]:
    """Parse a HubeAli Al-Kafi HTML file into a list of book-level Chapters.

    The file is split into sections on "<br clear=all>". Each section has a
    book heading (.Heading1Center) and chapter headings (.Heading2Center,
    in pairs -- presumably Arabic/English; confirm against the source HTML),
    followed by alternating Arabic (.libAr) and English (.libNormal)
    paragraphs, one group per hadith.

    Args:
        file: Path to the UTF-8 HTML file.

    Returns:
        List of book Chapters, each with nested chapters and verses.
    """
    baabs: List[Chapter] = []
    logger.info("Adding Al-Kafi file %s", file)

    translation = Translation()
    translation.name = "HubeAli.com"
    translation.lang = Language.EN.value
    translation.id = HUBEALI_TRANSLATION_ID

    with open(file, 'r', encoding='utf8') as qfile:
        inner_html = qfile.read()
        sections = inner_html.split("<br clear=all>")
        for section in sections:
            section_soup = BeautifulSoup(section, 'html.parser')

            headings = section_soup.select(".Heading1Center")
            if not headings:
                continue

            # process "the book of" chapter
            baab_titles = extract_headings(headings)

            en_title = baab_titles[Language.EN.value]

            # Reuse an existing book with the same English title, if any.
            baab = None
            for existing_baab in baabs:
                if existing_baab.titles[Language.EN.value] == en_title:
                    baab = existing_baab
                    # Titles are unique (a baab is only appended when no
                    # match exists), so stop at the first hit.
                    break

            if not baab:
                baab = Chapter()
                baab.part_type = PartType.Book
                baab.titles = baab_titles
                baab.chapters = []

                baabs.append(baab)

            # process chapters; headings come in pairs, hence the /2.
            chapters = section_soup.select(".Heading2Center")
            chapters_len = len(chapters)
            for subchapter_index in range(math.ceil(chapters_len / 2)):
                subchapter_heading_index = subchapter_index * 2

                remaining_chapters = chapters[subchapter_heading_index:]
                if len(remaining_chapters) > 1:
                    remaining_chapters = remaining_chapters[:2]
                chapter_titles = extract_headings(remaining_chapters)

                chapter = Chapter()
                chapter.part_type = PartType.Chapter
                chapter.titles = chapter_titles
                chapter.verse_translations = [translation]
                chapter.verses = []

                baab.chapters.append(chapter)

                # Walk the siblings after the chapter heading until the next
                # Heading2Center, collecting hadith paragraphs as we go.
                last_element = remaining_chapters[-1]
                last_element = last_element.next_sibling

                verse: Verse = None
                while (last_element is not None and
                       (isinstance(last_element, NavigableString) or
                        (is_tag(last_element)
                         and 'Heading2Center' not in last_element['class']))):
                    is_a_tag = is_tag(last_element)
                    if is_a_tag and 'libAr' in last_element['class']:

                        # An Arabic paragraph starts a new hadith; push the
                        # previous verse if it's not the start of the chapter.
                        if verse is not None:
                            chapter.verses.append(verse)

                        verse = Verse()
                        verse.part_type = PartType.Hadith
                        verse.translations = {}
                        verse.translations[HUBEALI_TRANSLATION_ID] = []

                        verse.text = [last_element.get_text(strip=True)]

                    if is_a_tag and 'libNormal' in last_element['class']:
                        verse.translations[HUBEALI_TRANSLATION_ID].append(
                            last_element.get_text(strip=True))

                    last_element = last_element.next_sibling

                # Push the trailing hadith of the chapter, if any.
                if verse is not None:
                    chapter.verses.append(verse)

    return baabs
def add_chapter_content(chapter: Chapter, filepath, hadith_index=0):
    """Merge Sarwar (thaqalayn.net) translations from *filepath* into *chapter*.

    Each hadith in the HTML file (hadith are separated by <hr>) is matched
    by position against the chapter's existing hubeali verses and its
    English text and gradings are attached. Hadith past the end of the
    existing verse list are appended as new verses. Mismatches are logged
    via logger.warning and collected in SEQUENCE_ERRORS.

    Args:
        chapter: Chapter whose verses receive the Sarwar translation.
        filepath: Path to the thaqalayn.net chapter HTML file.
        hadith_index: Index into chapter.verses to start matching from.
    """
    if filepath.endswith('\\0.html'):
        error_msg = f"Skipping zero file {filepath}"
        # logger.warn is deprecated; logger.warning is the supported API.
        logger.warning(error_msg)
        SEQUENCE_ERRORS.append(error_msg)
        return

    verses = chapter.verses
    # Headings sit between hadith in the verse list; exclude them when
    # comparing counts against the source file.
    heading_count = len([x for x in verses if x.part_type == PartType.Heading])

    sarwar_exists = next((item for item in chapter.verse_translations
                          if item.id == SARWAR_TRANSLATION_ID), None)
    if not sarwar_exists:
        chapter.verse_translations.append(sarwar_translation)

    with open(filepath, 'r', encoding='utf8') as qfile:
        file_html = qfile.read()

        if 'en' not in chapter.titles:
            file_soup = BeautifulSoup(file_html, 'html.parser')

            card_body = file_soup.find('div', 'card-body')
            chapter_title = get_contents(card_body.find('h3'))
            chapter.titles['en'] = chapter_title

        ##### Processing each hadith separately

        hadith_htmls = re.split('<hr/?>', file_html)

        for hadith_html in hadith_htmls:
            if we_dont_care(hadith_html):
                continue

            soup = BeautifulSoup(hadith_html, 'html.parser')

            all_paras = soup.find_all('p')

            # Leading RTL paragraphs are the Arabic text; the first
            # non-RTL paragraph is the English translation.
            para_index = 0
            hadith_ar = []
            while is_rtl_tag(all_paras[para_index]):
                hadith_ar.append(get_contents(all_paras[para_index]))
                para_index += 1

            hadith_en = get_contents(all_paras[para_index])
            para_index += 1

            if hadith_index >= len(verses) - heading_count:
                # hubeali rightly splits first chapter in book of inheritance into two
                # but thaqalayn.net has it as one chapter, so we'll skip adding ahadith
                if chapter.path == '/books/al-kafi:7:2:1':
                    break

                verse = Verse()
                verse.text = hadith_ar
                verse.part_type = PartType.Hadith.value
                verse.translations = {}

                verses.append(verse)

                site_path = sitepath_from_filepath(filepath)
                if chapter.crumbs:
                    my_site_path = chapter.crumbs[-1].path
                else:
                    my_site_path = site_path.replace('/', ':')
                error_msg = f"Appending new hadith from Sarwar to hubeali, hadith #{hadith_index+1} from https://thaqalayn.net/chapter/{site_path} to https://thaqalayn.netlify.app/#{my_site_path}"
                logger.warning(error_msg)
                SEQUENCE_ERRORS.append(error_msg)
            else:
                # TODO: create new verse if the verse at this index doesn't match the one being inserted
                # perhaps use https://github.com/ztane/python-Levenshtein or https://pypi.org/project/jellyfish/
                verse = verses[hadith_index]

                # Skip over a heading verse to reach the actual hadith.
                if verse.part_type == PartType.Heading:
                    hadith_index += 1
                    verse = verses[hadith_index]

                if verse.part_type != PartType.Hadith:
                    error_msg = f"Hadith index {hadith_index} is of part_type {verse.part_type} in https://thaqalayn.netlify.app/#{chapter.crumbs[-1].path}"
                    logger.warning(error_msg)
                    SEQUENCE_ERRORS.append(error_msg)

            verse.translations[SARWAR_TRANSLATION_ID] = [hadith_en]

            # Optional grading section follows the translation.
            if len(all_paras) > para_index + 1:
                grading_title = get_contents(all_paras[para_index])
                para_index += 1
                if grading_title.startswith('Grading:'):
                    grading = []
                    # The last three paragraphs are boilerplate, not gradings.
                    for grading_para in all_paras[para_index:-3]:
                        grading.append(get_contents(grading_para))
                    verse.gradings = grading

            hadith_index += 1

    # Volume 8 of al-kafi is one file per hadith on thaqalayn.net and it'll warn on every page
    # since there is always more ahadith on hubeali's chapter
    if hadith_index != len(
            verses) - heading_count and 'al-kafi:8:1' not in chapter.path:
        site_path = sitepath_from_filepath(filepath)
        error_msg = f"Sarwar has {hadith_index} hadith but hubeali has {len(verses)} hadith: https://thaqalayn.net/chapter/{site_path} vs https://thaqalayn.netlify.app/#{chapter.crumbs[-1].path}"
        logger.warning(error_msg)
        SEQUENCE_ERRORS.append(error_msg)