Example #1
def replace_with_base64(s, ref):
    words_to_replace = []
    cleaned_pasuk = tokenizer(s, True)
    for iw, word in enumerate(cleaned_pasuk):
        prefix = ''
        shoresh = lookup_shoresh(word, ref)
        if shoresh:
            shoresh = shoresh[0]
        else:
            print(word)
        if any(word_to_emoji == shoresh for word_to_emoji in words_to_emojis):
            nikudless_word = strip_cantillation(word, True)[:-1]
            nikudless_shoresh = strip_cantillation(shoresh, True)[:-1]
            if len(nikudless_shoresh) > len(nikudless_word):
                nikudless_shoresh = nikudless_shoresh[:len(nikudless_word)]
            if nikudless_word != nikudless_shoresh:
                prefix_index = nikudless_word.find(nikudless_shoresh)
                if prefix_index != -1 and any(p == nikudless_word[:prefix_index] for p in prefixes):
                    nikud_prefix_index = word.find(shoresh[0], prefix_index)
                    prefix = word[:nikud_prefix_index]
            words_to_replace += [{"name": word, "shoresh": shoresh, "prefix": prefix, "word_num": iw}]
    tokenized_pasuk = tokenizer(s, False)
    for to_replace in words_to_replace:
        p = to_replace["prefix"]
        tokenized_pasuk[to_replace["word_num"]] = (
            u'<span class="purim-emoji">'
            u'{}<img src="data:image/png;base64,{}" /> </span>'.format(
                u"{}-".format(p) if len(p) > 0 else u"",
                emoji_map[to_replace["shoresh"]]))
    new_pasuk = rebuild_tokenized_text(tokenized_pasuk)
    if new_pasuk[-1] != u'׃':
        new_pasuk += u'׃'
    return new_pasuk
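
All of these examples lean on Sefaria's strip_cantillation(text, strip_vowels=False) helper. As a rough mental model only (a stand-in sketch, not the library's implementation), cantillation marks live in the Hebrew accent range U+0591-U+05AF, while the vowel points (nikud) sit mainly in U+05B0-U+05BC plus U+05C1, U+05C2 and U+05C7:

import re

def strip_cantillation_sketch(text, strip_vowels=False):
    # Approximation only; the real helper is sefaria.utils.hebrew.strip_cantillation.
    text = re.sub(u"[\u0591-\u05AF]", u"", text)  # Hebrew accents (cantillation)
    if strip_vowels:
        # vowel points, dagesh, shin/sin dots, qamats qatan
        text = re.sub(u"[\u05B0-\u05BC\u05C1\u05C2\u05C7]", u"", text)
    return text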
Example #2
    def _single_lookup(cls, input_word, lookup_key='form', **kwargs):
        from sefaria.utils.hebrew import is_hebrew, strip_cantillation, has_cantillation
        from sefaria.model import Ref

        lookup_ref = kwargs.get("lookup_ref", None)
        wform_pkey = lookup_key
        if is_hebrew(input_word):
            input_word = strip_cantillation(input_word)
            if not has_cantillation(input_word, detect_vowels=True):
                wform_pkey = 'c_form'
        query_obj = {wform_pkey: input_word}
        if lookup_ref:
            nref = Ref(lookup_ref).normal()
            query_obj["refs"] = {'$regex': '^{}'.format(nref)}
        forms = WordFormSet(query_obj)
        if lookup_ref and forms.count() == 0:
            del query_obj["refs"]
            forms = WordFormSet(query_obj)
        if forms.count() > 0:
            headword_query = []
            for form in forms:
                for lookup in form.lookups:
                    headword_query.append({'headword': lookup['headword']})
                    # TODO: if we want the 'lookups' in wf to be a dict we can pass as is to the lexiconentry, we need to change the key 'lexicon' to 'parent_lexicon' in word forms
            return headword_query
        else:
            return []
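
For reference, the Mongo-style query objects that _single_lookup assembles look roughly like the dicts below; 'c_form' is used when nothing but consonants is left after stripping cantillation. The values here are made up for illustration:

# Hypothetical illustrations of query_obj, not taken from the codebase:
query_with_vowels = {"form": u"בְּרֵאשִׁית"}
query_consonantal = {"c_form": u"בראשית"}
query_scoped_to_ref = {"c_form": u"בראשית", "refs": {"$regex": u"^Genesis 1:1"}}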
Example #3
def make_parsed_source(chapter_name, chapter_num, topic_name, topic_num,
                       source_num, prev_rows):
    global TOTAL_REFS, PARSED_REFS
    source, commentary = "", ""
    for r in prev_rows:
        s, c = get_midrashic_text(r["source"])
        source += s
        commentary += c
    m = re.search(r"\(([^)]+)\)\s*\.?\s*\$?\s*$", source)
    if m is None:
        print("OH NO -- {} {} {}: {}".format(chapter_num, topic_num,
                                             source_num, topic_name))
        print(strip_cantillation(source, strip_vowels=True)[-20:])
        return None
    else:
        source = re.sub(r"\(([^)]+)\)\s*\.?\s*\$?\s*$", "", source)
        ref_list = [
            parse_ref(r, source) for r in re.split(r"[:;]", m.group(1))
        ]
        TOTAL_REFS += len(ref_list)
        PARSED_REFS += len([_f for _f in ref_list if _f])
    return {
        "chapter_name": chapter_name,
        "chapter_num": chapter_num,
        "topic_name": topic_name,
        "topic_num": topic_num,
        "source_num": source_num,
        "source": source,
        "commentary": commentary,
        "ref_list": [_f for _f in ref_list if _f]
    }
Example #4
    def _single_lookup(cls, input_word, lookup_key='form', **kwargs):
        from sefaria.utils.hebrew import is_hebrew, strip_cantillation, has_cantillation
        from sefaria.model import Ref

        lookup_ref = kwargs.get("lookup_ref", None)
        wform_pkey = lookup_key
        if is_hebrew(input_word):
            input_word = strip_cantillation(input_word)
            if not has_cantillation(input_word, detect_vowels=True):
                wform_pkey = 'c_form'
        query_obj = {wform_pkey: input_word}
        if lookup_ref:
            nref = Ref(lookup_ref).normal()
            query_obj["refs"] = {'$regex': '^{}'.format(nref)}
        forms = WordFormSet(query_obj)
        if lookup_ref and len(forms) == 0:
            del query_obj["refs"]
            forms = WordFormSet(query_obj)
        if len(forms) > 0:
            headword_query = []
            for form in forms:
                for lookup in form.lookups:
                    headword_query.append({'headword': lookup['headword']})
                    # TODO: if we want the 'lookups' in wf to be a dict we can pass as is to the lexiconentry, we need to change the key 'lexicon' to 'parent_lexicon' in word forms
            return headword_query
        else:
            return []
Example #5
 def tokenize_words(self, base_str):
     base_str = base_str.strip()
     base_str = strip_cantillation(base_str, strip_vowels=True)
     base_str = bleach.clean(base_str, tags=[], strip=True)
     for match in re.finditer(ur'\(.*?\)', base_str):
         if library.get_titles_in_string(match.group()) and len(match.group().split()) <= 5:
             base_str = base_str.replace(match.group(), u"")
Example #6
 def tokenize_words(self, base_str):
     base_str = base_str.strip()
     base_str = strip_cantillation(base_str, strip_vowels=True)
     base_str = bleach.clean(base_str, tags=[], strip=True)
     for match in re.finditer(ur'\(.*?\)', base_str):
         if library.get_titles_in_string(
                 match.group()) and len(match.group().split()) <= 5:
             base_str = base_str.replace(match.group(), u"")
Example #7
def tokenizer(base_str, clean=False):
    base_str = base_str.strip()
    if clean:
        base_str = base_str.replace(u"׀", u"$$$")
        base_str = bleach.clean(base_str, tags=[], strip=True)
        base_str = strip_cantillation(base_str, strip_vowels=False)
    base_str = re.sub(ur'־', u' *־* ', base_str)
    word_list = re.split(ur"\s+", base_str)
    return word_list
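
A quick illustration of the tokenizer above (assuming it is in scope): a maqaf-joined pair comes back as three tokens, with the maqaf preserved as its own '*־*' marker.

word_list = tokenizer(u"בְאֶרֶץ־עוּץ", clean=False)
# expected: [u"בְאֶרֶץ", u"*־*", u"עוּץ"]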
Example #8
    def make_text_index_document(cls, tref, heTref, version, lang, version_priority, content, categories):
        """
        Create a document for indexing from the text specified by ref/version/lang
        """
        # Don't bother indexing if there's no content
        if not content:
            return False

        content_wo_cant = strip_cantillation(content, strip_vowels=False).strip()
        content_wo_cant = re.sub(r'<[^>]+>', '', content_wo_cant)
        content_wo_cant = re.sub(r'\([^)]+\)', '', content_wo_cant)  # remove all parens
        if len(content_wo_cant) == 0:
            return False

        oref = Ref(tref)
        toc_tree = library.get_toc_tree()
        cats = oref.index.categories

        indexed_categories = categories  # the default

        # get the full path of every cat along the way.
        # starting w/ the longest,
        # check if they're root swapped.
        paths = [cats[:i] for i in range(len(cats), 0, -1)]
        for path in paths:
            cnode = toc_tree.lookup(path)
            if getattr(cnode, "searchRoot", None) is not None:
                # Use the specified searchRoot, with the rest of the category path appended.
                indexed_categories = [cnode.searchRoot] + cats[len(path) - 1:]
                break

        tp = cls.best_time_period
        if tp is not None:
            comp_start_date = int(tp.start)
        else:
            comp_start_date = 3000  # far in the future

        ref_data = RefData().load({"ref": tref})
        pagesheetrank = ref_data.pagesheetrank if ref_data is not None else RefData.DEFAULT_PAGESHEETRANK

        return {
            "ref": tref,
            "heRef": heTref,
            "version": version,
            "lang": lang,
            "version_priority": version_priority if version_priority is not None else 1000,
            "titleVariants": oref.index_node.all_tree_titles("en"),
            "categories": indexed_categories,
            "order": oref.order_id(),
            "path": "/".join(indexed_categories + [cls.curr_index.title]),
            "pagesheetrank": pagesheetrank,
            "comp_date": comp_start_date,
            #"hebmorph_semi_exact": content_wo_cant,
            "exact": content_wo_cant,
            "naive_lemmatizer": content_wo_cant,
        }
Example #9
 def clean(self, s):
     s = unicodedata.normalize("NFD", s)
     s = strip_cantillation(s, strip_vowels=True)
     s = re.sub(u"(^|\s)(?:\u05d4['\u05f3])($|\s)", u"\1יהוה\2", s)
     s = re.sub(ur"[,'\":?.!;־״׳]", u" ", s)
     s = re.sub(ur"\([^)]+\)", u" ", s)
     # s = re.sub(ur"\((?:\d{1,3}|[\u05d0-\u05ea]{1,3})\)", u" ", s)  # sefaria automatically adds pasuk markers. remove them
     s = bleach.clean(s, strip=True, tags=()).strip()
     s = u" ".join(s.split())
     return s
Example #10
def base_tokenizer(str):
    punc_pat = re.compile(ur"(\.|,|:|;)$")

    str = re.sub(ur"\([^\(\)]+\)", u"", str)
    str = re.sub(ur"''",ur'"',str) # looks like double apostrophe in shulchan arukh is meant to be a quote
    str = re.sub(r"</?[a-z]+>", "", str)  # get rid of html tags
    str = hebrew.strip_cantillation(str, strip_vowels=True)
    word_list = re.split(ur"\s+", str)
    word_list = [re.sub(punc_pat,u"",w).strip() for w in word_list if len(re.sub(punc_pat,u"",w).strip()) > 0]  # remove empty strings and punctuation at the end of a word
    return word_list
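
As an illustration of the base_tokenizer above (assuming its imports are available), trailing punctuation is stripped and empty tokens are dropped:

word_list = base_tokenizer(u"אמר רבא: הלכה.")
# expected: [u"אמר", u"רבא", u"הלכה"]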
Example #11
def extract_steinsaltz_possibilities(base_text, stein_text, word_punct_pairs):
    """
    strip off intro
    any punct found in bold is extracted separately
    any punct at end of bold or in commentary is combined and any subset is possible (accounting for order that punct appeared)
    """

    # algorithm works better when bold tags are consolidated
    stein_text = re.sub(r'</b>(\s*)<b>', r'\g<1>', stein_text)
    maps = build_maps(base_text, stein_text)
    talmud_word_index = 0
    for ts_map in maps.suite:
        if not ts_map.actually_has_talmud() and (
                isinstance(ts_map.talmud_steinsaltz, SteinsaltzIntro)
                or isinstance(ts_map.talmud_steinsaltz, ConnectedTalmud)):
            continue
        talmud_words = split_by_type(ts_map.reg_talmud, 'words')
        talmud_poss_dict = get_talmud_punct_possibilities(
            ts_map.talmud_steinsaltz.talmud)

        # all punct in talmud portion of stein can theoretically be on any word in talmud (since we don't have a word-to-word mapping)
        word_slice = word_punct_pairs[talmud_word_index:talmud_word_index + len(talmud_words)]
        for tw1, pair_dict in zip(talmud_words, word_slice):
            tw2 = pair_dict['Word']
            assert tw1 == strip_cantillation(tw2, strip_vowels=True), f"{tw1}--{tw2}"
            pair_dict['Punct Possibilities'] += talmud_poss_dict['Punct Possibilities']
            pair_dict['Pre-quote?'] |= talmud_poss_dict['Pre-quote?']
            pair_dict['Post-quote?'] |= talmud_poss_dict['Post-quote?']
            pair_dict['Dash?'] |= talmud_poss_dict['Dash?']

        # last word can have punctuation on it + any combo of punctuation in stein
        stein_poss_dict = get_talmud_punct_possibilities(
            ts_map.talmud_steinsaltz.steinsaltz)
        last_pair = word_punct_pairs[talmud_word_index + len(talmud_words) - 1]
        last_pair['Punct Possibilities'] += stein_poss_dict['Punct Possibilities']
        last_pair['Pre-quote?'] |= stein_poss_dict['Pre-quote?']
        last_pair['Post-quote?'] |= stein_poss_dict['Post-quote?']
        last_pair['Dash?'] |= stein_poss_dict['Dash?']
        talmud_word_index += len(talmud_words)
    return word_punct_pairs
Example #12
def base_tokenizer(str):
    punc_pat = re.compile(ur"(\.|,|:)$")

    str = re.sub(ur"\([^\(\)]+\)", u"", str)
    str = re.sub(r"</?[a-z]+>", "", str)  # get rid of html tags
    str = hebrew.strip_cantillation(str, strip_vowels=True)
    word_list = re.split(ur"\s+", str)
    word_list = [
        re.sub(punc_pat, u"", w).strip() for w in word_list
        if len(re.sub(punc_pat, u"", w).strip()) > 0
    ]  # remove empty strings and punctuation at the end of a word
    return word_list
Example #13
def lookup_shoresh(w, ref):
    # in both - cant
    # only second - cant
    # only first - nikud
    #remove all non-Hebrew non-nikud characters (including cantillation and sof-pasuk)
    w = strip_cantillation(w, strip_vowels=False)
    w = re.sub(ur"[A-Za-z׃׀־]", u"", w)
    lexicon = "BDB Augmented Strong"
    wf = WordForm().load({"form": w, "refs": re.compile("^" + ref + "$")})
    if wf:
        return map(lambda x: x["headword"],
                   filter(lambda x: x["lexicon"] == lexicon, wf.lookups))
Example #14
 def body(self):
     self.load_ref("Job 1")
     expected_heb = 'אִ֛ישׁ הָיָ֥ה בְאֶֽרֶץ־ע֖וּץ אִיּ֣וֹב שְׁמ֑וֹ וְהָיָ֣ה ׀ הָאִ֣ישׁ הַה֗וּא תָּ֧ם וְיָשָׁ֛ר וִירֵ֥א אֱלֹהִ֖ים וְסָ֥ר מֵרָֽע׃'
     expected_eng_closed = 'There was a man in the land of Uz named Job. That man was blameless and upright; he feared God and shunned evil.'
     expected_eng_open = 'THERE was a man in the land of Uz, whose name was Job; and that man was whole-hearted and upright, and one that feared God, and shunned evil.'
     sgmnt_eng = self.get_nth_section_english(1)
     sgmnt_heb = self.get_nth_section_hebrew(1)
     str_eng = sgmnt_eng.text.strip()
     str_heb = sgmnt_heb.text.strip()
     # not sure why, but the strings aren't equal unless vowels are stripped
     expected_heb_stripped = strip_cantillation(expected_heb,
                                                strip_vowels=True)
     str_heb_stripped = strip_cantillation(str_heb, strip_vowels=True)
     assert expected_heb_stripped == str_heb_stripped, "'{}' does not equal '{}'".format(
         expected_heb_stripped, str_heb_stripped)
     assert str_eng in [expected_eng_open, expected_eng_closed
                        ], "'{}' does not equal '{}' or '{}'".format(
                            str_eng, expected_eng_closed, expected_eng_open)
     self.toggle_on_text_settings()
     self.toggle_language_hebrew()
     assert 'hebrew' in self.get_content_language()
     assert 'english' not in self.get_content_language()
     assert 'bilingual' not in self.get_content_language()
     assert self.has_hebrew_text() == True
     assert self.has_english_text() == False
     self.toggle_on_text_settings()
     self.toggle_language_english()
     assert 'hebrew' not in self.get_content_language()
     assert 'english' in self.get_content_language()
     assert 'bilingual' not in self.get_content_language()
     assert self.has_hebrew_text() == False
     assert self.has_english_text() == True
     self.toggle_on_text_settings()
     self.toggle_language_bilingual()
     assert 'hebrew' not in self.get_content_language()
     assert 'english' not in self.get_content_language()
     assert 'bilingual' in self.get_content_language()
     assert self.has_hebrew_text() == True
     assert self.has_english_text() == True
     self.get_content_language()
Example #15
    def make_text_index_document(cls, tref, heTref, version, lang, version_priority, content, categories):
        """
        Create a document for indexing from the text specified by ref/version/lang
        """
        oref = Ref(tref)
        text = TextFamily(oref, context=0, commentary=False, version=version, lang=lang).contents()

        if not content:
            # Don't bother indexing if there's no content
            return False

        content_wo_cant = strip_cantillation(content, strip_vowels=False).strip()
        content_wo_cant = re.sub(r'<[^>]+>', '', content_wo_cant)
        content_wo_cant = re.sub(r'\([^)]+\)', '', content_wo_cant)  # remove all parens
        if len(content_wo_cant) == 0:
            return False

        if getattr(cls.curr_index, "dependence", None) == 'Commentary' and "Commentary" in text["categories"]:  # uch, special casing
            temp_categories = text["categories"][:]
            temp_categories.remove('Commentary')
            temp_categories[0] += " Commentaries"  # this will create an additional bucket for each top level category's commentary
        else:
            temp_categories = categories

        tp = cls.best_time_period
        if tp is not None:
            comp_start_date = int(tp.start)
        else:
            comp_start_date = 3000  # far in the future

        # section_ref = tref[:tref.rfind(u":")] if u":" in tref else (tref[:re.search(ur" \d+$", tref).start()] if re.search(ur" \d+$", tref) is not None else tref)

        ref_data = RefData().load({"ref": tref})
        pagesheetrank = ref_data.pagesheetrank if ref_data is not None else RefData.DEFAULT_PAGERANK * RefData.DEFAULT_SHEETRANK

        return {
            "ref": tref,
            "heRef": heTref,
            "version": version,
            "lang": lang,
            "version_priority": version_priority if version_priority is not None else 1000,
            "titleVariants": text["titleVariants"],
            "categories": temp_categories,
            "order": oref.order_id(),
            "path": "/".join(temp_categories + [cls.curr_index.title]),
            "pagesheetrank": pagesheetrank,
            "comp_date": comp_start_date,
            #"hebmorph_semi_exact": content_wo_cant,
            "content": content_wo_cant if cls.merged else "",  # backwards compat for android
            "exact": content_wo_cant,
            "naive_lemmatizer": content_wo_cant,
        }
Example #16
    def make_text_index_document(cls, tref, heTref, version, lang, version_priority, content, categories):
        """
        Create a document for indexing from the text specified by ref/version/lang
        """
        oref = Ref(tref)
        text = TextFamily(oref, context=0, commentary=False, version=version, lang=lang).contents()

        if not content:
            # Don't bother indexing if there's no content
            return False

        content_wo_cant = strip_cantillation(content, strip_vowels=False).strip()
        content_wo_cant = re.sub(ur'<[^>]+>', u'', content_wo_cant)
        content_wo_cant = re.sub(ur'\([^)]+\)', u'', content_wo_cant)  # remove all parens
        if len(content_wo_cant) == 0:
            return False

        if getattr(cls.curr_index, "dependence", None) == 'Commentary' and "Commentary" in text["categories"]:  # uch, special casing
            temp_categories = text["categories"][:]
            temp_categories.remove('Commentary')
            temp_categories[0] += " Commentaries"  # this will create an additional bucket for each top level category's commentary
        else:
            temp_categories = categories

        tp = cls.best_time_period
        if tp is not None:
            comp_start_date = int(tp.start)
        else:
            comp_start_date = 3000  # far in the future

        # section_ref = tref[:tref.rfind(u":")] if u":" in tref else (tref[:re.search(ur" \d+$", tref).start()] if re.search(ur" \d+$", tref) is not None else tref)

        ref_data = RefData().load({"ref": tref})
        pagesheetrank = ref_data.pagesheetrank if ref_data is not None else RefData.DEFAULT_PAGERANK * RefData.DEFAULT_SHEETRANK

        return {
            "ref": tref,
            "heRef": heTref,
            "version": version,
            "lang": lang,
            "version_priority": version_priority if version_priority is not None else 1000,
            "titleVariants": text["titleVariants"],
            "categories": temp_categories,
            "order": oref.order_id(),
            "path": "/".join(temp_categories + [cls.curr_index.title]),
            "pagesheetrank": pagesheetrank,
            "comp_date": comp_start_date,
            #"hebmorph_semi_exact": content_wo_cant,
            "content": content_wo_cant if cls.merged else u"",  # backwards compat for android
            "exact": content_wo_cant,
            "naive_lemmatizer": content_wo_cant,
        }
Example #17
def lookup_shoresh(w, ref):
    # in both - cant
    # only second - cant
    # only first - nikud
    #remove all non-Hebrew non-nikud characters (including cantillation and sof-pasuk)
    w = strip_cantillation(w, strip_vowels=False)
    w = re.sub(ur"[A-Za-z׃׀־]", u"", w)
    lexicon = "BDB Augmented Strong"
    try:
        wf = WordForm().load({"form": w, "refs": re.compile("^" + ref + "$")})
    except Exception:
        return None
    if wf:
        return map(lambda x: x["headword"], filter(lambda x: x["lexicon"] == lexicon, wf.lookups))
Example #18
def get_snippet_by_seg_ref(source, found):
    """
    based off of library.get_wrapped_refs_string
    :param source:
    :param found:
    :return:
    """
    found_title = found.index.get_title("he")
    found_node = library.get_schema_node(found_title, "he")
    title_nodes = {t: found_node for t in found.index.all_titles("he")}
    all_reg = library.get_multi_title_regex_string(
        set(found.index.all_titles("he")), "he")
    reg = regex.compile(all_reg, regex.VERBOSE)
    source_text = strip_cantillation(source.text("he").text, strip_vowels=True)

    linkified = library._wrap_all_refs_in_string(title_nodes, reg, source_text,
                                                 "he")

    snippets = []
    for match in re.finditer(u"(<a [^>]+>)([^<]+)(</a>)", linkified):
        ref = Ref(match.group(2))
        if ref.normal() == found.section_ref().normal() or ref.normal(
        ) == found.normal():
            start_snip_naive = match.start(2) - 100 if match.start(
                0) >= 100 else 0
            start_snip = linkified.rfind(u" ", 0, start_snip_naive)
            if start_snip == -1:
                start_snip = start_snip_naive
            end_snip_naive = match.end(2) + 100 if match.end(0) + 100 <= len(
                linkified) else len(linkified)
            end_snip = linkified.find(u" ", end_snip_naive)
            if end_snip == -1:
                end_snip = end_snip_naive
            snippets += [
                bleach.clean(linkified[start_snip:end_snip],
                             tags=[],
                             strip=True)
            ]

    if len(snippets) == 0:
        print "zero"
        print found
        linkified = library._wrap_all_refs_in_string(title_nodes, reg,
                                                     source_text, "he")

    if len(snippets) == 0:
        return [source_text]
    return snippets
Example #19
def make_text_index_document(tref, version, lang):
    """
    Create a document for indexing from the text specified by ref/version/lang
    """
    oref = Ref(tref)
    text = TextFamily(oref, context=0, commentary=False, version=version, lang=lang).contents()

    content = text["he"] if lang == 'he' else text["text"]
    if not content:
        # Don't bother indexing if there's no content
        return False

    if isinstance(content, list):
        content = " ".join(content)

    content = bleach.clean(content, strip=True, tags=())
    content = strip_cantillation(content,strip_vowels=True)

    index = oref.index

    tp = index.best_time_period()
    if tp is not None:
        comp_start_date = int(tp.start)
    else:
        comp_start_date = 3000

    return {
        "ref": oref.normal(),
        "ref_order": oref.order_id(),
        "comp_date_int": comp_date_curve(comp_start_date),
        "pagerank": math.log(pagerank_dict[oref.normal()]) + 20 if oref.normal() in pagerank_dict else 1.0,
        "pagerank-original": pagerank_dict[oref.normal()] if oref.normal() in pagerank_dict else 1E-8,
        "version": version,
        "lang": lang,
        "hebmorph-standard": content,
        "hebmorph-exact": content,
        "hebmorph-standard-no-norm": content,
        "hebmorph-exact-no-norm": content,
        "ngram": content,
        "infreq": content,
        "aggresive-ngram": content,
        "naive-lemmatizer": content,
        "comp-date": comp_start_date,
        "original": content
    }
Example #20
def clean(s):
    if len(s) == 0:
        return s
    s = unicodedata.normalize("NFD", s)
    s = strip_cantillation(s, strip_vowels=True)

    # please forgive me...
    # replace common hashem replacements with the tetragrammaton
    s = re.sub(ur"(^|\s)([\u05de\u05e9\u05d5\u05db\u05dc\u05d1]?)(?:\u05d4['\u05f3]|\u05d9\u05d9)($|\s)", ur"\1\2\u05d9\u05d4\u05d5\u05d4\3", s)


    s = re.sub(ur"[,'\":?!;־״׳]", u" ", s)  # purposefully leave out period so we can replace ... later on
    s = re.sub(ur"\([^)]+\)", u" ", s)
    s = re.sub(ur"<[^>]+>", u"", s)
    s = u" ".join(s.split())
    return s
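
For illustration (assuming the clean() above and its imports), the common abbreviation ה' is expanded to the tetragrammaton:

cleaned = clean(u"אמר ה' אל משה")
# expected: u"אמר יהוה אל משה"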
Example #21
def lookup_shoresh(w, ref):
    # in both - cant
    # only second - cant
    # only first - nikud
    #remove all non-Hebrew non-nikud characters (including cantillation and sof-pasuk)
    w = strip_cantillation(w, strip_vowels=False)
    w = re.sub(r"[A-Za-z׃׀־]", "", w)
    lexicon = "BDB Augmented Strong"
    try:
        wf = WordForm().load({"form": w, "refs": re.compile("^" + ref + "$")})
    except Exception:
        return None
    if wf:
        return [x["headword"] for x in wf.lookups if x["lexicon"] == lexicon]
Example #22
def make_mishnaic_training_context():
    training = []
    mishnah_indexes = [library.get_index(ind) for ind in library.get_indexes_in_category("Mishnah")]

    mishnah_indexes += [library.get_index(ind) for ind in library.get_indexes_in_category("Torah")]

    for ind in mishnah_indexes:
        mishna_segs = ind.all_section_refs()
        for seg in mishna_segs:
            first_sec_str = hebrew.strip_cantillation(seg.text('he').as_string(), strip_vowels=True)
            training += [{'language':'mishnaic', 'phrase': util.tokenize_words(p)} for p in first_sec_str.split(u'. ')]

    total_words = 0
    total_phrases = len(training)
    for p in training:
        total_words += len(p['phrase'])

    print 'NUM PHRASES: {} AVG WORDS PER PHRASE: {}'.format(total_phrases,total_words/total_phrases)
    return training
Example #23
    def get_word_form_objects(cls, input_word, lookup_key='form', **kwargs):
        from sefaria.utils.hebrew import is_hebrew, strip_cantillation, has_cantillation
        from sefaria.model import Ref

        lookup_ref = kwargs.get("lookup_ref", None)
        wform_pkey = lookup_key
        if is_hebrew(input_word):
            input_word = strip_cantillation(input_word)
            if not has_cantillation(input_word, detect_vowels=True):
                wform_pkey = 'c_form'
        query_obj = {wform_pkey: input_word}
        if lookup_ref:
            nref = Ref(lookup_ref).normal()
            query_obj["refs"] = {'$regex': '^{}'.format(nref)}
        forms = WordFormSet(query_obj)
        if lookup_ref and len(forms) == 0:
            del query_obj["refs"]
            forms = WordFormSet(query_obj)
        return forms
Example #24
def make_mishnaic_training_context():
    training = []
    mishnah_indexes = [library.get_index(ind) for ind in library.get_indexes_in_category("Mishnah")]

    mishnah_indexes += [library.get_index(ind) for ind in library.get_indexes_in_category("Torah")]

    for ind in mishnah_indexes:
        mishna_segs = ind.all_section_refs()
        for seg in mishna_segs:
            first_sec_str = hebrew.strip_cantillation(seg.text('he').as_string(), strip_vowels=True)
            training += [{'language':'mishnaic', 'phrase':tokenize_words(p)} for p in first_sec_str.split(u'. ')]

    total_words = 0
    total_phrases = len(training)
    for p in training:
        total_words += len(p['phrase'])

    print 'NUM PHRASES: {} AVG WORDS PER PHRASE: {}'.format(total_phrases,total_words/total_phrases)
    return training
Example #25
    def get_word_form_objects(cls, input_word, lookup_key='form', **kwargs):
        from sefaria.model import Ref

        lookup_ref = kwargs.get("lookup_ref", None)
        wform_pkey = lookup_key
        if is_hebrew(input_word):
            # This step technically used to happen in the lookup main method `lexicon_lookup` if there were no initial results, but in case where a
            # consonantal form was supplied in the first place, this optimizes queries.
            input_word = strip_cantillation(input_word)
            if not has_cantillation(input_word, detect_vowels=True):
                wform_pkey = 'c_form'
        query_obj = {wform_pkey: input_word}
        if lookup_ref:
            nref = Ref(lookup_ref).normal()
            query_obj["refs"] = {'$regex': '^{}'.format(nref)}
        forms = WordFormSet(query_obj)
        if lookup_ref and len(forms) == 0:
            del query_obj["refs"]
            forms = WordFormSet(query_obj)
        return forms
Example #26
 def lexicon_lookup(cls, input_str, **kwargs):
     input_str = unicodedata.normalize("NFC", input_str)
     results = cls._single_lookup(input_str, **kwargs)
     if not results or kwargs.get('always_consonants', False):
         results += cls._single_lookup(strip_cantillation(input_str, True), lookup_key='c_form', **kwargs)
     if not kwargs.get('never_split', None) and (len(results) == 0 or kwargs.get("always_split", None)):
         ngram_results = cls._ngram_lookup(input_str, **kwargs)
         results += ngram_results
     if len(results):
         primary_tuples = set()
         query = set() #TODO: optimize number of word form lookups? there can be a lot of duplicates... is it needed?
         for r in results:
             # extract the lookups with "primary" field so it can be used for sorting lookups in the LexiconEntrySet,
             # but also delete it, because it's not part of the query obj
             if "primary" in r:
                 if r["primary"] is True:
                     primary_tuples.add((r["headword"], r["parent_lexicon"]))
                 del r["primary"]
         return LexiconEntrySet({"$or": results}, primary_tuples=primary_tuples)
     else:
         return None
Example #27
def make_text_index_document(tref, version, lang):
    from sefaria.utils.hebrew import strip_cantillation
    """
    Create a document for indexing from the text specified by ref/version/lang
    """
    oref = Ref(tref)
    text = TextFamily(oref, context=0, commentary=False, version=version, lang=lang).contents()

    content = text["he"] if lang == 'he' else text["text"]
    if not content:
        # Don't bother indexing if there's no content
        return False

    if isinstance(content, list):
        content = flatten_list(content)  # deal with multi-dimensional lists as well
        content = " ".join(content)

    content = bleach.clean(content, strip=True, tags=())
    content_wo_cant = strip_cantillation(content, strip_vowels=False)

    if re.match(ur'^\s*[\(\[].+[\)\]]\s*$',content):
        return False #don't bother indexing. this segment is surrounded by parens
Example #28
def make_mishnaic_training():
    training = []
    num_mishnah_per_mesechta = 30000  # effectively all mishnah
    mishnah_indexes = [library.get_index(ind) for ind in library.get_indexes_in_category("Mishnah")]

    mishnah_indexes += [library.get_index(ind) for ind in library.get_indexes_in_category("Torah")]

    mish_set = set()
    num_removed = 0
    for ind in mishnah_indexes:
        mishna_segs = ind.all_section_refs()
        if len(mishna_segs) >= num_mishnah_per_mesechta:
            mishna_segs = mishna_segs[:num_mishnah_per_mesechta]
        for seg in mishna_segs:
            first_sec_str = hebrew.strip_cantillation(seg.text('he').as_string(), strip_vowels=True)
            word_list = tokenize_words(first_sec_str)
            for word in word_list:
                if random.random() > 0.5 and word in mish_set:
                    num_removed += 1
                    continue
                training.append({'word':word,'tag':'mishnaic'})
                mish_set.add(word)
    print "Num Mishna removed: {}".format(num_removed)
    return training
Example #29
def tokenize_words_for_tfidf(text, stopwords):
    from sefaria.utils.hebrew import strip_cantillation

    try:
        text = TextChunk._strip_itags(text)
    except AttributeError:
        pass
    text = strip_cantillation(text, strip_vowels=True)
    text = re.sub(r'<[^>]+>', ' ', text)
    for match in re.finditer(r'\(.*?\)', text):
        if len(match.group().split()) <= 5:
            text = text.replace(match.group(), " ")
    text = re.sub(r'־', ' ', text)
    text = re.sub(
        r'\[[^\[\]]{1,7}\]', '', text
    )  # remove kri but dont remove too much to avoid messing with brackets in talmud
    text = re.sub(r'[A-Za-z.,"?!״:׃]', '', text)
    # replace common hashem replacements with the tetragrammaton
    text = re.sub(
        "(^|\s)([\u05de\u05e9\u05d5\u05db\u05dc\u05d1]?)(?:\u05d4['\u05f3]|\u05d9\u05d9)($|\s)",
        "\\1\\2\u05d9\u05d4\u05d5\u05d4\\3", text)
    # replace common elokim replacement with elokim
    text = re.sub(
        "(^|\s)([\u05de\u05e9\u05d5\u05db\u05dc\u05d1]?)(?:\u05d0\u05dc\u05e7\u05d9\u05dd)($|\s)",
        "\\1\\2\u05d0\u05dc\u05d4\u05d9\u05dd\\3", text)
    words = []
    if len(text) != 0:
        # text = requests.post('https://prefix.dicta.org.il/api', data=json.dumps({'data': text})).text
        # text = re.sub(r'(?<=\s|"|\(|\[|-)[\u05d0-\u05ea]+\|', '', ' ' + text)  # remove prefixes
        text = re.sub('[^\u05d0-\u05ea"]', ' ', text)
        words = list(
            filter(lambda w: w not in stopwords, [
                re.sub('^\u05d5', '', w.replace('"', ''))
                for w in text.split()
            ]))
    return words
Example #30
def make_mishnaic_training():
    training = []
    num_mishnah_per_mesechta = 30000  # effectively all mishnah
    mishnah_indexes = [library.get_index(ind) for ind in library.get_indexes_in_category("Mishnah")]

    mishnah_indexes += [library.get_index(ind) for ind in library.get_indexes_in_category("Torah")]

    mish_set = set()
    num_removed = 0
    for ind in mishnah_indexes:
        mishna_segs = ind.all_section_refs()
        if len(mishna_segs) >= num_mishnah_per_mesechta:
            mishna_segs = mishna_segs[:num_mishnah_per_mesechta]
        for seg in mishna_segs:
            first_sec_str = hebrew.strip_cantillation(seg.text('he').as_string(), strip_vowels=True)
            word_list = util.tokenize_words(first_sec_str)
            for word in word_list:
                if random.random() > 0.45 and word in mish_set:
                    num_removed += 1
                    continue
                training.append({'word':word,'tag':'mishnaic'})
                mish_set.add(word)
    print "Num Mishna removed: {}".format(num_removed)
    return training
Example #31
def tag_testing_naive(text_name,bib_links,seg_len_list,word_list_in,ref_list,test_set_name="test"):
    cal_dh_root = "../../dibur_hamatchil/dh_source_scripts/cal_matcher_output"
    jba_count = 0
    curr_state = "" #state should be retained, even b/w dafs
    #caldb_words = json.load(codecs.open("caldb_words_{}.json".format(text_name), "r", encoding="utf-8"))
    for iref,ref in enumerate(ref_list):
        curr_seg_len_list = seg_len_list[iref]
        curr_bib_links = bib_links[iref]
        curr_word_list_in = word_list_in[iref]

        daf = ref.__str__().replace("{} ".format(text_name),"").encode('utf8')

        try:
            cal_pre_tagged_words = \
            json.load(codecs.open("{}/{}/lang_naive_talmud/lang_naive_talmud_{}.json".format(cal_dh_root,text_name,daf), "r", encoding="utf8"))
        except IOError:
            cal_pre_tagged_words = None

        jbaforms = json.load(codecs.open("JBAHashtable.json","rb",encoding='utf8'))

        word_list_out = []
        count = 0
        main_i = 0

        while main_i < len(curr_seg_len_list):
            seg_len = curr_seg_len_list[main_i]
            bib_linkset = curr_bib_links[main_i]
            seg = curr_word_list_in[count:count+seg_len]
            count += seg_len

            b_start = -1; b_end = -1
            if len(bib_linkset) > 0:
                for bib_link in bib_linkset:
                    #there is an assumption here that the links to Tanakh are always 1
                    try:
                        bib_seg = tokenize_words(hebrew.strip_cantillation(Ref(bib_link.refs[1]).text('he').as_string(),strip_vowels=True),strip_html=True)
                        b_start,b_end = match_segments(seg, bib_seg)
                    except InputError:
                        continue
            for i,word in enumerate(seg):
                state_switch_pat = re.compile(r"\<big\>\<strong\>[^\<\>]+\</strong\>\</big\>")
                if re.match(state_switch_pat,word):
                    if curr_state == "mishnaic":
                        curr_state = "talmudic"
                    elif curr_state == "talmudic" or curr_state == "":
                        curr_state = "mishnaic"

                cal_obj = None
                if b_start != -1 and b_end != -1 and i in xrange(b_start,b_end):
                    lang = "biblical"
                elif curr_state == "talmudic":
                    #lang = cal_pre_tagged_words[count-seg_len+i]["class"]
                    if not cal_pre_tagged_words is None:
                        try:
                            cal_obj = cal_pre_tagged_words["words"][count-seg_len+i]
                            if cal_obj["class"] == "unknown":
                                if word in jbaforms and len(jbaforms[word]) == 1 and False:
                                    temp_cal_obj = jbaforms[word][0].copy()
                                    if temp_cal_obj["word"][-1] != "'" and temp_cal_obj["head_word"][-1] != "_":
                                        cal_obj = temp_cal_obj
                                        cal_obj["jba_word"] = cal_obj["word"]
                                        cal_obj["word"] = word
                                        cal_obj["class"] = "talmud"
                                        jba_count += 1
                        except IndexError:
                            break
                elif curr_state == "mishnaic":
                    lang = "mishnaic"
                else:
                    lang = "unknown"

                if cal_obj:
                    word_list_out.append(cal_obj)
                else:
                    word_list_out.append({"word":word,"class":lang})
            main_i += 1
        missed_words = [] if cal_pre_tagged_words is None else cal_pre_tagged_words["missed_words"]
        doc = {"words":word_list_out,"missed_words":missed_words}
        fp = codecs.open("{}/{}/test_set/{}_naive_{}.json".format(cal_dh_root,text_name,test_set_name,daf), "w", encoding='utf-8')
        json.dump(doc, fp, indent=4, encoding='utf-8', ensure_ascii=False)

    print "NUM JBA WORDS: {}".format(jba_count)
Example #32
def make_training_sets(type):
    if type is "biblical":
        tanach_indexes = [library.get_index(ind) for ind in library.get_indexes_in_category("Tanakh") if not ind in ("Daniel","Ezra","Nehemia")]
        tanach_dict = {}
        for ind in tanach_indexes:
            all_secs = ind.all_section_refs()
            for sec in all_secs:
                sec_str = hebrew.strip_cantillation(sec.text('he').as_string(),strip_vowels=True)
                word_list = tokenize_words(sec_str)
                for word in word_list:
                    if word:
                        two_letter = get_two_letter_word(word)
                        if two_letter:
                            temp_list = set(tanach_dict[two_letter]) if two_letter in tanach_dict else set()
                            temp_list.add(word)
                            tanach_dict[two_letter] = list(temp_list)
        fp = codecs.open("biblical_2_letters_training.json","w",encoding='utf-8')
        json.dump(tanach_dict, fp,indent=4, encoding='utf-8', ensure_ascii=False)
    elif type is "mishnaic":
        num_mishnah_per_mesechta = 30000 #effectively all mishnah
        mishnah_indexes = [library.get_index(ind) for ind in library.get_indexes_in_category("Mishnah")]
        mishnah_dict = {}
        for ind in mishnah_indexes:
            mishna_segs = ind.all_section_refs()
            if len(mishna_segs) >= num_mishnah_per_mesechta:
                mishna_segs = mishna_segs[:num_mishnah_per_mesechta]
            for seg in mishna_segs:
                if len(seg.linkset().filter("Tanakh")) > 0:
                    #avoid mishnahs that quote tanakh to not mix languages
                    continue

                first_sec_str = hebrew.strip_cantillation(seg.text('he').as_string(),strip_vowels=True)
                word_list = tokenize_words(first_sec_str)
                for word in word_list:
                    if word:
                        two_letter = get_two_letter_word(word)
                        if two_letter:
                            temp_list = set(mishnah_dict[two_letter]) if two_letter in mishnah_dict else set()
                            temp_list.add(word)
                            mishnah_dict[two_letter] = list(temp_list)
        fp = codecs.open("mishnaic_2_letters_training.json","w",encoding='utf-8')
        json.dump(mishnah_dict, fp,indent=4, encoding='utf-8', ensure_ascii=False)
    elif type is "talmudic":
        talmud_dict = {}
        talmud_dbs = {
            ("caldb.txt",cal_tools.parseCalLine),
            ("jbaforms.txt",cal_tools.parseJBALine)
        }
        for db in talmud_dbs:
            with open(db[0],"r") as caldb:
                for line in caldb:
                    line_obj = db[1](line,True,False)
                    try:
                        word = line_obj["word"]
                    except KeyError:
                        print "continuing"
                        continue
                    if word:
                        two_letter = get_two_letter_word(word)
                        if two_letter:
                            temp_list = set(talmud_dict[two_letter]) if two_letter in talmud_dict else set()
                            temp_list.add(word)
                            talmud_dict[two_letter] = list(temp_list)
                    head_word = line_obj["head_word"]
                    if head_word:
                        two_letter = get_two_letter_word(head_word)
                        if two_letter:
                            temp_list = set(talmud_dict[two_letter]) if two_letter in talmud_dict else set()
                            temp_list.add(head_word)
                            talmud_dict[two_letter] = list(temp_list)

        fp = codecs.open("talmudic_2_letters_training.json", "w", encoding='utf-8')
        json.dump(talmud_dict, fp, indent=4, encoding='utf-8', ensure_ascii=False)
Example #33
def tag_testing_naive(text_name,bib_links,seg_len_list,word_list_in,test_set_name="test"):
    curr_state = ""
    caldb_words = json.load(codecs.open("caldb_words_{}.json".format(text_name), "r", encoding="utf-8"))
    cal_words = caldb_words["words"]
    cal_head_words = caldb_words["head_words"]
    word_list_out = []
    count = 0

    main_i = 0
    while main_i < len(seg_len_list):
        seg_len = seg_len_list[main_i]
        bib_linkset = bib_links[main_i]
        seg = word_list_in[count:count+seg_len]
        count += seg_len

        b_start = -1; b_end = -1
        if len(bib_linkset) > 0:
            for bib_link in bib_linkset:
                #there is an assumption here that the links to Tanakh are always 1
                try:
                    bib_seg = tokenize_words(hebrew.strip_cantillation(Ref(bib_link.refs[1]).text('he').as_string(),strip_vowels=True),strip_html=True)
                    b_start,b_end = match_segments(seg, bib_seg)
                except InputError:
                    continue
        for i,word in enumerate(seg):
            state_switch_pat = re.compile(r"\<big\>\<strong\>[^\<\>]+\</strong\>\</big\>")
            if re.match(state_switch_pat,word):
                if curr_state == "mishnaic":
                    curr_state = "talmudic"
                elif curr_state == "talmudic" or curr_state == "":
                    curr_state = "mishnaic"
            if b_start != -1 and b_end != -1 and i in xrange(b_start,b_end):
                lang = "biblical"
            elif curr_state == "talmudic":
                lang = "unknown"

            elif curr_state == "mishnaic":
                lang = "mishnaic"
            else:
                lang = "unknown"
            word_list_out.append({"word":word,"class":lang})
        main_i += 1

    curr_state = ""
    cal_count = 0
    main_i = 0
    num_rounds_without_matches = 0
    last_match_i = 0
    in_backtrack = False
    while main_i < len(word_list_out) and cal_count + 4 < len(cal_words):
        if num_rounds_without_matches > 15:
            if in_backtrack:
                cal_count -= 4
                in_backtrack = False
            else:
                main_i = last_match_i
                cal_count += 4
                in_backtrack = True
            num_rounds_without_matches = 0
            print "back track!"
        temp_tal_words = word_list_out[main_i:main_i+10]
        cal_ngram_list = []
        for inner_cal_count in range(4):
            temp_ngram = Cal_ngram(cal_words[cal_count+inner_cal_count:cal_count+inner_cal_count+4],cal_head_words[cal_count+inner_cal_count:cal_count+inner_cal_count+4],temp_tal_words,main_i,skip_penalty=min(inner_cal_count,1))
            temp_ngram.find()
            cal_ngram_list.append(temp_ngram)
        best_ngram_score = -1
        best_ngram = None
        best_ngram_index = -1
        for i,cng in enumerate(cal_ngram_list):
            if cng.score < best_ngram_score or best_ngram_score == -1:
                best_ngram_score = cng.score
                best_ngram = cng
                best_ngram_index = i
        if len(best_ngram.matched_indexes) == 0:
            temp_tags = best_ngram.curr_tagged_words
            start_match_pos = best_ngram.start_pos
            num_rounds_without_matches += 1
        else:
            in_backtrack = False
            num_rounds_without_matches = 0
            last_match_i = main_i+len(temp_tags)
            cal_count += (best_ngram_index+1)
            start_match_pos = best_ngram.start_pos
            temp_tags = best_ngram.curr_tagged_words[:best_ngram.matched_indexes[0]-start_match_pos+1]
        word_list_out[start_match_pos:start_match_pos+len(temp_tags)] = temp_tags
        main_i += len(temp_tags)

    doc = {}
    doc["words"] = word_list_out
    fp = codecs.open("{}_naive.json".format(test_set_name), "w", encoding='utf-8')
    json.dump(doc, fp, indent=4, encoding='utf-8', ensure_ascii=False)
Example #34
def get_snippet_by_seg_ref(source_tc, found, must_find_snippet=False, snip_size=100, use_indicator_words=False, return_matches=False):
    """
    based off of library.get_wrapped_refs_string
    :param source:
    :param found:
    :param must_find_snippet: bool, True if you only want to return a str if you found a snippet
    :param snip_size int number of chars in snippet on each side
    :param use_indicator_words bool, True if you want to use hard-coded indicator words to determine which side of the ref the quote is on
    :return:
    """
    after_indicators = [u"דכתיב", u"ודכתיב", u"וכתיב", u"וכתוב", u"שכתוב", u"כשכתוב", u"כדכתיב", u"זל", u"ז״ל", u"ז''ל",
                       u"ז\"ל", u"אומרם", u"כאמור", u"ואומר", u"אמר", u"שנאמר", u"בגמ'", u"בגמ׳", u"בפסוק", u"לעיל", u"ולעיל", u"לקמן", u"ולקמן", u"בירושלמי",
                       u"בבבלי", u"שדרשו", u"ששנינו", u"שנינו", u"ושנינו", u"דשנינו", u"כמש״כ", u"כמש\"כ", u"כמ״ש", u"כמ\"ש",
                       u"וכמש״כ", u"וכמ\"ש", u"וכמ״ש", u"וכמש\"כ", u"ע״ה", u"ע\"ה", u"מבואר", u"כמבואר", u"במתני׳",
                       u"במתנ\'", u"דתנן", u"זכרונם לברכה", u"זכר לברכה"]
    after_reg = ur"(?:^|\s)(?:{})\s*[(\[]?$".format(u"|".join(after_indicators))
    after_indicators_far = [u"דבפרק", u"בפרק", u"שבפרק", u"פרק"]
    after_far_reg = ur"(?:^|\s)(?:{})(?=\s|$)".format(u"|".join(after_indicators_far))
    after_indicators_after = [u"בד״ה", u"בד\"ה", u"ד״ה", u"ד\"ה"]
    after_after_reg = ur"^\s*(?:{})\s".format(u"|".join(after_indicators_after))
    punctuation = [u",", u".", u":", u"?", u"!", u"׃"]
    punctuation_after_reg = ur"^\s*(?:{})\s".format(u"|".join(punctuation))
    punctuation_before_reg = ur"(?:{})\s*$".format(u"|".join(punctuation))
    after_indicators_after_far = [u"וגו׳", u"וגו'", u"וגו", u"וכו׳", u"וכו'", u"וכו"]
    after_after_far_reg = ur"(?:^|\s)(?:{})(?=\s|$)".format(u"|".join(after_indicators_after_far))
    found_title = found.index.get_title("he")
    found_node = library.get_schema_node(found_title, "he")
    title_nodes = {t: found_node for t in found.index.all_titles("he")}
    all_reg = library.get_multi_title_regex_string(set(found.index.all_titles("he")), "he")
    reg = regex.compile(all_reg, regex.VERBOSE)
    source_text = re.sub(ur"<[^>]+>", u"", strip_cantillation(source_tc.text, strip_vowels=True))

    linkified = library._wrap_all_refs_in_string(title_nodes, reg, source_text, "he")

    snippets = []
    found_normal = found.normal()
    found_section_normal = re.match(ur"^[^:]+", found_normal).group()
    for match in re.finditer(u"(<a [^>]+>)([^<]+)(</a>)", linkified):
        ref = get_tc(match.group(2), True)
        if ref.normal() == found_section_normal or ref.normal() == found_normal:
            if return_matches:
                snippets += [match]
            else:
                start_snip_naive = match.start(1) - snip_size if match.start(1) >= snip_size else 0
                start_snip_space = linkified.rfind(u" ", 0, start_snip_naive)
                start_snip_link = linkified.rfind(u"</a>", 0, match.start(1))
                start_snip = max(start_snip_space, start_snip_link)
                if start_snip == -1:
                    start_snip = start_snip_naive
                end_snip_naive = match.end(3) + snip_size if match.end(3) + snip_size <= len(linkified) else len(linkified)
                end_snip_space = linkified.find(u" ", end_snip_naive)
                end_snip_link = linkified.find(u"<a ", match.end(3))
                end_snip = min(end_snip_space, end_snip_link)
                if end_snip == -1:
                    end_snip = end_snip_naive

                if use_indicator_words:
                    before_snippet = linkified[start_snip:match.start(1)]
                    if u"ירושלמי" in before_snippet[-20:] and (len(ref.index.categories) < 2 or ref.index.categories[1] != u'Yerushalmi'):
                        # this guy's not a Yerushalmi but very likely should be
                        continue
                    after_snippet = linkified[match.end(3):end_snip]
                    if re.search(after_reg, before_snippet) is not None:
                        temp_snip = after_snippet
                        # print before_snippet
                    else:
                        temp_snip = linkified[start_snip:end_snip]
                else:
                    temp_snip = linkified[start_snip:end_snip]
                snippets += [re.sub(ur"<[^>]+>", u"", temp_snip)]

    if len(snippets) == 0:
        if must_find_snippet:
            return None
        return [source_text]

    return snippets
Example #35
def tag_testing_naive(text_name,bib_links,seg_len_list,word_list_in,ref_list,test_set_name="test"):
    cal_dh_root = "../../dibur_hamatchil/dh_source_scripts/cal_matcher_output"
    jba_count = 0
    curr_state = "" #state should be retained, even b/w dafs
    #caldb_words = json.load(codecs.open("caldb_words_{}.json".format(text_name), "r", encoding="utf-8"))
    for iref,ref in enumerate(ref_list):
        curr_seg_len_list = seg_len_list[iref]
        curr_bib_links = bib_links[iref]
        curr_word_list_in = word_list_in[iref]

        daf = ref.__str__().replace("{} ".format(text_name),"").encode('utf8')

        try:
            cal_pre_tagged_words = \
            json.load(codecs.open("{}/{}/lang_naive_talmud/lang_naive_talmud_{}.json".format(cal_dh_root,text_name,daf), "r", encoding="utf8"))
        except IOError:
            cal_pre_tagged_words = None

        jbaforms = json.load(codecs.open("JBAHashtable.json","rb",encoding='utf8'))

        word_list_out = []
        count = 0
        main_i = 0

        while main_i < len(curr_seg_len_list):
            seg_len = curr_seg_len_list[main_i]
            bib_linkset = curr_bib_links[main_i]
            seg = curr_word_list_in[count:count+seg_len]
            count += seg_len

            b_start = -1; b_end = -1
            if len(bib_linkset) > 0:
                for bib_link in bib_linkset:
                    #there is an assumption here that the links to Tanakh are always 1
                    try:
                        bib_seg = tokenize_words(hebrew.strip_cantillation(Ref(bib_link.refs[1]).text('he').as_string(),strip_vowels=True),strip_html=True)
                        b_start,b_end = match_segments(seg, bib_seg)
                    except InputError:
                        continue
            for i,word in enumerate(seg):
                state_switch_pat = re.compile(r"\<big\>\<strong\>[^\<\>]+\</strong\>\</big\>")
                if re.match(state_switch_pat,word):
                    if curr_state == "mishnaic":
                        curr_state = "talmudic"
                    elif curr_state == "talmudic" or curr_state == "":
                        curr_state = "mishnaic"

                cal_obj = None
                if b_start != -1 and b_end != -1 and i in xrange(b_start,b_end):
                    lang = "biblical"
                elif curr_state == "talmudic":
                    #lang = cal_pre_tagged_words[count-seg_len+i]["class"]
                    if not cal_pre_tagged_words is None:
                        try:
                            cal_obj = cal_pre_tagged_words["words"][count-seg_len+i]
                            if cal_obj["class"] == "unknown":
                                if word in jbaforms and len(jbaforms[word]) == 1 and False:
                                    temp_cal_obj = jbaforms[word][0].copy()
                                    if temp_cal_obj["word"][-1] != "'" and temp_cal_obj["head_word"][-1] != "_":
                                        cal_obj = temp_cal_obj
                                        cal_obj["jba_word"] = cal_obj["word"]
                                        cal_obj["word"] = word
                                        cal_obj["class"] = "talmud"
                                        jba_count += 1
                        except IndexError:
                            break
                elif curr_state == "mishnaic":
                    lang = "mishnaic"
                else:
                    lang = "unknown"

                if cal_obj:
                    word_list_out.append(cal_obj)
                else:
                    word_list_out.append({"word":word,"class":lang})
            main_i += 1
        missed_words = [] if cal_pre_tagged_words is None else cal_pre_tagged_words["missed_words"]
        doc = {"words":word_list_out,"missed_words":missed_words}
        fp = codecs.open("{}/{}/test_set/{}_naive_{}.json".format(cal_dh_root,text_name,test_set_name,daf), "w", encoding='utf-8')
        json.dump(doc, fp, indent=4, encoding='utf-8', ensure_ascii=False)

    print "NUM JBA WORDS: {}".format(jba_count)