Ejemplo n.º 1
0
    def find_refs(self, text):
        """Print every maximal run of Hebrew characters found in ``text``.

        Each character is classified individually with ``is_hebrew``.  A run
        starts at the first Hebrew character and stops just before the next
        non-Hebrew one; a run that is still open when ``text`` ends is
        printed as well.

        :param text: the string to scan
        :return: always ``True``
        """
        begin = -1  # start index of the run in progress, -1 when there is none
        for i, letter in enumerate(text):
            if is_hebrew(letter):
                if begin == -1:
                    begin = i
            elif begin != -1:
                print(text[begin:i])
                begin = -1

        # A run that reaches the end of the text has no terminating character,
        # so it has to be flushed explicitly.
        if begin != -1:
            print(text[begin:])

        return True
Ejemplo n.º 2
0
 def is_hebrew(self):
     """Returns True if this sheet appears to be in Hebrew according to its title.

     The title is passed through ``strip_tags`` first.  It counts as Hebrew
     when ``sefaria.utils.hebrew.is_hebrew`` accepts it and it contains no
     ASCII Latin letter.
     """
     from sefaria.utils.hebrew import is_hebrew
     import regex
     title = strip_tags(self.title)
     # Inside a character class "|" is a literal pipe, not alternation, so the
     # class lists only the two letter ranges; with the pipe included, a
     # Hebrew title containing "|" was treated as having English characters.
     return is_hebrew(title) and not regex.search(u"[a-zA-Z]", title)
def add_more_mishnah_titles():
    """Add the titles listed in the Mishnah corrections CSV to their topics.

    Only rows whose error type is 'title' are processed.  The title has its
    cantillation and vowels stripped; rows marked 'TYPO', rows without a slug,
    rows whose slug starts with 'BONAYICH', rows whose slug has no topic and
    rows with an empty title are skipped (some with a printed notice).  Every
    remaining title is appended to the topic's titles and the topic is saved.
    """
    from sefaria.utils.hebrew import is_hebrew, strip_cantillation

    corrections_path = "/home/nss/sefaria/datasets/ner/sefaria/temp/Rabbis in Mishnah Corrections - cross_validated_by_language.csv"
    error_type_column = 'Error Type (rabbi, title, mistake, correct, skip)'

    with open(corrections_path, "r") as fin:
        for row in csv.DictReader(fin):
            # TODO deal with mistakes
            if row[error_type_column] != 'title':
                continue

            cleaned_title = strip_cantillation(row['Missing Title'], strip_vowels=True)
            if cleaned_title == 'TYPO':
                continue

            slug = row['Missing Title Slug']
            if len(slug) == 0:
                print('NO MISSING TITLE SLUG', row)
                continue
            if slug.startswith('BONAYICH'):
                continue

            topic = Topic.init(slug)
            if topic is None:
                print("NO TOPIC FOR SLUG", slug, row)
                continue

            if len(cleaned_title) == 0:
                print("ZERO LEN NEW TITLE", row)
                continue

            if is_hebrew(cleaned_title):
                title_lang = "he"
            else:
                title_lang = "en"
            topic.titles += [{"text": cleaned_title, "lang": title_lang}]
            topic.save()
Ejemplo n.º 4
0
def bulktext_api(request, refs):
    """
    Used by the linker.  Returns the Hebrew and English text of several refs
    in one response.

    :param request: the incoming request.  Only GET is supported; the optional
        query parameters "callback" and "useTextFamily" are read from it.
    :param refs: "|"-separated string of textual refs; duplicates are collapsed.
    :return: a jsonResponse mapping each ref string to its text data, or to
        {"error": 1} when the ref could not be processed.  Any other HTTP
        method gets a JSON error object.
    """
    if request.method != "GET":
        # Previously the function fell off the end here and returned None.
        return jsonResponse({"error": "Unsupported HTTP method."})

    cb = request.GET.get("callback", None)
    useTextFamily = request.GET.get("useTextFamily", None)
    res = {}
    for tref in set(refs.split("|")):
        try:
            oref = model.Ref(tref)
            lang = "he" if is_hebrew(tref) else "en"
            if useTextFamily:
                text_fam = model.TextFamily(oref,
                                            commentary=0,
                                            context=0,
                                            pad=False)
                res[tref] = {
                    'he': text_fam.he,
                    'en': text_fam.text,
                    'lang': lang,
                    'ref': oref.normal(),
                    'primary_category':
                    text_fam.contents()['primary_category'],
                    'heRef': oref.he_normal(),
                    'url': oref.url()
                }
            else:
                he = model.TextChunk(oref, "he").text
                en = model.TextChunk(oref, "en").text
                # Non-string text is flattened to a single string;
                # these could be flattened on the client, if need be.
                res[tref] = {
                    'he': he if isinstance(he, basestring) else
                    JaggedTextArray(he).flatten_to_string(),
                    'en': en if isinstance(en, basestring) else
                    JaggedTextArray(en).flatten_to_string(),
                    'lang': lang,
                    'ref': oref.normal(),
                    'heRef': oref.he_normal(),
                    'url': oref.url()
                }
        except (InputError, ValueError, AttributeError, KeyError) as e:
            # referer = request.META.get("HTTP_REFERER", "unknown page")
            # This chatter fills up the logs.  todo: put in it's own file
            # logger.warning(u"Linker failed to parse {} from {} : {}".format(tref, referer, e))
            res[tref] = {"error": 1}
    return jsonResponse(res, cb)
Ejemplo n.º 5
0
 def _do_search(self):
     """Search the haystack for a citation of the needle title.

     Asks the library for the regex string of ``self._needle`` (language
     chosen by ``is_hebrew``), compiles it with ``re.VERBOSE`` and returns
     the result of searching ``self._haystack`` (a match object or None).
     """
     needle = self._needle
     if is_hebrew(needle):
         lang = "he"
     else:
         lang = "en"
     pattern = re.compile(
         m.library.get_regex_string(
             needle, lang, for_js=True, anchored=False, capture_title=False, parentheses=self._with_parenthesis),
         re.VERBOSE)
     return pattern.search(self._haystack)
Ejemplo n.º 6
0
    def _single_lookup(cls, input_word, lookup_key='form', **kwargs):
        """Collect headword queries for every word form matching ``input_word``.

        Hebrew input has its cantillation stripped first; if
        ``has_cantillation(input_word, detect_vowels=True)`` is then false,
        the 'c_form' key is queried instead of ``lookup_key``.  When a
        "lookup_ref" keyword is supplied, the query is first limited to forms
        whose "refs" match the normalized ref as a prefix regex; if that finds
        nothing, the limit is removed and the query is run again.

        :param input_word: the word to look up
        :param lookup_key: word-form field to match on (default 'form')
        :param kwargs: may carry "lookup_ref"
        :return: list of ``{'headword': ...}`` dicts, one per lookup of each
            matching form; an empty list when no form matches
        """
        from sefaria.utils.hebrew import is_hebrew, strip_cantillation, has_cantillation
        from sefaria.model import Ref

        lookup_ref = kwargs.get("lookup_ref", None)
        form_key = lookup_key
        if is_hebrew(input_word):
            input_word = strip_cantillation(input_word)
            if not has_cantillation(input_word, detect_vowels=True):
                form_key = 'c_form'

        query = {form_key: input_word}
        if lookup_ref:
            query["refs"] = {'$regex': '^{}'.format(Ref(lookup_ref).normal())}
        matches = WordFormSet(query)
        if lookup_ref and matches.count() == 0:
            # Nothing recorded for this ref: retry without the ref restriction
            del query["refs"]
            matches = WordFormSet(query)

        if matches.count() <= 0:
            return []
        # TODO: if we want the 'lookups' in wf to be a dict we can pass as is to the lexiconentry, we need to change the key 'lexicon' to 'parent_lexicon' in word forms
        return [{'headword': lookup['headword']}
                for word_form in matches
                for lookup in word_form.lookups]
Ejemplo n.º 7
0
def title_regex_api(request, titles):
    """
    Return the library's JS-flavoured regex string for each requested title.

    :param request: the incoming request.  Only GET is supported; reads the
        optional "callback" and "parentheses" (integer flag) query parameters.
    :param titles: "|"-separated string of titles; duplicates are collapsed.
    :return: a jsonResponse mapping each title to its regex string, plus an
        "error" list describing the titles that failed; a JSON error object
        for any other HTTP method.
    """
    if request.method != "GET":
        return jsonResponse({"error": "Unsupported HTTP method."})

    cb = request.GET.get("callback", None)
    parentheses = bool(int(request.GET.get("parentheses", False)))
    res = {}
    errors = []
    for title in set(titles.split("|")):
        if is_hebrew(title):
            lang = "he"
        else:
            lang = "en"
        try:
            res[title] = model.library.get_regex_string(
                title,
                lang,
                anchored=False,
                for_js=True,
                parentheses=parentheses)
        except (AttributeError, AssertionError) as e:
            # There are normal errors here, when a title matches a schema node, the chatter fills up the logs.
            # logger.warning(u"Library._build_ref_from_string() failed to create regex for: {}.  {}".format(title, e))
            errors.append("{} : {}".format(title, e))
    if errors:
        res["error"] = errors
    return jsonResponse(res, cb)
Ejemplo n.º 8
0
    def test_regex_string_he_in_parentheses_only(self):
        """With ``parentheses=True`` the title regex is checked against a
        parenthesised sample (expects HTTP 200) and a bare one (expects 404).

        The verbose regex returned by the library has its comments and
        whitespace removed before use.  The test performs live HTTP requests
        against www.sefaria.org.il.
        """
        st1 = '(ובויקרא כ"ה)'
        st2 = 'ובויקרא כ"ה'
        title = 'ויקרא'

        lang = "he" if is_hebrew(title) else "en"
        res = m.library.get_regex_string(title,
                                         lang,
                                         for_js=True,
                                         anchored=False,
                                         capture_title=False,
                                         parentheses=True)
        # Raw strings: "\s" in an ordinary literal is an invalid escape
        # sequence, which newer Python versions warn about.
        without_comments = re.sub(r'\s*?#.*?\n', '', res)
        res_no_comments = re.sub(r'\s+', '', without_comments)

        match = re.search(res_no_comments, st1)
        match_string = '' if not match else match.group().replace(
            match.group(1), '')
        resp = requests.get(
            "https://www.sefaria.org.il/{}".format(match_string))
        assert resp.status_code == 200

        match = re.search(res_no_comments, st2)
        match_string = 'no match' if not match else match.group().replace(
            match.group(1), '')
        resp = requests.get(
            "https://www.sefaria.org.il/{}".format(match_string))
        assert resp.status_code == 404
def check_rabi_rav_results():
    """Report corrections whose title cannot be found in its own snippet.

    Reads the Rabi/Rav error CSV under ``DATASET_LOC``.  For every row of
    type 'title' or 'rabbi' the snippet is normalized (with '~' removed) and
    searched with the rabbi regex built from the row's title.  'rabbi' rows
    get a second attempt with the 'Missing Rabbi Title in Text' column when
    it is non-empty.  Rows that still do not match are printed.
    """
    from research.knowledge_graph.named_entity_recognition.ner_tagger import TextNormalizer
    from sefaria.utils.hebrew import is_hebrew

    def search_title(candidate, haystack):
        # Build the rabbi regex for the candidate title and look for it
        pattern = TextNormalizer.get_rabbi_regex(
            TextNormalizer.myunidecode(candidate.strip()))
        return re.search(pattern, haystack)

    with open(f"{DATASET_LOC}/Fix Rabi and Rav Errors - rav_rabbi_errors.csv",
              "r") as fin:
        rows = list(csv.DictReader(fin))

    # check titles appear in text
    for row in rows:
        typ = row['Error Type (rabbi, title, mistake, correct)']
        is_heb = is_hebrew(row['Snippet'])
        text = TextNormalizer.normalize_text('he' if is_heb else 'en',
                                             row['Snippet'].replace('~', ''))
        if typ == 'title':
            title = row['Missing Title']
        elif typ == 'rabbi':
            title = row[f"Missing Rabbi {'Hebrew' if is_heb else 'English'}"]
        else:
            continue

        if search_title(title, text):
            continue

        # Second chance for rabbis: the title as it is written in the text
        if typ == 'rabbi' and len(row['Missing Rabbi Title in Text']) > 0:
            title = row['Missing Rabbi Title in Text']
            if search_title(title, text):
                continue

        print(f"MISSED '{title}':", text, row['Ref'])
Ejemplo n.º 10
0
def bulktext_api(request, refs):
    """
    Used by the linker.  Returns the Hebrew and English text of several refs
    in one response.

    :param request: the incoming request.  Only GET is supported; the optional
        "callback" query parameter is read from it.
    :param refs: "|"-separated string of textual refs; duplicates are collapsed.
    :return: a jsonResponse (with a wildcard Access-Control-Allow-Origin
        header) mapping each ref string to its text data, or to {"error": 1}
        when the ref could not be processed.  Any other HTTP method gets a
        JSON error object.
    """
    if request.method != "GET":
        # Previously the function fell off the end here and returned None.
        resp = jsonResponse({"error": "Unsupported HTTP method."})
        resp['Access-Control-Allow-Origin'] = '*'
        return resp

    cb = request.GET.get("callback", None)
    res = {}
    for tref in set(refs.split("|")):
        try:
            oref = model.Ref(tref)
            lang = "he" if is_hebrew(tref) else "en"
            he = model.TextChunk(oref, "he").text
            en = model.TextChunk(oref, "en").text
            # Non-string text is flattened to a single string;
            # these could be flattened on the client, if need be.
            res[tref] = {
                'he': he if isinstance(he, basestring) else JaggedTextArray(he).flatten_to_string(),
                'en': en if isinstance(en, basestring) else JaggedTextArray(en).flatten_to_string(),
                'lang': lang,
                'ref': oref.normal(),
                'heRef': oref.he_normal(),
                'url': oref.url()
            }
        except (InputError, ValueError, AttributeError) as e:
            referer = request.META.get("HTTP_REFERER", "unknown page")
            logger.warning(u"Linker failed to parse {} from {} : {}".format(tref, referer, e))
            res[tref] = {"error": 1}
    resp = jsonResponse(res, cb)
    resp['Access-Control-Allow-Origin'] = '*'
    return resp
Ejemplo n.º 11
0
def extract_form_tuples(csv_row):
    """Turn one CSV row into a list of ``(form, language)`` tuples.

    Columns 0 and 1 hold a single English and a single Hebrew form and are
    always included.  Columns 2 and 3 hold comma-separated English and Hebrew
    forms.  Column 4 holds comma-separated forms of either language, tagged
    via ``is_hebrew``.  Entries are stripped, and entries that are empty
    after stripping (e.g. from "a, ,b" or a trailing ", ") are dropped.

    :param csv_row: indexable row with at least five string columns
    :return: list of ``(form, 'eng' | 'heb')`` tuples in column order
    """
    forms = [(csv_row[0].strip(), 'eng'), (csv_row[1].strip(), 'heb')]
    # fixed_lang is None for the mixed column, whose language is detected
    for column, fixed_lang in ((2, 'eng'), (3, 'heb'), (4, None)):
        for raw in csv_row[column].split(","):
            form = raw.strip()
            if not form:
                # Filter after stripping so whitespace-only entries vanish
                continue
            if fixed_lang is None:
                lang = 'heb' if is_hebrew(form) else 'eng'
            else:
                lang = fixed_lang
            forms.append((form, lang))
    return forms
def add_langs_to_topics(topic_list: list, use_as_typed=True, backwards_compat_lang_fields: dict = None) -> list:
	"""
	Return a new list of topic dicts, each a copy of the input dict extended with 'en' and 'he' titles.
	Topics whose slug is not found by the TopicSet query are left out of the result.
	:param list topic_list: list of topics where each item is dict of form {'slug': required, 'asTyped': read when use_as_typed is True}
	:param bool use_as_typed: when True, 'asTyped' supplies the title for its own language (decided by is_hebrew) and the topic's primary title fills the other language; when False both languages come from the primary titles.
	:param dict backwards_compat_lang_fields: of shape {'en': str, 'he': str}. Extra field names that receive a copy of the 'en' / 'he' values. If None, ignore.
	"""
	enriched = []
	if len(topic_list) == 0:
		return enriched

	slug_query = {'$or': [{'slug': entry['slug']} for entry in topic_list]}
	topics_by_slug = {obj.slug: obj for obj in TopicSet(slug_query)}
	for entry in topic_list:
		topic_obj = topics_by_slug.get(entry['slug'], None)
		if topic_obj is None:
			continue
		enriched_entry = entry.copy()
		if use_as_typed:
			typed_lang = 'he' if is_hebrew(enriched_entry['asTyped']) else 'en'
			enriched_entry[typed_lang] = enriched_entry['asTyped']
			# only the language that was not typed comes from the primary title
			other_lang = 'en' if typed_lang == 'he' else 'he'
			enriched_entry[other_lang] = topic_obj.get_primary_title(other_lang)
		else:
			enriched_entry['he'] = topic_obj.get_primary_title('he')
			enriched_entry['en'] = topic_obj.get_primary_title('en')

		if backwards_compat_lang_fields is not None:
			for lang in ('en', 'he'):
				enriched_entry[backwards_compat_lang_fields[lang]] = enriched_entry[lang]
		enriched.append(enriched_entry)

	return enriched
Ejemplo n.º 13
0
def add_langs_to_topics(topic_list: list, use_as_typed=True, backwards_compat_lang_fields: dict = None) -> list:
	"""
	Return a new list of topic dicts, each a copy of the input dict extended with 'en' and 'he' titles
	taken from the library's topic mapping.
	:param list topic_list: list of topics where each item is dict of form {'slug': required, 'asTyped': read when use_as_typed is True}
	:param bool use_as_typed: when True, 'asTyped' supplies the title for its own language (decided by is_hebrew) and the mapping fills the other language; when False both languages come from the mapping.
	:param dict backwards_compat_lang_fields: of shape {'en': str, 'he': str}. Extra field names that receive a copy of the 'en' / 'he' values. If None, ignore.
	"""
	from sefaria.model import library
	titles_by_slug = library.get_topic_mapping()

	enriched = []
	for entry in topic_list:
		# Fall back on `asTyped` if no data is in the mapping yet. If `asTyped` is not used either, fail safe by
		# reconstructing a title from the slug (HACK currently affecting trending topics if a new topic isn't in cache yet)
		if use_as_typed:
			fallback_title = entry['asTyped']
		else:
			fallback_title = entry['slug'].replace("-", " ").title()
		mapped_titles = titles_by_slug.get(entry['slug'], {"en": fallback_title, "he": fallback_title})

		enriched_entry = entry.copy()
		if use_as_typed:
			typed_lang = 'he' if is_hebrew(enriched_entry['asTyped']) else 'en'
			enriched_entry[typed_lang] = enriched_entry['asTyped']
			# only the language that was not typed comes from the mapping
			other_lang = 'en' if typed_lang == 'he' else 'he'
			enriched_entry[other_lang] = mapped_titles[other_lang]
		else:
			enriched_entry['he'] = mapped_titles["he"]
			enriched_entry['en'] = mapped_titles["en"]

		if backwards_compat_lang_fields is not None:
			for lang in ('en', 'he'):
				enriched_entry[backwards_compat_lang_fields[lang]] = enriched_entry[lang]
		enriched.append(enriched_entry)

	return enriched
Ejemplo n.º 14
0
def bulktext_api(request, refs):
    """
    Used by the linker.  Returns the Hebrew and English text of several refs
    in one response.

    :param request: the incoming request.  Only GET is supported; the optional
        "callback" query parameter is read from it.
    :param refs: "|"-separated string of textual refs; duplicates are collapsed.
    :return: a jsonResponse (with a wildcard Access-Control-Allow-Origin
        header) mapping each ref string to its text data, or to {"error": 1}
        when the ref could not be processed.  Any other HTTP method gets a
        JSON error object.
    """
    if request.method != "GET":
        # Previously the function fell off the end here and returned None.
        resp = jsonResponse({"error": "Unsupported HTTP method."})
        resp['Access-Control-Allow-Origin'] = '*'
        return resp

    cb = request.GET.get("callback", None)
    res = {}
    for tref in set(refs.split("|")):
        try:
            oref = model.Ref(tref)
            lang = "he" if is_hebrew(tref) else "en"
            he = model.TextChunk(oref, "he").text
            en = model.TextChunk(oref, "en").text
            # Non-string text is flattened to a single string;
            # these could be flattened on the client, if need be.
            res[tref] = {
                'he': he if isinstance(he, basestring) else JaggedTextArray(he).flatten_to_string(),
                'en': en if isinstance(en, basestring) else JaggedTextArray(en).flatten_to_string(),
                'lang': lang,
                'ref': oref.normal(),
                'heRef': oref.he_normal(),
                'url': oref.url()
            }
        except (InputError, ValueError, AttributeError) as e:
            referer = request.META.get("HTTP_REFERER", "unknown page")
            logger.warning(u"Linker failed to parse {} from {} : {}".format(tref, referer, e))
            res[tref] = {"error": 1}
    resp = jsonResponse(res, cb)
    resp['Access-Control-Allow-Origin'] = '*'
    return resp
Ejemplo n.º 15
0
	def is_hebrew(self):
		"""Returns True if this sheet appears to be in Hebrew according to its title.

		The title is passed through ``strip_tags`` first.  It counts as Hebrew
		when ``sefaria.utils.hebrew.is_hebrew`` accepts it and it contains no
		ASCII Latin letter.
		"""
		from sefaria.utils.hebrew import is_hebrew
		import regex
		title = strip_tags(self.title)
		# Inside a character class "|" is a literal pipe, not alternation, so the
		# class lists only the two letter ranges; with the pipe included, a
		# Hebrew title containing "|" was treated as having English characters.
		return is_hebrew(title) and not regex.search(u"[a-zA-Z]", title)
Ejemplo n.º 16
0
    def _single_lookup(cls, input_word, lookup_key='form', **kwargs):
        """Collect headword queries for the word form matching ``input_word``.

        Hebrew input has its cantillation stripped before the query.  When a
        "lookup_ref" keyword is supplied, the query is first limited to a form
        whose "refs" match the normalized ref as a prefix regex; if nothing
        loads, the limit is removed and the query is run again.

        :param input_word: the word to look up
        :param lookup_key: word-form field to match on (default 'form')
        :param kwargs: may carry "lookup_ref"
        :return: list of ``{'headword': ...}`` dicts, one per lookup of the
            loaded form; an empty list when no form loads
        """
        from sefaria.utils.hebrew import is_hebrew, strip_cantillation, has_cantillation
        from sefaria.model import Ref

        lookup_ref = kwargs.get("lookup_ref", None)
        form_key = lookup_key
        if is_hebrew(input_word):
            input_word = strip_cantillation(input_word)
            # Disabled: switching to the 'c_form' key when the stripped word
            # has no cantillation or vowels.

        query = {form_key: input_word}
        if lookup_ref:
            query["refs"] = {'$regex': '^{}'.format(Ref(lookup_ref).normal())}
        word_form = WordForm().load(query)
        if not word_form and lookup_ref:
            # Nothing recorded for this ref: retry without the ref restriction
            del query["refs"]
            word_form = WordForm().load(query)

        if not word_form:
            return []
        # TODO: if we want the 'lookups' in wf to be a dict we can pass as is to the lexiconentry, we need to change the key 'lexicon' to 'parent_lexicon' in word forms
        return [{'headword': lookup['headword']} for lookup in word_form.lookups]
Ejemplo n.º 17
0
def get_sheet_language(sheet):
    """
    Guess the language `sheet` is written in from its title.

    The title (empty string if absent) is passed through strip_tags, has
    "(Copy)" removed and its newlines replaced by spaces.  Returns "hebrew"
    when is_hebrew(title, heb_only=True) is truthy, otherwise "english".
    """
    raw_title = sheet.get("title", "")
    title = strip_tags(raw_title)
    title = title.replace("(Copy)", "")
    title = title.replace("\n", " ")
    if is_hebrew(title, heb_only=True):
        return "hebrew"
    return "english"
def create_topic_from_title(title):
	"""
	Create, save and return a Topic whose only title is `title`.

	The slug is Topic.normalize_slug(title); the title is marked primary and
	tagged "he" or "en" according to is_hebrew.
	"""
	slug = Topic.normalize_slug(title)
	if is_hebrew(title):
		title_lang = "he"
	else:
		title_lang = "en"
	primary_title = {"text": title, "lang": title_lang, "primary": True}
	topic = Topic({"slug": slug, "titles": [primary_title]})
	topic.save()
	return topic
def import_rabi_rav_rabbis_into_topics():
    """Rebuild the 'rav_rabi' topics and add missing titles from the error CSV.

    Step 1: every topic carrying ``alt_ids.rav_rabi`` is deleted, then each
    entry of ``new_rabbis.json`` is saved as a Topic (without its 'type'
    field) and linked "is-a" to "mishnaic-people" when the type is "tanna",
    otherwise to "talmudic-people".  Duplicate links are reported, not raised.

    Step 2: for every 'title' row of the Rabi/Rav error CSV, the missing title
    is added to each listed topic that does not already have a title with that
    text; slugs without a topic are reported.
    """
    from sefaria.utils.hebrew import is_hebrew

    with open(f"{DATASETS_NAMED_ENTITY_LOC}/new_rabbis.json", "r") as fin:
        new_rabbis = json.load(fin)

    # Start from a clean slate: drop topics created by a previous run
    TopicSet({'alt_ids.rav_rabi': {"$exists": True}}).delete()
    for rabbi in new_rabbis.values():
        rabbi['alt_ids'] = {"rav_rabi": True}
        typ = rabbi.pop('type')
        topic = Topic(rabbi)
        topic.save()
        if typ == "tanna":
            toTopic = "mishnaic-people"
        else:
            toTopic = "talmudic-people"
        link = IntraTopicLink({
            "class": "intraTopic",
            "fromTopic": topic.slug,
            "toTopic": toTopic,
            "linkType": "is-a",
            "dataSource": "sperling-bonayich"
        })
        try:
            link.save()
        except sefaria.system.exceptions.DuplicateRecordError:
            print("Duplicate", topic.slug, toTopic)

    with open(
            f"{DATASETS_NAMED_ENTITY_LOC}/Fix Rabi and Rav Errors - rav_rabbi_errors.csv",
            "r") as fin:
        rows = list(csv.DictReader(fin))

    for row in rows:
        typ = row['Error Type (rabbi, title, mistake, correct)']
        is_heb = is_hebrew(row['Snippet'])
        if typ != 'title':
            continue

        slugs = [row['Missing Title Slug']]
        extra_slugs = row['Additional Missing Title Slugs']
        if len(extra_slugs) > 0:
            slugs += extra_slugs.split(', ')
        # All topics are loaded up front, before any of them is modified
        topics = [Topic.init(slug.lower()) for slug in slugs]
        for topic, slug in zip(topics, slugs):
            if not topic:
                print("NO TOPIC", slug)
                continue
            if any(existing['text'] == row['Missing Title'] for existing in topic.titles):
                continue
            topic.add_title(row['Missing Title'], 'he' if is_heb else 'en')
            topic.save()
Ejemplo n.º 20
0
 def finds_multiple(self, result):
     """Check the citation found at the first title match against ``result``.

     Iterates over the library's all-titles regex matches in the haystack,
     but returns during the first iteration: ``_do_search`` is run on the
     haystack from that title onward, and the normalized ref of its first
     group is looked up in ``result``.

     :param result: container of expected normalized refs
     :return: True when the normalized ref is in ``result``; False when the
         search finds nothing or the ref is not expected (a message is
         printed in the latter case); None when no title matches at all
     """
     if is_hebrew(self._needle):
         lang = "he"
     else:
         lang = "en"
     title_matches = m.library.all_titles_regex(lang, citing_only=False).finditer(self._haystack)
     for title_match in title_matches:
         match = self._do_search(self._needle, self._haystack[title_match.start():])
         if not match:
             return False
         found = match.group(1)
         normalized = m.Ref(found).normal()
         if normalized in result:
             return True
         print("Mismatched.  Found: {}, which normalizes to: {}, which is not in {}".format(found,
                                                                                normalized,
                                                                                result))
         return False
Ejemplo n.º 21
0
    def test_regex_string_he_js_with_prefix(self):
        """The JS-style regex for a Hebrew title must find the citation in a
        sample where the title carries a prefix, resolving to Leviticus 25."""
        title = 'ויקרא'
        st = 'ובויקרא כ"ה'

        if is_hebrew(title):
            lang = "he"
        else:
            lang = "en"
        pattern = re.compile(
            m.library.get_regex_string(title,
                                       lang,
                                       for_js=True,
                                       anchored=False,
                                       capture_title=False),
            re.VERBOSE)
        found = pattern.search(st)
        assert m.Ref(found.group(1)).normal() == "Leviticus 25"
Ejemplo n.º 22
0
    def test_regex_string_en_js(self):
        """The JS-style regex for an English title must find the citation in
        the sample and resolve it to Ruth 1:1."""
        title = 'Ruth'
        st = 'Ruth 1 1'

        if is_hebrew(title):
            lang = "he"
        else:
            lang = "en"
        pattern = re.compile(
            m.library.get_regex_string(title,
                                       lang,
                                       for_js=True,
                                       anchored=False,
                                       capture_title=False),
            re.VERBOSE)
        found = pattern.search(st)
        assert m.Ref(found.group(1)).normal() == "Ruth 1:1"
Ejemplo n.º 23
0
def bundle_many_texts(refs, useTextFamily=False, as_sized_string=False, min_char=None, max_char=None):
    """Collect Hebrew and English text data for each ref in ``refs``.

    :param refs: iterable of textual refs
    :param useTextFamily: when truthy, text comes from ``model.TextFamily``
        and the entry also carries 'primary_category'
    :param as_sized_string: when truthy (and ``useTextFamily`` is not), text
        comes from ``TextChunk.as_sized_string``; otherwise the chunk text is
        used, flattened to one string when it is not already a ``str``
    :param min_char: passed to ``as_sized_string`` only when truthy
    :param max_char: passed to ``as_sized_string`` only when truthy
    :return: dict mapping each ref string to its data dict, or to
        ``{"error": 1}`` when processing the ref raised InputError,
        ValueError, AttributeError or KeyError
    """
    bundle = {}
    for tref in refs:
        try:
            oref = model.Ref(tref)
            lang = "he" if is_hebrew(tref) else "en"
            if useTextFamily:
                text_fam = model.TextFamily(oref, commentary=0, context=0, pad=False)
                bundle[tref] = {
                    'he': text_fam.he,
                    'en': text_fam.text,
                    'lang': lang,
                    'ref': oref.normal(),
                    'primary_category': text_fam.contents()['primary_category'],
                    'heRef': oref.he_normal(),
                    'url': oref.url()
                }
                continue

            he_tc = model.TextChunk(oref, "he")
            en_tc = model.TextChunk(oref, "en")
            if as_sized_string:
                # Only forward the limits that were actually supplied
                limits = {name: value
                          for name, value in (('min_char', min_char), ('max_char', max_char))
                          if value}
                he_text = he_tc.as_sized_string(**limits)
                en_text = en_tc.as_sized_string(**limits)
            else:
                he = he_tc.text
                en = en_tc.text
                # these could be flattened on the client, if need be.
                if isinstance(he, str):
                    he_text = he
                else:
                    he_text = JaggedTextArray(he).flatten_to_string()
                if isinstance(en, str):
                    en_text = en
                else:
                    en_text = JaggedTextArray(en).flatten_to_string()

            bundle[tref] = {
                'he': he_text,
                'en': en_text,
                'lang': lang,
                'ref': oref.normal(),
                'heRef': oref.he_normal(),
                'url': oref.url()
            }
        except (InputError, ValueError, AttributeError, KeyError) as e:
            # referer = request.META.get("HTTP_REFERER", "unknown page")
            # This chatter fills up the logs.  todo: put in it's own file
            # logger.warning(u"Linker failed to parse {} from {} : {}".format(tref, referer, e))
            bundle[tref] = {"error": 1}
    return bundle
def add_titles():
    """Add the 'New title 1' column of the new-titles CSV to each row's topic.

    Rows whose slug contains 'BONAYICH' are skipped; slugs without a topic
    are reported and skipped.  The title is normalized with
    ``normalize_title``, tagged 'he' or 'en' via ``is_hebrew``, added to the
    topic, and the topic is saved.
    """
    with open(f'{ROOT}/{new_titles}', 'r') as fin:
        for row in csv.DictReader(fin):
            slug = row['Slug']
            if 'BONAYICH' in slug:
                continue
            topic = Topic.init(slug)
            if topic is None:
                print('Slug is None', slug)
                continue
            title = normalize_title(row['New title 1'])
            if is_hebrew(title):
                lang = 'he'
            else:
                lang = 'en'
            topic.add_title(title, lang)
            topic.save()
Ejemplo n.º 25
0
    def test_regex_string_en_js(self):
        """The text matched by the JS-style regex for an English title must be
        a path that www.sefaria.org.il serves with HTTP 200.

        Performs a live HTTP request.
        """
        title = 'Ruth'
        st = 'Ruth 1 1'

        if is_hebrew(title):
            lang = "he"
        else:
            lang = "en"
        pattern = m.library.get_regex_string(title,
                                             lang,
                                             for_js=True,
                                             anchored=False,
                                             capture_title=False)
        match = re.search(pattern, st)
        # No fallback for a missing match: .group() raises if nothing matched
        url = "https://www.sefaria.org.il/{}".format(match.group())
        resp = requests.get(url)
        assert resp.status_code == 200
Ejemplo n.º 26
0
def bulktext_api(request, refs):
    """
    Used by the linker.  Returns the Hebrew and English text of several refs
    in one response.

    :param request: the incoming request.  Only GET is supported; the optional
        query parameters "callback" and "useTextFamily" are read from it.
    :param refs: "|"-separated string of textual refs; duplicates are collapsed.
    :return: a jsonResponse mapping each ref string to its text data, or to
        {"error": 1} when the ref could not be processed.  Any other HTTP
        method gets a JSON error object.
    """
    if request.method != "GET":
        # Previously the function fell off the end here and returned None.
        return jsonResponse({"error": "Unsupported HTTP method."})

    cb = request.GET.get("callback", None)
    useTextFamily = request.GET.get("useTextFamily", None)
    res = {}
    for tref in set(refs.split("|")):
        try:
            oref = model.Ref(tref)
            lang = "he" if is_hebrew(tref) else "en"
            if useTextFamily:
                text_fam = model.TextFamily(oref, commentary=0, context=0, pad=False)
                res[tref] = {
                    'he': text_fam.he,
                    'en': text_fam.text,
                    'lang': lang,
                    'ref': oref.normal(),
                    'primary_category': text_fam.contents()['primary_category'],
                    'heRef': oref.he_normal(),
                    'url': oref.url()
                }
            else:
                he = model.TextChunk(oref, "he").text
                en = model.TextChunk(oref, "en").text
                # Non-string text is flattened to a single string;
                # these could be flattened on the client, if need be.
                res[tref] = {
                    'he': he if isinstance(he, basestring) else JaggedTextArray(he).flatten_to_string(),
                    'en': en if isinstance(en, basestring) else JaggedTextArray(en).flatten_to_string(),
                    'lang': lang,
                    'ref': oref.normal(),
                    'heRef': oref.he_normal(),
                    'url': oref.url()
                }
        except (InputError, ValueError, AttributeError, KeyError) as e:
            # referer = request.META.get("HTTP_REFERER", "unknown page")
            # This chatter fills up the logs.  todo: put in it's own file
            # logger.warning(u"Linker failed to parse {} from {} : {}".format(tref, referer, e))
            res[tref] = {"error": 1}
    return jsonResponse(res, cb)
Ejemplo n.º 27
0
    def test_regex_string_he_in_parentheses_only(self):
        """With ``parentheses=True`` the regex for a Hebrew title must find
        the citation inside the parenthesised sample and resolve it to
        Leviticus 25."""
        title = 'ויקרא'
        st1 = '(ובויקרא כ"ה)'
        # NOTE(review): st2 (the sample without parentheses) is never used and
        # the st1 check below is repeated - possibly the second check was
        # meant to run against st2; confirm the intended assertion.
        st2 = 'ובויקרא כ"ה'

        if is_hebrew(title):
            lang = "he"
        else:
            lang = "en"
        pattern = re.compile(
            m.library.get_regex_string(title,
                                       lang,
                                       for_js=True,
                                       anchored=False,
                                       capture_title=False,
                                       parentheses=True),
            re.VERBOSE)

        first = pattern.search(st1)
        assert m.Ref(first.group(1)).normal() == "Leviticus 25"

        second = pattern.search(st1)
        assert m.Ref(second.group(1)).normal() == "Leviticus 25"
 def parse_titles(self, element):
     """Split an element's "text" into primary and alternate titles.

     The text is cleaned with ``self.comment_strip_re`` and split on
     ``self.title_lang_delim``.  When that yields exactly two parts, the
     part chosen by ``is_hebrew`` (the second if it is Hebrew, otherwise the
     first) is the Hebrew side and the other is the English side; otherwise
     only the first part is used, as English.  Each side is split on
     ``self.alt_title_delim`` into a primary title and a list of alternates,
     all stripped.

     :param element: object whose ``get("text")`` returns the title text
     :return: dict with "enPrim" and "enAltList", plus "hePrim" and
         "heAltList" when a Hebrew side was found
     """
     cleaned = self.comment_strip_re.sub(u"", element.get("text"))
     parts = cleaned.split(self.title_lang_delim)

     titles = {}
     en_part = parts[0]
     if len(parts) == 2:
         he_idx = 1 if is_hebrew(parts[1]) else 0
         en_part = parts[1 - he_idx]
         he_names = parts[he_idx].split(self.alt_title_delim)
         titles["hePrim"] = he_names[0].strip()
         titles["heAltList"] = [name.strip() for name in he_names[1:]]

     en_names = en_part.split(self.alt_title_delim)
     titles["enPrim"] = en_names[0].strip()
     titles["enAltList"] = [name.strip() for name in en_names[1:]]
     return titles
Ejemplo n.º 29
0
    def test_regex_string_he_in_parentheses(self):
        """Within one parenthesised sample, the regex for the first title must
        resolve to Deuteronomy 32 while the regex for the second title must
        not match at all."""
        st3 = '(בדברים לב ובספרות ג ב)'
        # (title, expected normalized ref) - None means "must not match"
        expectations = [('דברים', "Deuteronomy 32"), ('רות', None)]

        for title, expected in expectations:
            if is_hebrew(title):
                lang = "he"
            else:
                lang = "en"
            pattern = re.compile(
                m.library.get_regex_string(title,
                                           lang,
                                           for_js=True,
                                           anchored=False,
                                           capture_title=False,
                                           parentheses=True),
                re.VERBOSE)

            match = pattern.search(st3)
            if expected is None:
                assert match is None
            else:
                assert m.Ref(match.group(1)).normal() == expected
Ejemplo n.º 30
0
def title_regex_api(request, titles):
    """
    Return the library's JS-flavoured regex string for each requested title.

    :param request: the incoming request.  Only GET is supported; the optional
        "callback" query parameter is read from it.
    :param titles: "|"-separated string of titles; duplicates are collapsed.
    :return: a jsonResponse (with a wildcard Access-Control-Allow-Origin
        header) mapping each title to its regex string, plus an "error" list
        describing the titles that failed.  Any other HTTP method gets a JSON
        error object.
    """
    if request.method != "GET":
        # Previously the function fell off the end here and returned None.
        resp = jsonResponse({"error": "Unsupported HTTP method."})
        resp['Access-Control-Allow-Origin'] = '*'
        return resp

    cb = request.GET.get("callback", None)
    res = {}
    errors = []
    for title in set(titles.split("|")):
        lang = "he" if is_hebrew(title) else "en"
        try:
            res[title] = model.library.get_regex_string(title, lang, for_js=True)
        except (AttributeError, AssertionError) as e:
            logger.warning(u"Library._build_ref_from_string() failed to create regex for: {}.  {}".format(title, e))
            errors.append(u"{} : {}".format(title, e))
    if errors:
        res["error"] = errors
    resp = jsonResponse(res, cb)
    resp['Access-Control-Allow-Origin'] = '*'
    return resp
Ejemplo n.º 31
0
def title_regex_api(request, titles):
    """Look up the regex string for every title in a ``|``-separated list.

    Non-GET requests are not handled: the function then returns ``None``.

    :param request: request object exposing ``method`` and ``GET``.
    :param titles: ``|``-separated titles (repeated titles are processed once).
    :return: ``jsonResponse`` of a ``{title: regex}`` dict, plus an ``"error"``
        entry listing the titles that raised ``AttributeError`` or
        ``AssertionError``; the response allows any origin.
    """
    if request.method != "GET":
        return None

    jsonp_callback = request.GET.get("callback", None)
    unique_titles = set(titles.split("|"))
    regex_by_title = {}
    problems = []

    for title in unique_titles:
        if is_hebrew(title):
            lang = "he"
        else:
            lang = "en"
        try:
            regex_by_title[title] = model.library.get_regex_string(title, lang, for_js=True)
        except (AttributeError, AssertionError) as e:
            logger.warning(u"Library._build_ref_from_string() failed to create regex for: {}.  {}".format(title, e))
            problems.append(u"{} : {}".format(title, e))

    if len(problems) > 0:
        regex_by_title["error"] = problems

    reply = jsonResponse(regex_by_title, jsonp_callback)
    reply['Access-Control-Allow-Origin'] = '*'
    return reply
Ejemplo n.º 32
0
    def test_regex_string_he_in_parentheses_3(self):
        """Check that two Hebrew titles inside one bracketed span both resolve.

        Each title's regex is searched against the same text and the first
        captured group is expected to normalise to the paired reference.
        """
        text = '<p>[שיר השירים א ירושלמי כתובות (דף כח:) בשורות א]'
        expectations = [
            ('ירושלמי כתובות', "Jerusalem Talmud Ketubot 28b"),
            ('שיר השירים', "Song of Songs 1"),
        ]
        regex_options = dict(for_js=True,
                             anchored=False,
                             capture_title=False,
                             parentheses=True)

        for title, expected_ref in expectations:
            lang = "he" if is_hebrew(title) else "en"
            pattern_source = m.library.get_regex_string(title, lang, **regex_options)
            found = re.compile(pattern_source, re.VERBOSE).search(text)
            assert m.Ref(found.group(1)).normal() == expected_ref
Ejemplo n.º 33
0
def title_regex_api(request, titles):
    """Return unanchored regex strings for a ``|``-separated list of titles.

    Only GET is handled; any other method makes the function return ``None``.

    :param request: request object exposing ``method`` and ``GET``.
    :param titles: ``|``-separated titles; duplicates are collapsed.
    :return: ``jsonResponse`` of a ``{title: regex}`` dict, with an
        ``"error"`` list for titles whose regex could not be built.
    """
    if request.method != "GET":
        return None

    callback = request.GET.get("callback", None)
    regexes = {}
    failures = []
    for title in set(titles.split("|")):
        lang = "he" if is_hebrew(title) else "en"
        try:
            regexes[title] = model.library.get_regex_string(title, lang, anchored=False, for_js=True)
        except (AttributeError, AssertionError) as e:
            # These failures are routine (e.g. a title that matches a schema
            # node), so they are reported to the caller but deliberately not
            # logged, to keep the logs quiet.
            failures.append(u"{} : {}".format(title, e))

    if failures:
        regexes["error"] = failures
    return jsonResponse(regexes, callback)
Ejemplo n.º 34
0
    def get_word_form_objects(cls, input_word, lookup_key='form', **kwargs):
        """Query word forms for ``input_word``, optionally narrowed by a ref.

        :param input_word: the word to look up. Hebrew input has its
            cantillation stripped; if nothing is detected by
            ``has_cantillation(..., detect_vowels=True)`` afterwards, the
            ``'c_form'`` field is queried instead of ``lookup_key``.
        :param lookup_key: field name to match the word against.
        :param kwargs: ``lookup_ref`` (optional) restricts the query to forms
            whose ``refs`` start with the normalised ref; if that yields
            nothing, the query is repeated without the restriction.
        :return: the resulting ``WordFormSet``.
        """
        from sefaria.utils.hebrew import is_hebrew, strip_cantillation, has_cantillation
        from sefaria.model import Ref

        ref_filter = kwargs.get("lookup_ref", None)
        field = lookup_key
        if is_hebrew(input_word):
            input_word = strip_cantillation(input_word)
            still_marked = has_cantillation(input_word, detect_vowels=True)
            if not still_marked:
                field = 'c_form'

        query = {field: input_word}
        if not ref_filter:
            return WordFormSet(query)

        # First try the ref-scoped query; fall back to the unscoped one.
        normal_ref = Ref(ref_filter).normal()
        query["refs"] = {'$regex': '^{}'.format(normal_ref)}
        scoped = WordFormSet(query)
        if len(scoped) != 0:
            return scoped
        del query["refs"]
        return WordFormSet(query)
 def parse_titles(self, element):
     """Split an element's ``text`` into primary and alternate titles.

     :param element: object whose ``get("text")`` returns the raw title string.
     :return: ``None`` if the text contains ``**default**``; otherwise a dict
         with ``enPrim`` / ``enAltList`` and, when the text splits into
         exactly two parts on ``self.title_lang_delim``, also ``hePrim`` /
         ``heAltList``.
     """
     raw = element.get("text")
     if '**default**' in raw:
         return None

     cleaned = self.comment_strip_re.sub("", raw)
     parts = cleaned.split(self.title_lang_delim)

     def _primary_and_alts(chunk):
         # First name is the primary title, the rest are alternates.
         names = [name.strip() for name in chunk.split(self.alt_title_delim)]
         return names[0], names[1:]

     result = {}
     if len(parts) == 2:
         # The Hebrew half is the second part if that part is Hebrew,
         # otherwise the first; whatever remains is treated as English.
         he_index = 1 if is_hebrew(parts[1]) else 0
         he_primary, he_alts = _primary_and_alts(parts.pop(he_index))
         result["hePrim"] = he_primary
         result["heAltList"] = he_alts

     en_primary, en_alts = _primary_and_alts(parts[0])
     result["enPrim"] = en_primary
     result["enAltList"] = en_alts
     return result
Ejemplo n.º 36
0
    def test_regex_string_he_in_parentheses_3(self):
        """Resolve Hebrew citations from a bracketed span against the live site.

        For each title, the generated regex is searched against the text, the
        matched string is requested from www.sefaria.org.il, and the final
        URL is compared with the expected canonical URL for that title.

        Requires network access.
        """
        st3 = '<p>[שיר השירים א ירושלמי כתובות (דף כח:) בשורות א]'
        titles = ['ירושלמי כתובות', 'שיר השירים']
        expected_urls = {
            'שיר השירים': 'https://www.sefaria.org.il/Song_of_Songs.1',
            'ירושלמי כתובות': 'https://www.sefaria.org.il/Jerusalem_Talmud_Ketubot.28b',
        }

        for title in titles:
            lang = "he" if is_hebrew(title) else "en"
            res = m.library.get_regex_string(title,
                                             lang,
                                             for_js=True,
                                             anchored=False,
                                             capture_title=False,
                                             parentheses=True)
            res_no_comments = re.compile(res, re.VERBOSE)
            match = res_no_comments.search(st3)
            match_string = 'no match' if not match else match.group()
            resp = requests.get(
                "https://www.sefaria.org.il/{}".format(match_string))
            assert resp.status_code == 200
            print(resp.url)
            # The expected URL is selected first and then compared. Written as
            # `assert a == b if c else d`, the conditional binds looser than
            # `==`, so the else branch asserted a non-empty string literal and
            # could never fail.
            assert resp.url == expected_urls.get(title, '')
Ejemplo n.º 37
0
    def get_word_form_objects(cls, input_word, lookup_key='form', **kwargs):
        """Query word forms for ``input_word``, optionally narrowed by a ref.

        :param input_word: the word to look up.
        :param lookup_key: field name to match the word against; replaced by
            ``'c_form'`` for Hebrew input in which no cantillation or vowels
            are detected after stripping.
        :param kwargs: ``lookup_ref`` (optional) restricts results to forms
            whose ``refs`` start with the normalised ref, with a fallback to
            the unrestricted query when that finds nothing.
        :return: the resulting ``WordFormSet``.
        """
        from sefaria.model import Ref

        ref_hint = kwargs.get("lookup_ref", None)
        match_field = lookup_key
        if is_hebrew(input_word):
            # The main `lexicon_lookup` method used to do this only after an
            # empty first result; doing it up front saves a query when the
            # caller already supplied a consonantal form.
            input_word = strip_cantillation(input_word)
            if not has_cantillation(input_word, detect_vowels=True):
                match_field = 'c_form'

        criteria = {match_field: input_word}
        if ref_hint:
            criteria["refs"] = {'$regex': '^{}'.format(Ref(ref_hint).normal())}

        found = WordFormSet(criteria)
        if ref_hint and len(found) == 0:
            # Nothing under that ref: retry without the ref restriction.
            criteria.pop("refs")
            found = WordFormSet(criteria)
        return found
Ejemplo n.º 38
0
    def test_regex_string_he_in_parentheses_1(self):
        """Check a matching and a non-matching Hebrew title against the live site.

        The first title is expected to be found in the text and to resolve to
        the Deuteronomy 32 page. The second is expected not to match, so the
        literal string 'no match' is requested and a 404 is expected.

        Requires network access.
        """
        st3 = '(בדברים לב ובספרות ג ב)'
        titles = ['דברים', 'רות']

        for title in titles:
            lang = "he" if is_hebrew(title) else "en"
            res = m.library.get_regex_string(title,
                                             lang,
                                             for_js=True,
                                             anchored=False,
                                             capture_title=False,
                                             parentheses=True)
            # Strip the verbose-mode comments and whitespace by hand.
            # Raw strings keep `\s` from being an invalid string escape.
            res_no_comments = re.sub(r'\s+', '', re.sub(r'\s*?#.*?\n', '', res))

            match = re.search(res_no_comments, st3)
            match_string = 'no match' if not match else match.group()
            resp = requests.get(
                "https://www.sefaria.org.il/{}".format(match_string))

            # Expected values are chosen first and then compared. Written as
            # `assert a == b if c else d`, the conditional binds looser than
            # `==`, so the else branch asserted a truthy constant and could
            # never fail.
            should_match = title == 'דברים'
            expected_status = 200 if should_match else 404
            expected_url = ('https://www.sefaria.org.il/Deuteronomy.32'
                            if should_match
                            else 'https://www.sefaria.org.il/no%20match')

            assert resp.status_code == expected_status
            print(resp.url)
            assert resp.url == expected_url
Ejemplo n.º 39
0
 def primary_name(self, lang):
     """Return ``[self.name]`` if the name's script agrees with ``lang``, else ``[]``.

     The name counts as agreeing when ``hebrew.is_hebrew(self.name)`` equals
     ``lang == "he"``.
     """
     name_is_hebrew = hebrew.is_hebrew(self.name)
     hebrew_requested = (lang == "he")
     if name_is_hebrew == hebrew_requested:
         return [self.name]
     return []
"""
Ensure that Hebrew and English Title variants are in the correct field.
"""
from sefaria.model import *
from sefaria.utils.hebrew import is_hebrew

# Walk every index, re-sort its combined title variants by script and save:
# Hebrew strings end up in heTitleVariants, everything else in titleVariants.
indices = IndexSet({})
for index in indices:
    en, he = [], []
    for variant in index.titleVariants + getattr(index, "heTitleVariants", []):
        bucket = he if is_hebrew(variant) else en
        bucket.append(variant)

    # Report indexes whose non-Hebrew variant set is about to change.
    if set(index.titleVariants) != set(en):
        for item in (index.title, index.titleVariants, en):
            print(item)

    # Duplicates are dropped on the way back in.
    index.titleVariants = list(set(en))
    index.heTitleVariants = list(set(he))
    index.save()
def extract_form_tuples(csv_row):
    """Turn one CSV row into a list of ``(form, language)`` tuples.

    Columns 0 and 1 hold one form each, tagged ``'eng'`` and ``'heb'``.
    Columns 2 and 3 hold comma-separated forms tagged ``'eng'`` and ``'heb'``.
    Column 4 holds comma-separated forms tagged per item by ``is_hebrew``.
    Empty pieces between commas are dropped; every form is stripped.

    :param csv_row: indexable row with at least five string columns.
    :return: list of ``(stripped_form, 'eng' | 'heb')`` tuples.
    """
    def _pieces(cell):
        # Zero-length pieces are skipped; whitespace-only pieces are kept.
        return [piece for piece in cell.split(",") if len(piece)]

    forms = [
        (csv_row[0].strip(), 'eng'),
        (csv_row[1].strip(), 'heb'),
    ]
    for column, language in ((2, 'eng'), (3, 'heb')):
        forms.extend((piece.strip(), language) for piece in _pieces(csv_row[column]))

    for piece in _pieces(csv_row[4]):
        stripped = piece.strip()
        language = 'heb' if is_hebrew(piece) else 'eng'
        forms.append((stripped, language))
    return forms
Ejemplo n.º 42
0
def get_refs_in_string(st):
	"""
	Returns a list of ref-like strings found within text.

	The language is chosen for the whole string with is_hebrew(st), and the
	candidate titles come from model.get_titles_in_string(st, lang).  If no
	titles are found, an empty list is returned.

	English: a title followed by a space and a number pattern, bounded by
	word boundaries.
	Hebrew: a title followed by Hebrew numerals, matched only when the
	lookbehind/lookahead find an opening and a closing paren or brace
	around it.

	The returned strings are the first group of each findall() match (the
	'ref' group); nothing here checks that they parse as actual refs.
	"""
	lang = 'he' if is_hebrew(st) else 'en'

	titles = model.get_titles_in_string(st, lang)
	if not titles:
		return []

	if lang == "en":
		# Alternation of the escaped titles, then section numbers with optional
		# a/b suffix, up to two sub-levels, and an optional range.
		reg = "\\b(?P<ref>"
		reg += "(" + "|".join([re.escape(title) for title in titles]) + ")"
		reg += " \d+([ab])?([ .:]\d+)?([ .:]\d+)?(-\d+([ab])?([ .:]\d+)?)?" + ")\\b"
		reg = re.compile(reg)
	elif lang == "he":
		title_string = "|".join([re.escape(t) for t in titles])
		#Hebrew Unicode page: http://www.unicode.org/charts/PDF/U0590.pdf
		#todo: handle Ayin before Resh cases.
		#todo: This doesn't do ranges.  Do we see those in the wild?
		#todo: verify that open and closing parens are of the same type, so as not to fooled by (} or {)
		# NOTE: the pattern below is passed through str.format(), so literal
		# braces are doubled ({{ and }}) and {0} is replaced by title_string.
		reg = ur"""(?<=										# look behind for opening brace
				[({{]										# literal '(', brace,
				[^}})]*										# anything but a closing ) or brace
			)
			(?P<ref>										# Capture the whole match as 'ref'
				({0})										# Any one book title, (Inserted with format(), below)
				\s+											# a space
				(\u05d3[\u05e3\u05e4\u05f3']\s+)?			# Daf, spelled with peh, peh sofit, geresh, or single quote
				(?:\u05e4(?:"|\u05f4|'')?)?				# Peh (for 'perek') maybe followed by a quote of some sort
				(?P<num1>									# the first number (1 of 3 styles, below)
					(?=\p{{Hebrew}}+(?:"|\u05f4|'')\p{{Hebrew}}) # (2: ") Lookahead:  At least one letter, followed by double-quote, two single quotes, or gershayim, followed by  one letter
						\u05ea*(?:"|\u05f4|'')?				# Many Tavs (400), maybe dbl quote
						[\u05e7-\u05ea]?(?:"|\u05f4|'')?	# One or zero kuf-tav (100-400), maybe dbl quote
						[\u05d8-\u05e6]?(?:"|\u05f4|'')?	# One or zero tet-tzaddi (9-90), maybe dbl quote
						[\u05d0-\u05d8]?					# One or zero alef-tet (1-9)															#
					|(?=\p{{Hebrew}})						# (3: no punc) Lookahead: at least one Hebrew letter
						\u05ea*								# Many Tavs (400)
						[\u05e7-\u05ea]?					# One or zero kuf-tav (100-400)
						[\u05d8-\u05e6]?					# One or zero tet-tzaddi (9-90)
						[\u05d0-\u05d8]?					# One or zero alef-tet (1-9)
					|\p{{Hebrew}}['\u05f3]					# (1: ') single letter, followed by a single quote or geresh
				)\s*										# end of the num1 group, maybe space
				[.:]?										# maybe a . for gemara refs or a : for tanach or gemara refs
				[,\s]*			    						# maybe a comma, maybe a space, maybe both
				(?:
					(?:\u05de\u05e9\u05e0\u05d4\s)			# Mishna spelled out, with a space after
					|(?:\u05de(?:"|\u05f4|'')?)				# or Mem (for 'mishna') maybe followed by a quote of some sort
				)?
				(?P<num2>									# second number - optional
					(?=\p{{Hebrew}}+(?:"|\u05f4|'')\p{{Hebrew}}) # (2: ") Lookahead:  At least one letter, followed by double-quote, two single quotes, or gershayim, followed by  one letter
						\u05ea*(?:"|\u05f4|'')?				# Many Tavs (400), maybe dbl quote
						[\u05e7-\u05ea]?(?:"|\u05f4|'')?	# One or zero kuf-tav (100-400), maybe dbl quote
						[\u05d8-\u05e6]?(?:"|\u05f4|'')?	# One or zero tet-tzaddi (9-90), maybe dbl quote
						[\u05d0-\u05d8]?					# One or zero alef-tet (1-9)															#
					|(?=\p{{Hebrew}})						# (3: no punc) Lookahead: at least one Hebrew letter
						\u05ea*								# Many Tavs (400)
						[\u05e7-\u05ea]?					# One or zero kuf-tav (100-400)
						[\u05d8-\u05e6]?					# One or zero tet-tzaddi (9-90)
						[\u05d0-\u05d8]?					# One or zero alef-tet (1-9)
					|\p{{Hebrew}}['\u05f3]					# (1: ') single letter, followed by a single quote or geresh
				)?[.:]?										# end of the num2 group, maybe a . or : for gemara refs
			)												# end of ref capture
			(?=												# look ahead for closing brace
				[^({{]*										# match of anything but an opening '(' or brace
				[)}}]										# zero-width: literal ')' or brace
			)
		""".format(title_string)

		# Compiled with the `regex` module rather than `re`; presumably because
		# the pattern relies on \p{Hebrew} and a variable-length lookbehind.
		reg = regex.compile(reg, regex.VERBOSE)

	# Both patterns contain several groups, so findall() yields tuples;
	# element 0 is the outermost 'ref' group.
	matches = reg.findall(st)
	refs = [match[0] for match in matches]
	if len(refs) > 0:
		for ref in refs:
			logger.debug("get_refs_in_text: " + ref)
	return refs
Ejemplo n.º 43
0
 def test_is_hebrew(self):
     assert h.is_hebrew(u"ג")