Example 1
def suggest_people_duplicates():
    import json
    from collections import defaultdict
    from tqdm import tqdm
    from data_utilities.util import WeightedLevenshtein
    norm2peeps = defaultdict(list)
    letter_counts = defaultdict(int)
    with open('/home/nss/sefaria/datasets/ner/sefaria/temp/yerushalmi_basic_en_titles.json', 'r') as fin:
        jin = json.load(fin)
        for person_obj in jin:
            title = person_obj['manualTitles'][0]['text']
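            # normalize the English title: drop honorifics and fold apostrophe/diacritic variants so variant spellings map to the same key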
            norm_title = title.replace('Rebbi', '').replace('Rav', '').replace('‘', "'").replace('ˋ', "'").replace('`', "'").replace('ї', 'i').replace('ï', 'i').replace('î', 'i').replace('ĩ', 'i').replace('ś', 's').replace('š', 's').replace('Š', 'S').replace('ü', 'u').lower().replace('ẓ', 'z').replace('\u0301', '').replace('\u0308', '')
            norm_title = ' '.join(norm_title.split())
            for char in norm_title:
                letter_counts[char] += 1
            norm2peeps[norm_title] += [title]
    max_letter_count = max(letter_counts.values())
    min_letter_count = min(letter_counts.values())
    letter_freqs = {char: (count-min_letter_count)/(max_letter_count-min_letter_count) for char, count in letter_counts.items()}
    leven = WeightedLevenshtein(letter_freqs)
    clusters = []
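    # greedy clustering: each normalized title merges into the first later title scoring above 85; otherwise it starts a new cluster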
    for i, (norm_title, titles) in tqdm(enumerate(norm2peeps.items()), total=len(norm2peeps)):
        found_match = False
        for j, (norm_title2, titles2) in enumerate(list(norm2peeps.items())[i+1:]):
            score = leven.calculate(norm_title, norm_title2)
            if score > 85:
                print('MATCHED\n', norm_title, '\n', norm_title2, '\n', score)
                found_match = True
                norm2peeps[norm_title2] += titles  # merge clusters
                break
        if not found_match:
            clusters += [titles]
    with open('/home/nss/sefaria/datasets/ner/sefaria/temp/yerushalmi_people_duplicates.json', 'w') as fout:
        json.dump(clusters, fout, indent=2, ensure_ascii=False)
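
The loop above reads WeightedLevenshtein.calculate as a 0-100 similarity when called with its default arguments (the threshold of 85 only makes sense on that scale). A minimal sketch of that pattern, with made-up letter frequencies standing in for the ones derived from letter_counts:

from data_utilities.util import WeightedLevenshtein

letter_freqs = {'a': 0.9, 'b': 0.1}  # hypothetical per-character weights in [0, 1]
leven = WeightedLevenshtein(letter_freqs)
score = leven.calculate('rebbi yohanan', 'rebbi yochanan')
if score > 85:
    print('likely duplicates:', score)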
Example 2
 def __init__(self):
     self.levenshtein = WeightedLevenshtein()
     self.matcher = None
     try:
         with codecs.open("word_counts.json", "rb", encoding="utf8") as fin:
             self.word_counts = json.load(fin)
     except IOError:
         self.word_counts = {}
Example 3
 def __init__(self):
     self.stop_words = [
         u"ר'", u'רב', u'רבי', u'בן', u'בר', u'בריה', u'אמר', u'כאמר',
         u'וכאמר', u'דאמר', u'ודאמר', u'כדאמר', u'וכדאמר', u'ואמר', u'כרב',
         u'ורב', u'כדרב', u'דרב', u'ודרב', u'וכדרב', u'כרבי', u'ורבי',
         u'כדרבי', u'דרבי', u'ודרבי', u'וכדרבי', u"כר'", u"ור'", u"כדר'",
         u"דר'", u"ודר'", u"וכדר'", u'א״ר', u'וא״ר', u'כא״ר', u'דא״ר',
         u'דאמרי', u'משמיה', u'קאמר', u'קאמרי', u'לרב', u'לרבי', u"לר'",
         u'ברב', u'ברבי', u"בר'", u'הא', u'בהא', u'הך', u'בהך', u'ליה',
         u'צריכי', u'צריכא', u'וצריכי', u'וצריכא', u'הלל', u'שמאי', u"וגו'",
         u'וגו׳'
     ]
     self.levenshtein = WeightedLevenshtein()
Example 4
	for line in csv.reader(f):
		if line[0].startswith("Maskil"):
			ref, comm, found_ref, relevant_text = line
			rashi_ref = ref.replace("Maskil LeDavid, ", "Rashi on ")
			genesis_ref = ":".join(ref.replace("Maskil LeDavid, ", "").split(":")[:-1])
			if genesis_ref not in maskil_refs:
				maskil_refs[genesis_ref] = []
			maskil_refs[genesis_ref].append((ref, comm))
		else:
			new_csv.append(line)


exact_match = 0
more_than_2 = []
not_found = []
weighted = WeightedLevenshtein()
for genesis_ref in maskil_refs:
	these_maskil_refs = maskil_refs[genesis_ref]
	these_rashi_refs = Ref("Rashi on {}".format(genesis_ref)).all_segment_refs()

	rashi_dhs = [get_dh(strip_nekud(ref.text('he').text), ref) for ref in these_rashi_refs]
	for maskil_ref, maskil_comm in these_maskil_refs:
		finds = []
		for rashi_dh_tuple in rashi_dhs:
			rashi_ref, rashi_dh, rashi_dh_plus = rashi_dh_tuple
			rashi_dh = rashi_dh.replace(".", "")
			if maskil_comm.startswith(rashi_dh) or maskil_comm.startswith(rashi_dh.split()[0] + " "):
				finds.append((rashi_dh_tuple, maskil_ref, maskil_comm))

		if len(finds) == 1:
			exact_match += 1
Example 5
 def __init__(self):
     self.sheets = OrderedDict()
     self.sheet = None
     self.levenshtein = WeightedLevenshtein()
Example 6
class Nechama_parser(object):
    def __init__(self):
        self.sheets = OrderedDict()
        self.sheet = None
        self.levenshtein = WeightedLevenshtein()

    def bs4_reader(self, file_list_names):
        """
        The main BeautifulSoup reader function; it iterates over all the sheets and creates the objects. Probably should be in its own file.
        :param self:
        :return:
        """
        for html_sheet in file_list_names:
            content = BeautifulSoup(open("{}".format(html_sheet)), "lxml")
            print html_sheet
            top_dict = self.dict_from_html_attrs(
                content.find('div', {
                    'id': "contentTop"
                }).contents)
            # print 'len_content type ', len(top_dict.keys())
            self.sheet = Sheet(html_sheet, top_dict["paging"].text,
                               top_dict["h1"].text, top_dict["year"].text,
                               top_dict["pasuk"].text)
            self.sheets[html_sheet] = self.sheet
            body_dict = self.dict_from_html_attrs(
                content.find('div', {'id': "contentBody"}))
            self.sheet.sections.extend([
                v for k, v in body_dict.items()
                if re.search(u'ContentSection_\d', k)
            ])  # check that these come in in the right order
            self.sheet.sheet_remark = body_dict['sheetRemark'].text
            pass
        return self.sheets

    def dict_from_html_attrs(self, contents):
        d = OrderedDict()
        for e in [e for e in contents if isinstance(e, element.Tag)]:
            if "id" in e.attrs.keys():
                d[e.attrs['id']] = e
            else:
                d[e.name] = e
        return d

    def get_score(self, words_a, words_b):
        normalizingFactor = 100
        smoothingFactor = 1
        ImaginaryContenderPerWord = 22
        str_a = u" ".join(words_a)
        str_b = u" ".join(words_b)
        dist = self.levenshtein.calculate(str_a, str_b, normalize=False)
        score = 1.0 * (dist + smoothingFactor) / (
            len(str_a) + smoothingFactor) * normalizingFactor

        dumb_score = (ImaginaryContenderPerWord * len(words_a)) - score
        return dumb_score

    def clean(self, s):
        s = unicodedata.normalize("NFD", s)
        s = strip_cantillation(s, strip_vowels=True)
        s = re.sub(u"(^|\s)(?:\u05d4['\u05f3])($|\s)", u"\1יהוה\2", s)
        s = re.sub(ur"[,'\":?.!;־״׳]", u" ", s)
        s = re.sub(ur"\([^)]+\)", u" ", s)
        # s = re.sub(ur"\((?:\d{1,3}|[\u05d0-\u05ea]{1,3})\)", u" ", s)  # sefaria automatically adds pasuk markers. remove them
        s = bleach.clean(s, strip=True, tags=()).strip()
        s = u" ".join(s.split())
        return s

    def tokenizer(self, s):
        return self.clean(s).split()

    def check_reduce_sources(self, comment, ref):
        n = len(re.split(u'\s+', comment))
        pm = ParallelMatcher(self.tokenizer,
                             dh_extract_method=None,
                             ngram_size=3,
                             max_words_between=4,
                             min_words_in_match=int(round(n * 0.8)),
                             min_distance_between_matches=0,
                             all_to_all=False,
                             parallelize=False,
                             verbose=False,
                             calculate_score=self.get_score)
        new_ref = pm.match(tc_list=[ref.text('he'), (comment, 1)],
                           return_obj=True)
        return new_ref
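
For orientation, a toy walk-through of the get_score heuristic defined above; the distance value here is made up, whereas in the parser it comes from WeightedLevenshtein.calculate(str_a, str_b, normalize=False):

# Hypothetical numbers, purely to illustrate the formula in get_score.
words_a = [u"example", u"words"]           # str_a = u"example words", len(str_a) == 13
dist = 3.0                                 # assumed unnormalized distance
score = 1.0 * (dist + 1) / (13 + 1) * 100  # (dist + smoothingFactor) / (len(str_a) + smoothingFactor) * normalizingFactor ~= 28.6
dumb_score = 22 * 2 - score                # ImaginaryContenderPerWord * len(words_a) - score ~= 15.4
# smaller distances leave a larger dumb_score, so closer matches rank higher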
Example 7
 def __init__(self):
     self.stop_words = []
     self.levenshtein = WeightedLevenshtein()
Example 8
from sources.functions import *
from research.mesorat_hashas_sefaria.mesorat_hashas import ParallelMatcher
from research.link_disambiguator.main import Link_Disambiguator
import os
from functools import reduce
from sefaria.utils.hebrew import strip_cantillation
from data_utilities.dibur_hamatchil_matcher import get_maximum_dh, ComputeLevenshteinDistanceByWord, match_text
from data_utilities.util import WeightedLevenshtein
levenshtein = WeightedLevenshtein()
mode = "0"
import json
import math


class ScoreManager:
    def __init__(self, word_counts_file):
        with open(word_counts_file, "r") as fin:
            self.word_counts = json.load(fin)
        self.max_count = 0
        for word, count in self.word_counts.items():
            if count > self.max_count:
                self.max_count = count

    def word_count_score(self, w):
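        # down-weight common words: rare or unseen words score close to 3, while the most frequent word scores close to 0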
        max_score = 1
        wc = self.word_counts.get(w, None)
        score = 1 if wc is None else -math.log10(20 * (wc + (self.max_count / 10 ** max_score))) + math.log10(
            20 * self.max_count)
        return 3 * score
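
Numerically, word_count_score rewards rarity; the expression simplifies to 3 * log10(max_count / (wc + max_count / 10)). With an assumed max_count of 1000:

# Illustrative values only (assumes max_count == 1000); results are rounded.
# wc = 1     (rare word)              -> 3 * log10(1000 / 101)  ~  2.99
# wc = 100                            -> 3 * log10(1000 / 200)  ~  2.10
# wc = 1000  (most frequent word)     -> 3 * log10(1000 / 1100) ~ -0.12
# wc is None (word not in word_counts) -> score is fixed at 1, so 3 * 1 = 3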