def suggest_people_duplicates():
    import json
    from collections import defaultdict
    from tqdm import tqdm
    from data_utilities.util import WeightedLevenshtein

    norm2peeps = defaultdict(list)
    letter_counts = defaultdict(int)
    with open('/home/nss/sefaria/datasets/ner/sefaria/temp/yerushalmi_basic_en_titles.json', 'r') as fin:
        jin = json.load(fin)
    for person_obj in jin:
        title = person_obj['manualTitles'][0]['text']
        norm_title = title.replace('Rebbi', '').replace('Rav', '').replace('‘', "'").replace('ˋ', "'").replace('`', "'").replace('ї', 'i').replace('ï', 'i').replace('î', 'i').replace('ĩ', 'i').replace('ś', 's').replace('š', 's').replace('Š', 'S').replace('ü', 'u').lower().replace('ẓ', 'z').replace('\u0301', '').replace('\u0308', '')
        norm_title = ' '.join(norm_title.split())
        for char in norm_title:
            letter_counts[char] += 1
        norm2peeps[norm_title] += [title]
    max_letter_count = max(letter_counts.values())
    min_letter_count = min(letter_counts.values())
    letter_freqs = {char: (count - min_letter_count) / (max_letter_count - min_letter_count) for char, count in letter_counts.items()}
    leven = WeightedLevenshtein(letter_freqs)
    clusters = []
    for i, (norm_title, titles) in tqdm(enumerate(norm2peeps.items()), total=len(norm2peeps)):
        found_match = False
        for j, (norm_title2, titles2) in enumerate(list(norm2peeps.items())[i+1:]):
            score = leven.calculate(norm_title, norm_title2)
            if score > 85:
                print('MATCHED\n', norm_title, '\n', norm_title2, '\n', score)
                found_match = True
                norm2peeps[norm_title2] += titles  # merge clusters
                break
        if not found_match:
            clusters += [titles]
    with open('/home/nss/sefaria/datasets/ner/sefaria/temp/yerushalmi_people_duplicates.json', 'w') as fout:
        json.dump(clusters, fout, indent=2, ensure_ascii=False)
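
# The min-max scaled letter frequencies above are passed to WeightedLevenshtein so that
# edits involving common letters are cheaper than edits involving rare ones, and
# leven.calculate() returns a 0-100 similarity compared against the 85 threshold.
# The internals of data_utilities.util.WeightedLevenshtein are not shown here; the
# function below is only an assumption-labeled sketch of that frequency-weighted idea,
# with its cost formula chosen for illustration.
def weighted_similarity(a, b, letter_freqs):
    def cost(ch):
        # assumed cost model: rare letters (freq near 0) cost more to edit
        return 1.0 - 0.5 * letter_freqs.get(ch, 0.0)

    # standard Levenshtein dynamic program with per-character costs
    prev = [0.0]
    for ch in b:
        prev.append(prev[-1] + cost(ch))
    for ca in a:
        cur = [prev[0] + cost(ca)]
        for j, cb in enumerate(b, 1):
            sub = prev[j - 1] + (0.0 if ca == cb else max(cost(ca), cost(cb)))
            cur.append(min(prev[j] + cost(ca), cur[-1] + cost(cb), sub))
        prev = cur
    # normalize the distance into a 0-100 similarity (100 == identical strings)
    max_dist = sum(cost(c) for c in a) + sum(cost(c) for c in b)
    return 100.0 * (1.0 - prev[-1] / max_dist) if max_dist else 100.0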
def __init__(self):
    self.levenshtein = WeightedLevenshtein()
    self.matcher = None
    try:
        with codecs.open("word_counts.json", "rb", encoding="utf8") as fin:
            self.word_counts = json.load(fin)
    except IOError:
        self.word_counts = {}
def __init__(self):
    self.stop_words = [
        u"ר'", u'רב', u'רבי', u'בן', u'בר', u'בריה', u'אמר', u'כאמר', u'וכאמר', u'דאמר', u'ודאמר', u'כדאמר',
        u'וכדאמר', u'ואמר', u'כרב', u'ורב', u'כדרב', u'דרב', u'ודרב', u'וכדרב', u'כרבי', u'ורבי', u'כדרבי',
        u'דרבי', u'ודרבי', u'וכדרבי', u"כר'", u"ור'", u"כדר'", u"דר'", u"ודר'", u"וכדר'", u'א״ר', u'וא״ר',
        u'כא״ר', u'דא״ר', u'דאמרי', u'משמיה', u'קאמר', u'קאמרי', u'לרב', u'לרבי', u"לר'", u'ברב', u'ברבי',
        u"בר'", u'הא', u'בהא', u'הך', u'בהך', u'ליה', u'צריכי', u'צריכא', u'וצריכי', u'וצריכא', u'הלל',
        u'שמאי', u"וגו'", u'וגו׳'
    ]
    self.levenshtein = WeightedLevenshtein()
for line in csv.reader(f):
    if line[0].startswith("Maskil"):
        ref, comm, found_ref, relevant_text = line
        rashi_ref = ref.replace("Maskil LeDavid, ", "Rashi on ")
        genesis_ref = ":".join(ref.replace("Maskil LeDavid, ", "").split(":")[:-1])
        if genesis_ref not in maskil_refs:
            maskil_refs[genesis_ref] = []
        maskil_refs[genesis_ref].append((ref, comm))
    else:
        new_csv.append(line)

exact_match = 0
more_than_2 = []
not_found = []
weighted = WeightedLevenshtein()
for genesis_ref in maskil_refs:
    these_maskil_refs = maskil_refs[genesis_ref]
    these_rashi_refs = Ref("Rashi on {}".format(genesis_ref)).all_segment_refs()
    rashi_dhs = [get_dh(strip_nekud(ref.text('he').text), ref) for ref in these_rashi_refs]
    for maskil_ref, maskil_comm in these_maskil_refs:
        finds = []
        for rashi_dh_tuple in rashi_dhs:
            rashi_ref, rashi_dh, rashi_dh_plus = rashi_dh_tuple
            rashi_dh = rashi_dh.replace(".", "")
            if maskil_comm.startswith(rashi_dh) or maskil_comm.startswith(rashi_dh.split()[0] + " "):
                finds.append((rashi_dh_tuple, maskil_ref, maskil_comm))
        if len(finds) == 1:
            exact_match += 1
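
# strip_nekud and get_dh are helpers defined elsewhere in the repo; from the unpacking
# above, get_dh is expected to return a (ref, dh, dh_plus) tuple for each Rashi segment.
# The function below is only a rough, assumption-labeled sketch of what strip_nekud
# might do: drop Hebrew cantillation marks and vowel points so the dibur hamatchil can
# be compared against the unvocalized Maskil LeDavid comment.
import re

def strip_nekud(s):
    # Hebrew combining marks U+0591-U+05C7, excluding punctuation such as maqaf
    # (U+05BE), paseq (U+05C0), and sof pasuq (U+05C3).
    return re.sub(u"[\u0591-\u05bd\u05bf\u05c1\u05c2\u05c4\u05c5\u05c7]", u"", s)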
def __init__(self):
    self.sheets = OrderedDict()
    self.sheet = None
    self.levenshtein = WeightedLevenshtein()
class Nechama_parser(object):

    def __init__(self):
        self.sheets = OrderedDict()
        self.sheet = None
        self.levenshtein = WeightedLevenshtein()

    def bs4_reader(self, file_list_names):
        """
        The main BeautifulSoup reader function, which iterates over all sheets and creates the objects.
        Probably should be in its own file.
        :param file_list_names: list of paths to the HTML sheet files
        :return: OrderedDict mapping file name to Sheet object
        """
        for html_sheet in file_list_names:
            content = BeautifulSoup(open("{}".format(html_sheet)), "lxml")
            print html_sheet
            top_dict = self.dict_from_html_attrs(content.find('div', {'id': "contentTop"}).contents)
            # print 'len_content type ', len(top_dict.keys())
            self.sheet = Sheet(html_sheet, top_dict["paging"].text, top_dict["h1"].text,
                               top_dict["year"].text, top_dict["pasuk"].text)
            self.sheets[html_sheet] = self.sheet
            body_dict = self.dict_from_html_attrs(content.find('div', {'id': "contentBody"}))
            self.sheet.sections.extend([v for k, v in body_dict.items() if re.search(u'ContentSection_\d', k)])  # check that these come in in the right order
            self.sheet.sheet_remark = body_dict['sheetRemark'].text
        return self.sheets

    def dict_from_html_attrs(self, contents):
        d = OrderedDict()
        for e in [e for e in contents if isinstance(e, element.Tag)]:
            if "id" in e.attrs.keys():
                d[e.attrs['id']] = e
            else:
                d[e.name] = e
        return d

    def get_score(self, words_a, words_b):
        normalizingFactor = 100
        smoothingFactor = 1
        ImaginaryContenderPerWord = 22
        str_a = u" ".join(words_a)
        str_b = u" ".join(words_b)
        dist = self.levenshtein.calculate(str_a, str_b, normalize=False)
        score = 1.0 * (dist + smoothingFactor) / (len(str_a) + smoothingFactor) * normalizingFactor
        dumb_score = (ImaginaryContenderPerWord * len(words_a)) - score
        return dumb_score

    def clean(self, s):
        s = unicodedata.normalize("NFD", s)
        s = strip_cantillation(s, strip_vowels=True)
        # backreferences must be escaped in the replacement string, otherwise \1 and \2
        # are interpreted as literal control characters
        s = re.sub(u"(^|\s)(?:\u05d4['\u05f3])($|\s)", u"\\1יהוה\\2", s)
        s = re.sub(ur"[,'\":?.!;־״׳]", u" ", s)
        s = re.sub(ur"\([^)]+\)", u" ", s)
        # s = re.sub(ur"\((?:\d{1,3}|[\u05d0-\u05ea]{1,3})\)", u" ", s)  # sefaria automatically adds pasuk markers. remove them
        s = bleach.clean(s, strip=True, tags=()).strip()
        s = u" ".join(s.split())
        return s

    def tokenizer(self, s):
        return self.clean(s).split()

    def check_reduce_sources(self, comment, ref):
        n = len(re.split(u'\s+', comment))
        pm = ParallelMatcher(self.tokenizer, dh_extract_method=None, ngram_size=3, max_words_between=4,
                             min_words_in_match=int(round(n * 0.8)), min_distance_between_matches=0,
                             all_to_all=False, parallelize=False, verbose=False, calculate_score=self.get_score)
        new_ref = pm.match(tc_list=[ref.text('he'), (comment, 1)], return_obj=True)
        return new_ref
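
# Illustrative walk-through of Nechama_parser.get_score. The distance value is a
# stand-in for whatever WeightedLevenshtein.calculate(str_a, str_b, normalize=False)
# actually returns for the two strings; only the arithmetic below comes from get_score.
words_a = [u"ויאמר", u"אלהים", u"יהי"]        # 3 words; u" ".join(words_a) is 15 chars
dist = 4.0                                    # assumed raw weighted edit distance
score = 1.0 * (dist + 1) / (15 + 1) * 100     # (dist + smoothing) / (len + smoothing) * 100 == 31.25, lower means closer
dumb_score = 22 * len(words_a) - score        # ImaginaryContenderPerWord * word count - score == 34.75, higher means a better candidate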
def __init__(self):
    self.stop_words = []
    self.levenshtein = WeightedLevenshtein()
from sources.functions import *
from research.mesorat_hashas_sefaria.mesorat_hashas import ParallelMatcher
from research.link_disambiguator.main import Link_Disambiguator
import os
import json
import math
from functools import reduce
from sefaria.utils.hebrew import strip_cantillation
from data_utilities.dibur_hamatchil_matcher import get_maximum_dh, ComputeLevenshteinDistanceByWord, match_text
from data_utilities.util import WeightedLevenshtein

levenshtein = WeightedLevenshtein()
mode = "0"


class ScoreManager:

    def __init__(self, word_counts_file):
        with open(word_counts_file, "r") as fin:
            self.word_counts = json.load(fin)
        self.max_count = 0
        for word, count in self.word_counts.items():
            if count > self.max_count:
                self.max_count = count

    def word_count_score(self, w):
        max_score = 1
        wc = self.word_counts.get(w, None)
        score = 1 if wc is None else -math.log10(20 * (wc + (self.max_count / 10 ** max_score))) + math.log10(20 * self.max_count)
        return 3 * score
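
# Usage sketch of ScoreManager. The file name and counts are hypothetical; the real
# word-counts JSON is produced elsewhere in the pipeline as a {word: count} mapping.
score_manager = ScoreManager("yerushalmi_word_counts.json")

# The most frequent words score near (or slightly below) zero, while rare and unseen
# words approach the cap of 3, so rarer words contribute larger scores.
print(score_manager.word_count_score(u"אמר"))        # common word -> close to 0
print(score_manager.word_count_score(u"בגדתא"))      # rare word -> close to 3
print(score_manager.word_count_score(u"לא-במילון"))  # unseen word -> exactly 3 * 1 == 3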