def copy_from_local():
    query = {"type": "sifrei mitzvot"}
    linkset = LinkSet(query)
    links = [l.contents() for l in linkset]
    # for link in links:
    #     for i, ref in enumerate(link["refs"]):
    #         if re.search("Sefer HaMitzvot", ref):
    #             link["refs"][i] = "Sefer HaMitzvot LaRambam"
    #             break
    # for link in links:
    #     ref_strings = link["refs"]
    #     for k, ref in enumerate(ref_strings):
    #         if text.Ref(ref).primary_category == u'Tanakh':  # careful: the Tanakh category also includes Tanakh commentaries!
    #             newrefs = ref_strings[:]
    #             newrefs[k] = text.Ref(ref_strings[k]).section_ref().normal()
    #             broadLink = Link().load({'refs': [newrefs[k], newrefs[(k + 1) % 2]]})
    #             if broadLink:
    #                 # raise DuplicateRecordError(u"more than one broader link exists: {} - {}".format(broadLink[0].refs[0], broadLink[0].refs[1]))
    #                 # tracker.delete(user, broadLink)
    #                 broadLink.delete()
    #                 print 'deleting Link {} {}'.format(broadLink.refs[0], broadLink.refs[1])
    post_link(links, VERBOSE=True)
    return links
def create_links(ls_ja):
    list_of_links = []
    for perek_index, perek in enumerate(ls_ja):
        for mishna_index, mishna in enumerate(perek):
            for comment_index, comment in enumerate(mishna):
                list_of_links.append(create_link_dicttionary(perek_index + 1, mishna_index + 1, comment_index + 1))
    functions.post_link(list_of_links)
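# A minimal sketch of the link dict that create_link_dicttionary() (defined
# elsewhere) presumably returns, matching the shape used by the other linkers
# in this file; the index titles and generated_by tag here are hypothetical
# placeholders, not the actual values used by this parser.
def _example_link_dictionary(perek, mishna, comment):
    return {
        'refs': [
            'Mishnah Berakhot {}:{}'.format(perek, mishna),  # hypothetical base ref
            'Rambam on Mishnah Berakhot {}:{}:{}'.format(perek, mishna, comment),  # hypothetical commentary ref
        ],
        'type': 'commentary',
        'auto': True,
        'generated_by': 'mishnah_parser',  # hypothetical tag
    }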
def parse_and_upload():
    cards = get_cards()
    links = []
    for card in cards:
        node = JaggedArrayNode()
        node.add_title(card, 'en', primary=True)
        node.add_title(u'רמב"ם ' + Ref(card.replace('Rambam ', '')).he_normal(), 'he', primary=True)
        node.key = card
        node.depth = 3
        node.addressTypes = ['Integer', 'Integer', 'Integer']
        node.sectionNames = ['Chapter', 'Mishnah', 'Comment']
        node.validate()
        node.toc_zoom = 2
        index = {
            'title': card,
            'categories': ['Commentary2', 'Mishnah', 'Rambam'],
            'schema': node.serialize(),
        }
        parsed = parser(card)
        links.extend(parsed['links'])
        version = {
            'versionTitle': u'Vilna Edition',
            'versionSource': 'http://primo.nli.org.il/primo_library/libweb/action/dlDisplay.do?vid=NLI&docId=NNL_ALEPH001300957',
            'language': 'he',
            'text': parsed['parsed']
        }
        print 'posting {}'.format(card)
        post_index(index)
        post_text(card, version, index_count='on')
    post_link(links)
def postLinks(listOfPotentialLink, badLinksFile, goodLinksFile):
    for eachLink in listOfPotentialLink:
        if Ref.is_ref(eachLink['refs'][1]):
            functions.post_link(eachLink)
            goodLinksFile.write(eachLink['refs'][0] + ' linked with ' + eachLink['refs'][1] + '\r\n')
        else:
            badLinksFile.write(eachLink['refs'][0] + ' linked with ' + eachLink['refs'][1] + '\r\n')
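# A usage sketch for postLinks(), assuming link dicts shaped like the ones
# built elsewhere in this file; the refs and filenames are hypothetical.
candidate_links = [{'refs': ['Genesis 1:1', 'Rashi on Genesis 1:1:1']}]
with open('good_links.txt', 'w') as good, open('bad_links.txt', 'w') as bad:
    postLinks(candidate_links, bad, good)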
def post():
    minchat = {'name': 'Minchat Chinuch', 'text': produce_parsed_data(filename)}
    sefer = {'name': 'Sefer HaChinukh', 'text': Ref('Sefer HaChinukh').text('he').text}
    chinukh_links = find_links(minchat, sefer, grab_dh, u'<b>', u'</b>')
    with codecs.open('links.txt', 'w', 'utf-8') as outfile:
        for each_link in chinukh_links:
            outfile.write(u'{}\n'.format(each_link['refs']))
    alt = construct_alt_struct('Chinukh_by_Parsha.csv', 'Chinukh Mitzva names.csv')
    cleaned = util.clean_jagged_array(minchat['text'], [m_pattern, comment_pattern, u'@[0-9]{2}', u'\n', u'\r'])
    with codecs.open('parsed.txt', 'w', 'utf-8') as outfile:
        util.jagged_array_to_file(outfile, cleaned, [u'Mitzva', u'Seif', u'Paragraph'])
    full_text = {
        'versionTitle': 'Minchat Chinuch, Piotrków, 1902',
        'versionSource': 'http://primo.nli.org.il/primo_library/libweb/action/dlDisplay.do?vid=NLI&docId=NNL_ALEPH001175092',
        'language': 'he',
        'text': cleaned
    }
    index = construct_index(alt)
    functions.post_index(index)
    functions.post_text('Minchat Chinuch', full_text)
    functions.post_link(chinukh_links)
def upload():
    links = []
    for tractate in cards:
        he_name = Ref(' '.join(tractate.split()[1:])).he_normal()
        he_name = u'רמב"ם {}'.format(he_name)
        node = JaggedArrayNode()
        node.add_title(tractate, 'en', primary=True)
        node.add_title(he_name, 'he', primary=True)
        node.key = tractate
        node.depth = 3
        node.addressTypes = ['Integer', 'Integer', 'Integer']
        node.sectionNames = ['Chapter', 'Mishnah', 'Comment']
        node.validate()
        index = {
            'title': tractate,
            'categories': ['Commentary2', 'Mishnah', 'Rambam'],
            'schema': node.serialize(),
            'toc_zoom': 2
        }
        parsed = parse_file('{}.txt'.format(tractate))
        links.extend(parsed['links'])
        version = {
            'versionTitle': u'Vilna Edition',
            'versionSource': 'http://primo.nli.org.il/primo_library/libweb/action/dlDisplay.do?vid=NLI&docId=NNL_ALEPH001300957',
            'language': 'he',
            'text': parsed['parsed text']
        }
        print 'posting {}'.format(tractate)
        post_index(index)
        post_text(tractate, version, index_count='on')
    post_link(links)
def post_links(self):
    links = [{
        'refs': [l[0], l[1]],
        'type': 'commentary',
        'auto': True,
        'generated_by': 'Divrei Emet linker'
    } for l in self.stored_links]
    post_link(links)
def post_simple_commentaries():
    ramban_node, rasag_node = JaggedArrayNode(), JaggedArrayNode()
    ramban_text = parse_general('yitzira_ramban.txt')
    rasag_text = parse_general('yitzira_rasag.txt')

    ramban_node.add_title("Ramban on Sefer Yetzirah", 'en', primary=True)
    ramban_node.add_title(u'רמב"ן על ספר יצירה', 'he', primary=True)
    ramban_node.key = "Ramban on Sefer Yetzirah"
    ramban_node.addressTypes = ['Integer', 'Integer', 'Integer']
    ramban_node.sectionNames = ["Chapter", "Mishnah", "Comment"]
    ramban_node.toc_zoom = 2
    ramban_node.depth = 3
    ramban_node.validate()

    rasag_node.add_title("Rasag on Sefer Yetzirah", 'en', primary=True)
    rasag_node.add_title(u'רס"ג על ספר יצירה', 'he', primary=True)
    rasag_node.key = "Rasag on Sefer Yetzirah"
    rasag_node.addressTypes = ['Integer', 'Integer', 'Integer']
    rasag_node.sectionNames = ["Chapter", "Mishnah", "Comment"]
    rasag_node.toc_zoom = 2
    rasag_node.depth = 3
    rasag_node.validate()

    ramban_index = {
        "title": "Ramban on Sefer Yetzirah",
        "categories": ["Commentary2", "Kabbalah", "Ramban"],
        "language": "he",
        "schema": ramban_node.serialize()
    }
    post_index(ramban_index)
    post_text("Ramban on Sefer Yetzirah", {
        'versionTitle': 'Ramban on Sefer Yetzirah, Warsaw 1884',
        'versionSource': 'http://primo.nli.org.il/primo_library/libweb/action/dlDisplay.do?vid=NLI&docId=NNL_ALEPH001310968',
        'language': 'he',
        'text': ramban_text
    })

    rasag_index = {
        "title": "Rasag on Sefer Yetzirah",
        "categories": ["Commentary2", "Kabbalah", "Rasag"],
        "language": "he",
        "schema": rasag_node.serialize()
    }
    post_index(rasag_index)
    post_text("Rasag on Sefer Yetzirah", {
        'versionTitle': 'Rasag on Sefer Yetzirah, Warsaw 1884',
        'versionSource': 'http://primo.nli.org.il/primo_library/libweb/action/dlDisplay.do?vid=NLI&docId=NNL_ALEPH001310968',
        'language': 'he',
        'text': rasag_text
    })

    links = linker(ramban_text, "Ramban on Sefer Yetzirah")
    links.extend(linker(rasag_text, "Rasag on Sefer Yetzirah"))
    post_link(links)
def post(self):
    for index in self.base_indices:
        post_index(index, weak_network=True)
    for index in self.commentaryIndices:
        post_index(index)
    for version in self.versionList:
        print version['ref']
        post_text(version['ref'], version['version'], index_count='on', weak_network=True)
    post_link(self.linkSet)
def save_links_post_request(category):
    query = {
        "generated_by": "mesorat_hashas.cs (Dicta) {}".format(category),
        "auto": True,
        "type": "Automatic Mesorat HaShas"
    }
    ls = LinkSet(query)
    links = [l.contents() for l in ls]
    post_link(links)
def create_links(gra_ja):
    list_of_links = []
    for perek_index, perek in enumerate(gra_ja):
        for mishna_index, mishna in enumerate(perek):
            for comment_index, comment in enumerate(mishna):
                list_of_links.append(create_link_dicttionary(perek_index + 1, mishna_index + 1, comment_index + 1))
    functions.post_link(list_of_links)
def main():
    text_dict = raavad_parse()
    post_raavad_index()
    post_raavad_text(text_dict)
    # save to mongo the links text <-> raavad
    post_link(linking(text_dict['old_parsing_of_perush']))
    # save to mongo the links 32 <-> perush 32
    post_link(link_32())
def post_the_text(jagged_array, title_counter):
    ref = create_ref(title_counter)
    text = create_text(jagged_array)
    if title_counter > 0:
        list_of_links = create_links(jagged_array, title_counter)
    # post_text must run after the links are created, because create_links() changes the actual text
    functions.post_text(ref, text)
    if title_counter > 0:
        print 1
        functions.post_link(list_of_links)
def save_links_post_request(category): query = {"generated_by": "mesorat_hashas.py {}".format(category), "auto": True, "type": "Automatic Mesorat HaShas"} ls = LinkSet(query) links = [l.contents() for l in ls] i = 0 while i < len(links): print "Posting [{}:{}]".format(i, i+4999) post_link(links[i:i+5000]) i += 5000 pytime.sleep(10)
def post_ein_mishpat(massekhet): query = {"generated_by":"Ein Mishpat Cluster {}".format(massekhet)} # query_talmud = {''' "generated_by": "Ein Mishpat Cluster {}", $and: [ {{ "refs.0": /.*{}.*/i }} ] '''.format(massekhet,massekhet)} # query_tush = {''' "generated_by": "Ein Mishpat Cluster {}", $and: [ {{ "refs.0": /.*{}.*/i }} ] '''.format(massekhet)} # query_rambam = {''' "generated_by": "Ein Mishpat Cluster {}", $and: [ {{ "refs.0": /.*{}.*/i }} ] '''.format(massekhet)} # query_semag = {''' "generated_by": "Ein Mishpat Cluster {}", $and: [ {{ "refs.0": /.*{}.*/i }} ] '''.format(massekhet)} linkset = LinkSet(query) links = [l.contents() for l in linkset] # for l in links: # l["generated_by"] = "Ein Mishpat Cluster" post_link(links) return links
def post():
    parsed = parse('targum.txt')
    for i in range(1, 3):
        functions.post_index(build_index(i))
        version = {
            'versionTitle': 'Wikisource Aramaic Targum to Chronicles',
            'versionSource': url,
            'language': 'he',
            'text': parsed[i - 1]
        }
        functions.post_text('Aramaic Targum to {} Chronicles'.format('I' * i), version)
    functions.post_link(build_links(parsed))
def post():
    post_index(construct_index())
    base_text = restructure_text()
    links = build_links(base_text)
    version = {
        'versionTitle': u'Derech Chaim, Maharal',
        'versionSource': u'http://mobile.tora.ws/',
        'language': 'he',
        'text': base_text
    }
    post_text("Derech Chaim", version)
    version['text'] = get_intro()
    post_text("Derech Chaim, Author's Introduction", version, index_count='on')
    post_link(links)
def post(self):
    for index in self.base_indices:
        post_index(index, weak_network=True)
    for he_author in self.commentarySchemas.keys():
        en_author = DCXMLsubs.commentatorNames[he_author]
        index = {
            'title': en_author,
            'categories': ['Commentary2', 'Masechtot Ketanot', en_author],
            'schema': self.commentarySchemas[he_author].serialize()
        }
        post_index(index)
    for version in self.versionList:
        post_text(version['ref'], version['version'], index_count='on', weak_network=True)
    post_link(self.linkSet)
def post_unambiguous_links(post=False):
    links = []
    with open(ROOT + "/unambiguous_links.csv", "r") as fin:
        cin = csv.DictReader(fin)
        for row in cin:
            link = {
                "generated_by": "link_disambiguator",
                "auto": True,
                "type": "",
                "refs": [row["Quoting Ref"], row["Quoted Ref"]]
            }
            links += [link]
    print("Total Links: {}".format(len(links)))
    if post:
        i = 0
        batch_size = 50
        while i < len(links):
            print("Posting [{}:{}]".format(i, i + batch_size - 1))
            print(post_link(links[i:i + batch_size]))
            i += batch_size
    else:
        for link_obj in tqdm(links):
            try:
                Link(link_obj).save()
            except DuplicateRecordError:
                pass
def post():
    books = file_to_books()
    for book in library.get_indexes_in_category('Torah'):
        books[book] = align_text(books[book], u'@\u05e4\u05e8\u05e7 [\u05d0-\u05ea]{1,2}', u'[0-9]{1,2}\.')
    functions.post_index(build_index())
    node_names = ['Introduction'] + library.get_indexes_in_category('Torah')
    for name in node_names:
        version = {
            'versionTitle': 'Tafsir al-Torah bi-al-Arabiya, Paris, 1893',
            'versionSource': 'http://primo.nli.org.il/primo_library/libweb/action/dlDisplay.do?vid=NLI&docId=NNL_ALEPH001863864',
            'language': 'he',
            'text': books[name]
        }
        functions.post_text('Tafsir Rasag, {}'.format(name), version)
    functions.post_link(build_links(books))
def post_index_text_links():
    tracs = library.get_indexes_in_category('Mishnah')
    parsed = parse_files()
    link_refs = [collect_links(tractate) for tractate in tracs]
    full_links = build_links(link_refs)
    for linker in full_links:
        add_dh_to_text(linker, parsed)
    for num, data in enumerate(sorted(parsed.keys())):
        print num + 1, data
        for attempt in range(3):
            try:
                upload(parsed[data], True)
            except URLError:
                print 'handling weak network'
            else:
                break
        else:
            raise URLError
        # util.ja_to_xml(parsed[data]['data'].array(), ['chapter', 'comment', 'line'])
        # break
    functions.post_link(full_links)
    os.remove('errors.html')
def add_links(full_text, upload=False):
    """
    :param full_text: Data structure from parse_text()
    :param upload: set to True to post the links; otherwise the function does nothing
    """
    if not upload:
        return
    for book in full_text.keys():
        for chap_index, chapter in enumerate(full_text[book]):
            for verse_index, verse in enumerate(chapter):
                for comment in xrange(len(verse)):
                    post_link({
                        'refs': [
                            '{}.{}.{}'.format(book, chap_index + 1, verse_index + 1),
                            'Chizkuni,_{}.{}.{}.{}'.format(book, chap_index + 1, verse_index + 1, comment + 1)
                        ],
                        'type': 'commentary',
                        'auto': True,
                        'generated_by': 'Chizkuni linker'
                    })
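# Illustrative output of the loop above: for book 'Genesis', chapter index 0,
# verse index 0, comment index 0 it posts refs like
#     ['Genesis.1.1', 'Chizkuni,_Genesis.1.1.1']
# Note that, unlike the batched posters elsewhere in this file, this function
# issues one post_link() call (one HTTP request) per comment.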
def post_links():
    from sources.functions import post_link
    from sefaria.system.exceptions import DuplicateRecordError
    with open("research/parallel_matcher_scripts/final_selichot_links.csv", "r") as fin:
        c = csv.DictReader(fin)
        links = []
        for row in c:
            if row["Keep?"] == "Yes":
                links += [{
                    "refs": [row["Tanakh Ref"], row["Selichot Ref"]],
                    "auto": True,
                    "generated_by": "selichot_edot_hamizrach_parallel_matcher"
                }]
    for l in links:
        link = Link(l)
        try:
            link.save()
        except DuplicateRecordError:
            print("DUP", l)
    post_link(links)
"refs": [ "HaGra on Sefer Yetzirah Gra Version " + '%d:%d:%d' % tuple(x + 1 for x in dh['indices']), "Sefer Yetzirah Gra Version " + '%d:%d' % tuple(x + 1 for x in dh['indices'][:2]), ], "type": "commentary", "auto": True, "generated_by": "gra_parse" }) dh_text = dh['data'] # append to links list gra_links.append(link) # shave off the last link of "slik" shpuldn't be linked in gra_links.pop() # save to mongo the list of dictionaries. post_link(gra_links) # link_ofen = ( # { # "refs": [ # "Pri Yitzhak on Sefer Yetzirah " + '%d:%d:%d' %tuple(x+1 for x in dh['indices']), # "Sefer Yetzirah Ari Version " + '%d:%d' %tuple(x+1 for x in dh['indices'][:2]), # ], # "type": "reference" # })
# -*- coding: utf-8 -*-
import codecs

import regex

from sefaria.model import *
from sources import functions
from data_utilities import util
from sources.Targum_Jerusalem_Hebrew import tjh_functions

english_names = ['Genesis', 'Exodus', 'Leviticus', 'Numbers', 'Deuteronomy']

index = tjh_functions.create_index_record()
functions.post_index(index)

all_of_humash = tjh_functions.parse()
for book, book_name in zip(all_of_humash, english_names):
    ref = 'Targum Jerusalem, {}'.format(book_name)
    text = tjh_functions.create_text(book)
    functions.post_text(ref, text)

list_of_links = tjh_functions.create_links(all_of_humash)
functions.post_link(list_of_links)

testing_file = codecs.open("testing_file.txt", 'w', 'utf-8')
util.jagged_array_to_file(testing_file, all_of_humash, ['Book', 'Chapter', 'Verse'])
testing_file.close()
    else:
        raise AssertionError("{} has {} comments".format(comment_ref.normal(), len(comment_links)))
    return {'add': add, 'remove': remove}


server = 'http://*****:*****@55([\u05d0-\u05ea]{1,3})', u"Magen Avraham")
to_add.extend(result['add'])
to_remove.extend(result['remove'])

for i in to_remove:
    r = requests.delete('{}/api/links/{}'.format(server, i._id))
    print r.text

to_add = [{
    'refs': i,
    'type': 'commentary',
    'auto': True,
    'generated_by': 'Vilna Link Fixer'
} for i in to_add]

r = post_link(to_add, server=server)
    first_b = x.find("<b>")
    second_b = x.find("</b>")
    x = x[first_b + 3:second_b]
    x = x.replace(u"<b>", u"").replace(u"</b>", u"").replace(u"אלקים", u"אלהים").replace(u" כו", u"")
    if u"וכו'" in x:
        x = x.split(u"וכו'", 1)[0]
    return u" ".join(x.split(u" ")[0:10]).strip()


base_tokenizer = lambda x: [x for x in x.split()]
index = library.get_index("Maor VaShemesh")
section_refs = index.all_section_refs()
current_book = "Genesis"
ls = []
for section in section_refs:
    print section
    section_text = section.text('he').text
    # section_dh_text = [get_dh(line) for text in section_text for line in text if "<b>" in line and "</b>" in line]
    ja = section.index_node
    ja_title = ja.get_primary_title()
    parasha = db.parshiot.find({"parasha": ja_title})
    if list(parasha) != []:
        parasha = list(db.parshiot.find({"parasha": ja_title}))[0]
        current_parasha = parasha["ref"].split()[0]
        tc_current_book = Ref(current_parasha).text('he')
        matches = match_ref(tc_current_book, section_text, base_tokenizer,
                            dh_extract_method=get_dh, word_threshold=0.35,
                            char_threshold=0.26)["matches"]
        for i, match in enumerate(matches):
            if match:
                link = {
                    "refs": [match.normal(), "{} {}".format(section.normal(), i + 1)],
                    "auto": True,
                    "type": "Commentary",
                    "generated_by": "maor_vashemesh"
                }
                ls.append(link)
post_link(ls, server="http://proto.sefaria.org")
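# Illustrative (made-up input): the get_dh() fragment above pulls the bolded
# header, strips the markers, normalizes אלקים to אלהים, cuts at וכו', and
# keeps at most the first ten words:
#     get_dh(u"<b>בדבר הזה וכו'</b> ושאר דברים")  ->  u"בדבר הזה"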
            'versionSource': 'http://www.toratemetfreeware.com/',
            'language': 'he',
            'text': parsed_data[book]
        }
        functions.post_text('Siftei Hakhamim, {}'.format(book), version)


def manual_links():
    """
    Some links had to be created manually by the content team. The refs to link were saved in a csv.
    :return: JSON object of links parsed from the aforementioned csv
    """
    with open('siftei hakhamim manual links.csv') as infile:
        csv_reader = ucsv.reader(infile, delimiter=';')
        links = [{
            'refs': [ref[0], ref[1]],
            'type': 'commentary',
            'auto': False,
            'generated_by': 'Sefaria Content Team'
        } for ref in csv_reader]
    return links


parsed = parse_multiple()
slinks = generate_links(parsed)
functions.post_index(build_index())
post_text(parsed)
functions.post_link(slinks)
functions.post_link(manual_links())
def post_all_smk(ja_smk, ja_raph, ja_hagahot, raph_links, hg_links):
    post_smk(ja_smk)
    post_raph(ja_raph)
    post_link(raph_links)
    post_hagahot(ja_hagahot)
    post_link(hg_links)
    schema.validate()
    index = {
        "title": "Malbim on Genesis",
        "collective_title": "Malbim",
        "base_text_titles": ["Genesis"],
        "categories": ["Tanakh", "Torah", "Commentary", "Malbim"],
        "schema": schema.serialize()
    }
    return index


def upload_text(parser):
    assert isinstance(parser, Malbim)
    book = parser.parsed_text
    version = {
        "versionTitle": "Malbim, Vilna Romm, 1892.",
        "versionSource": 'http://dlib.rsl.ru/viewer/01006563898#?page=1',
        "language": 'he',
        "text": book
    }
    functions.post_text("Malbim on Genesis", version, index_count='on')


malbim = Malbim(path)
functions.post_index(build_index(malbim))
upload_text(malbim)
functions.post_link(build_links(malbim))
def save_links_post_request(): query = {"generated_by": "mesorat_hashas.py", "auto": True, "type": "Automatic Mesorat HaShas"} ls = LinkSet(query) links = [l.contents() for l in ls] post_link(links)
def post_links(links):
    functions.post_link(links)
version, index_count="on", server=user_args.server) else: functions.post_text(book_name, version, index_count="on", server=user_args.server) flags = dict( versionTitleInHebrew=u'אשלי רברבי: שלחן ערוך יורה דעה, למברג תרמ"ח') if user_args.title is None: flags['priority'] = 2 functions.post_flags( dict(ref=book_name, lang='he', vtitle=version['versionTitle']), flags, user_args.server) if links: functions.post_link(links, server=user_args.server) if user_args.no_slack: pass else: requests.post(os.environ["SLACK_URL"], json={ "text": u"{} Upload Complete".format( user_args.title if user_args. title else u"Shulchan Arukh, Yoreh De'ah") })
def save_links_post_request(): query = {"generated_by":"dibur_hamatchil_matcher review","auto":False} ls = LinkSet(query) links = [l.contents() for l in ls] post_link(links)
    node.sectionNames = ['Chapter', 'Verse', 'Comment']
    node.toc_zoom = 2
    record.append(node)
    record.validate()
    index = {
        "title": "Baal HaTurim",
        "categories": ["Commentary2", "Torah", "Baal HaTurim"],
        "schema": record.serialize()
    }
    return index


def post_text(parsed_data):
    for book in library.get_indexes_in_category('Torah'):
        version = {
            'versionTitle': 'Baal HaTurim',
            'versionSource': 'http://www.toratemetfreeware.com/',
            'language': 'he',
            'text': parsed_data[book]
        }
        functions.post_text('Baal HaTurim, {}'.format(book), version)


parsed = parse_multiple()
links = linker(parsed)
index = build_index()
functions.post_index(index)
post_text(parsed)
functions.post_link(links)
with codecs.open('Manual links.txt', 'r', 'utf-8') as file_obj:
    manual_links_str = file_obj.read()
manual_links = re.split(r'\n', manual_links_str)
for item in manual_links:
    match = {
        'refs': [
            f'Eliyah Rabbah on Shulchan Arukh, Orach Chayim {item[:5]}',
            f'Shulchan Arukh, Orach Chayim {item[6:]}'
        ],
        'auto': True,
        'generated_by': 'Parse_ER - Manual Links',
        'type': 'commentary',
    }
    links.append(match)

# 286:10 - 286:3
match = {
    'refs': [
        'Eliyah Rabbah on Shulchan Arukh, Orach Chayim 286:10',
        'Shulchan Arukh, Orach Chayim 286:3'
    ],
    'auto': True,
    'generated_by': 'Parse_ER - Manual Links',
    'type': 'commentary',
}
links.append(match)

post_link(links, server=server)
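# Note: the slices item[:5] and item[6:] appear to assume each line of
# 'Manual links.txt' is a fixed-width pair such as "286:1 286:3" (a
# five-character ref, a separator character, then the second ref); lines in
# any other format would be mis-sliced.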
ja_smk = parse_semak('Semak.txt')
# siman_page = map_semak_page_siman(ja_smk, to_print=True)
# letter_ja = parse_Raph_by_letter(u'Raph_on_Semak.txt')
# raph_smk_alignment = raph_alignment_report(ja_smk, letter_ja)
# ja_raph = parse_Raph_simanim(raph_smk_alignment)
# post_raph(ja_raph)
# link_raph(ja_raph)  # try to find where this is coming from
# raph = parse_Raph_by_letter('Raph_on_Semak.txt')
# raph_links = link_raph(ja_smk, ja_raph)
# ja_hagahot = parse_hagahot_by_letter(u'Semak_hagahot_chadashot.txt')
# hgh_align = hagahot_alignment(ja_smk, ja_raph, ja_hagahot)
# ja_hagahot = hagahot_parse(ja_hagahot, hgh_align)
# hg_links = link_hg(ja_hagahot, hgh_align, ja_raph)
# post_all_smk(ja_smk, ja_raph, ja_hagahot, raph_links, hg_links)
# smg_links = link_smg(ja_smk, u'smg_smk_test')
# post_link(smg_links, VERBOSE=True)
post_link(link_remazim(), VERBOSE=True)
# remazim_sm_g_k = link_smk_remazim_to_smg_remazim(smg_links)
# post_link(remazim_sm_g_k, VERBOSE=True)
# link_rambam("testrambamibid.txt")
# get_citations(ja_smk, "exctract")
# fromCSV(u'exctract.csv', u'newfile', u'full')
old = 22
new = 23
# rewrtie_csv(u'fixed{}.csv'.format(old), u'fixed{}'.format(new), u'full', toWriteHeaders=[u'siman', u'smk_segment', u'rambam', u'smg', u'tur', u'full'])
# rewrtie_csv(u'fixed{}.csv'.format(old), u'smk_links', u'full', toWriteHeaders=[u'siman', u'smk_segment', u'rambam', u'smg', u'tur', u'full'])
# smkDerivenLinks, links_smg = link_rambam_smg_tur(u'fixed{}.csv'.format(new))
# post_link(smkDerivenLinks, VERBOSE=True)
# remazim_sm_g_k = link_smk_remazim_to_smg_remazim(links_smg)
# post_link(remazim_sm_g_k, VERBOSE=True)
    node.addressTypes = ['Integer', 'Integer', 'Integer']
    node.sectionNames = ['Chapter', 'Verse', 'Comment']
    node.toc_zoom = 2
    record.append(node)
    record.validate()
    index = {
        "title": "Siftei Hakhamim",
        "categories": ["Commentary2", "Torah", "Rashi"],
        "schema": record.serialize()
    }
    return index


def post_text(parsed_data):
    for book in library.get_indexes_in_category('Torah'):
        version = {
            'versionTitle': 'Siftei Hakhamim',
            'versionSource': 'http://www.toratemetfreeware.com/',
            'language': 'he',
            'text': parsed_data[book]
        }
        functions.post_text('Siftei Hakhamim, {}'.format(book), version)


parsed = parse_multiple()
slinks = generate_links(parsed)
functions.post_index(build_index())
post_text(parsed)
functions.post_link(slinks)
                  for title in storage_object.get_index_titles()])
    for category_list in categories:
        add_category(category_list[-1], category_list, server=destination)


ensure_categories(storage, server)
if num_processes > 1:
    pool = Pool(num_processes)
    pool.map(partial_upload_index, titles)
    pool.map(partial_upload_version, titles)
else:
    regular_output = sys.stdout
    log_file = open('upload_log.log', 'w')
    progress_bar = tqdm(total=len(titles))
    for t in titles:
        sys.stdout = log_file
        partial_upload_index(t)
        partial_upload_version(t)
        sys.stdout = regular_output
        progress_bar.update(1)
    log_file.close()

post_link(storage.generate_links(), server=server, weak_network=True)
with codecs.open('All_Peregrine_Titles.json', 'w', 'utf-8') as fp:
    json.dump(titles, fp)
requests.post(os.getenv('SLACK_URL'), json={'text': 'Peregrine Complete :owl:'})
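# Design note on the single-process branch above: sys.stdout is temporarily
# redirected to upload_log.log around each upload so that per-title output
# lands in the log file instead of interleaving with tqdm's progress bar,
# then restored before the bar is updated.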
"Pri Yitzhak on Sefer Yetzirah Gra Version " + '%d:%d:%d' %tuple(x+1 for x in dh['indices']), "Sefer Yetzirah Gra Version " + '%d:%d' %tuple(x+1 for x in dh['indices'][:2]), ], "type": "commentary", "auto": True, "generated_by": "pri_yitzhak_parse" }) dh_text = dh['data'] # append to links list pri_links.append(link) # shave off the last link of "slik" shpuldn't be linked in pri_links.pop() return pri_links # find links in the pri that are of form bookName(chapter leter) def find_links_in_pri(): a = ur'\(.*?\)' with codecs.open('yitzira_pri_yitzhak.txt', 'r', 'utf-8') as fp: lines = fp.readlines() txt = " ".join(lines) books = library.get_titles_in_string(txt,'he') for b in books: site_re = ur"{}{}{} ".format('\(\s*',b,'.*?\)') site = regex.search(site_re,txt) if site: print site.span(), site.group() # save to mongo the list of dictionaries. post_link(link_pri(pri))
if user_args.add_term:
    add_term(user_args.title, book_xml.titles['he'], server=user_args.server)

index = index_methods.get(book_name, create_simple_index)(
    en_title=book_name,
    he_title=he_book_name,
    commentator=user_args.title,
    server=user_args.server
)
if user_args.verbose:
    print index
post_index(index, server=user_args.server)

if post_parse.get(book_name):
    post_parse[book_name](book_ja)

version = {
    'versionTitle': "Shulhan Arukh, Hoshen ha-Mishpat; Lemberg, 1898",
    'versionSource': "http://primo.nli.org.il/primo_library/libweb/action/dlDisplay.do?vid=NLI&docId=NNL_ALEPH002097773",
    'language': 'he',
    'text': book_ja
}
post_text(book_name, version, index_count='on', server=user_args.server)
if links:
    post_link(links, server=user_args.server)
requests.post(os.environ['SLACK_URL'], json={'text': 'Upload Complete'})
# version = {
#     "versionTitle": "Maginei Eretz; Shulchan Aruch Orach Chaim, Lemberg, 1893",
#     "versionTitleInHebrew": u"""ספר מגיני ארץ; שלחן ערוך. למברג, תרנ"ג""",
#     "versionSource": "http://primo.nli.org.il/primo_library/libweb/action/dlDisplay.do?vid=NLI&docId=NNL_ALEPH002084080",
#     "language": "he",
#     "text": book_ja,
# }
version = {
    "versionTitle": "Maginei Eretz: Shulchan Aruch Orach Chaim, Lemberg, 1893",
    "versionSource": "http://primo.nli.org.il/primo_library/libweb/action/dlDisplay.do?vid=NLI&docId=NNL_ALEPH002084080",
    "language": "he",
    "text": book_ja,
}
functions.post_text(book_name, version, index_count="on", server=user_args.server)
if links:
    functions.post_link(links, server=user_args.server)

# for title, clean_func in post_parse.items():
#     print
#     print title
#     comm = commentaries.get_commentary_by_title(title.split(" on")[0])
#     comm = check_marks(comm, clean_func)
#
# print
# print "Checking Orach Chaim"
# base = check_marks(root.get_base_text(), orach_chaim_clean)

functions.post_flags(
    {'ref': book_name, 'lang': 'he', 'vtitle': version['versionTitle']},
    {"versionTitleInHebrew": u"""ספר מגיני ארץ: שלחן ערוך. למברג, תרנ"ג"""},
    user_args.server
)
try:
    requests.post(os.environ['SLACK_URL'], json={'text': '{} uploaded successfully'.format(book_name)})
if __name__ == "__main__": ja_smk = parse_semak('Semak.txt') # # siman_page = map_semak_page_siman(ja_smk, to_print=True) # letter_ja = parse_Raph_by_letter(u'Raph_on_Semak.txt') # raph_smk_alignment = raph_alignment_report(ja_smk, letter_ja) # ja_raph = parse_Raph_simanim(raph_smk_alignment) # # # post_raph(ja_raph) # # # link_raph(ja_raph) # try to find where this is coming from # raph = parse_Raph_by_letter('Raph_on_Semak.txt') # raph_links = link_raph(ja_smk, ja_raph) # ja_hagahot = parse_hagahot_by_letter(u'Semak_hagahot_chadashot.txt') # hgh_align = hagahot_alignment(ja_smk, ja_raph, ja_hagahot) # ja_hagahot = hagahot_parse(ja_hagahot, hgh_align) # hg_links = link_hg(ja_hagahot, hgh_align, ja_raph) # # # post_all_smk(ja_smk, ja_raph, ja_hagahot, raph_links, hg_links) # smg_links = link_smg(ja_smk, u'smg_smk_test') # post_link(smg_links, VERBOSE=True) # post_link(link_remazim(), VERBOSE=True) # remazim_sm_g_k = link_smk_remazim_to_smg_remazim(smg_links) # post_link(remazim_sm_g_k, VERBOSE=True) # link_rambam("testrambamibid.txt") # get_citations(ja_smk, "exctract") # fromCSV(u'exctract.csv', u'newfile', u'full') old = 22 new = 23 # rewrtie_csv(u'fixed{}.csv'.format(old), u'fixed{}'.format(new), u'full', toWriteHeaders=[u'siman', u'smk_segment', u'rambam', u'smg', u'tur', u'full']) smkDerivenLinks, links_smg = link_rambam_smg_tur(u'fixed{}.csv'.format(new)) post_link(smkDerivenLinks, VERBOSE=True) remazim_sm_g_k = link_smk_remazim_to_smg_remazim(links_smg) post_link(remazim_sm_g_k, VERBOSE=True)
gra_links = []
# use a generator to walk the text and yield the three-level indices
for dh in traverse_ja(gra):
    link = ({
        "refs": [
            "HaGra on Sefer Yetzirah Gra Version " + '%d:%d:%d' % tuple(x + 1 for x in dh['indices']),
            "Sefer Yetzirah Gra Version " + '%d:%d' % tuple(x + 1 for x in dh['indices'][:2]),
        ],
        "type": "commentary",
        "auto": True,
        "generated_by": "gra_parse"
    })
    dh_text = dh['data']
    # append to the links list
    gra_links.append(link)

# shave off the last link; the closing "slik" shouldn't be linked
gra_links.pop()

# save to mongo the list of dictionaries
post_link(gra_links)

# link_ofen = ({
#     "refs": [
#         "Pri Yitzhak on Sefer Yetzirah " + '%d:%d:%d' % tuple(x + 1 for x in dh['indices']),
#         "Sefer Yetzirah Ari Version " + '%d:%d' % tuple(x + 1 for x in dh['indices'][:2]),
#     ],
#     "type": "reference"
# })
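# traverse_ja() is defined elsewhere; a minimal sketch of a compatible
# generator, assuming it yields {'indices': (i, j, k), 'data': segment} for
# every leaf string of a depth-3 jagged array, as the loop above expects.
def traverse_ja(ja, path=()):
    for i, item in enumerate(ja):
        if isinstance(item, list):
            # recurse into sub-arrays, extending the index path
            for leaf in traverse_ja(item, path + (i,)):
                yield leaf
        else:
            # leaf string: report its full index path and contents
            yield {'indices': path + (i,), 'data': item}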