def test_migrate_to_complex_structure():
    """Migrate MigrateBook to a ten-part complex schema and verify the outcome.

    The test builds a SchemaNode root with ten JaggedArrayNode children,
    calls schema.migrate_to_complex_structure with a three-entry ref mapping,
    and then checks node titles, migrated text, migrated links and sheet refs.
    It ends by deleting the migrated index and calling create_simple_text().
    """
    part_depths = [2, 1, 2, 1, 1, 1, 1, 1, 1, 1]

    complex_root = SchemaNode()
    complex_root.key = "MigrateBook"
    complex_root.add_title("MigrateBook", "en", primary=True)
    complex_root.add_title("הספר", "he", primary=True)
    for position, depth in enumerate(part_depths):
        part = JaggedArrayNode()
        part.add_title('Part {}'.format(position + 1), 'en', primary=True)
        part.add_title('חלק {}'.format(position + 1), 'he', primary=True)
        part.key = str(position)
        part.depth = depth
        part.addressTypes = ["Integer"] * depth
        part.sectionNames = ["Paragraph"] * depth
        complex_root.append(part)
    complex_root.validate()

    # Old simple refs -> new complex refs.
    mappings = {}
    mappings["MigrateBook 1-2"] = "MigrateBook, Part 1"
    mappings["MigrateBook 3:1-3"] = "MigrateBook, Part 2"
    mappings["MigrateBook 4"] = "MigrateBook, Part 3"

    schema.migrate_to_complex_structure("MigrateBook", complex_root.serialize(), mappings)

    # Looking up "Complex MigrateBook" is expected to fail after the migration.
    with pytest.raises(BookNameError):
        library.get_index("Complex MigrateBook")

    children = library.get_index("MigrateBook").nodes.children
    for position, expected_title in enumerate(["MigrateBook, Part 1", "MigrateBook, Part 2"]):
        assert children[position].full_title() == expected_title

    p1, p2, p3, p4, _p5, _p6, p7 = get_text_for_simple_text()
    expected_texts = [
        [[p1, p7, p7, p7, p7], ["", p2]],
        ["", "", p3],
        [[p4]],
    ]
    for position, expected_text in enumerate(expected_texts):
        assert TextChunk(children[position].ref(), "en", "Schema Test").text == expected_text

    # Links whose MigrateBook side falls inside the mapping should now be
    # loadable under the new refs.
    for ref_pair in [
        ['MigrateBook, Part 1 1:1', 'Guide for the Perplexed, Part 1'],
        ['MigrateBook, Part 1 2:2', 'Guide for the Perplexed, Part 1 2'],
        ['MigrateBook, Part 2 3', 'Guide for the Perplexed, Part 2 4-8'],
        ['MigrateBook, Part 3 1', 'Guide for the Perplexed, Part 3 1'],
    ]:
        assert isinstance(Link().load({'refs': ref_pair}), Link)

    unmapped_query = {
        'refs': ['MigrateBook 5:4', 'Guide for the Perplexed, Introduction, Introduction, 3'],
    }
    assert Link().load(unmapped_query) is None

    for ref_pair in [
        ['MigrateBook, Part 1 1:2-5', 'Genesis 3'],
        ['MigrateBook, Part 1 2', 'Genesis 2'],
    ]:
        assert isinstance(Link().load({'refs': ref_pair}), Link)

    sheet_sources = get_sheet_refs(TEST_SHEET_ID)
    # No sheet source may still use an old-style "MigrateBook <number>" ref.
    assert not any(re.match(r'^MigrateBook [0-9]', source) for source in sheet_sources)
    assert any('MigrateBook, Part 1' in source for source in sheet_sources)
    assert any('MigrateBook, Part 3' in source for source in sheet_sources)

    # There is no complex -> simple migration, so the index is deleted and the
    # simple text recreated. This cleanup does not run if an assertion above
    # fails, so the test is not idempotent after a failure.
    library.get_index("MigrateBook").delete()
    create_simple_text()
def test_migrate_to_complex_structure():
    """Create a depth-2 MigrateBook index, migrate it to a complex schema, verify.

    Steps: remove leftovers from earlier runs, save a simple JaggedArrayNode
    index with text and links, run schema.migrate_to_complex_structure with a
    three-entry ref mapping onto a ten-part SchemaNode, assert on the migrated
    titles, text and links, and finally delete the migrated index.
    """
    # --- cleanup of earlier runs -------------------------------------------
    try:
        library.get_index("Complex MigrateBook").delete()
        library.get_index("MigrateBook").delete()
    except BookNameError:
        pass

    leftover = Index().load({'title': 'MigrateBook'})
    if leftover is not None:
        LinkSet(Ref("MigrateBook")).delete()
        NoteSet({"ref": {"$regex": "MigrateBook.*"}}).delete()
        leftover.delete()

    # --- simple index ------------------------------------------------------
    simple_root = JaggedArrayNode()
    simple_root.add_title('MigrateBook', 'en', primary=True)
    simple_root.add_title('הספר', 'he', primary=True)
    simple_root.key = 'MigrateBook'
    simple_root.depth = 2
    simple_root.addressTypes = ["Integer", "Integer"]
    simple_root.sectionNames = ["Siman", "Paragraph"]
    simple_root.validate()

    Index({
        'schema': simple_root.serialize(),
        'title': 'MigrateBook',
        'categories': ['Other'],
    }).save()

    p1 = "This should eventually end up in MigrateBook, Part 1, 1:1"
    p2 = "This should eventually end up in MigrateBook, Part 1, 2:2"
    p3 = "This should eventually end up in MigrateBook, Part 2, 3"
    p4 = "This should eventually end up in MigrateBook, Part 3, 1"
    p5 = "This will eventually go nowhere"
    p6 = "This text is just to allow for range 3:1-5"
    p7 = "This text is for 1:2-5"

    # (ref, text) pairs, saved in this order: 1:1, then 1:2 through 1:5, then the rest.
    segments = [('MigrateBook 1:1', p1)]
    segments.extend(("MigrateBook 1:{}".format(number), p7) for number in range(2, 6))
    segments.extend([
        ("MigrateBook 2:2", p2),
        ("MigrateBook 3:3", p3),
        ("MigrateBook 3:5", p6),
        ("MigrateBook 4:1", p4),
        ("MigrateBook 5:4", p5),
    ])
    for tref, content in segments:
        chunk = TextChunk(Ref(tref), 'en', 'Schema Test')
        chunk.text = content
        chunk.save()

    seed_links = [
        ['MigrateBook 1:1', 'Guide for the Perplexed, Part 1'],
        ['MigrateBook 2:2', 'Guide for the Perplexed, Part 1 2'],
        ['MigrateBook 3:3', 'Guide for the Perplexed, Part 2 4-8'],
        ['MigrateBook 4:1', 'Guide for the Perplexed, Part 3 1'],
        ['MigrateBook 5:4', 'Guide for the Perplexed, Introduction, Introduction 3'],
        ['MigrateBook 1:2-5', 'Genesis 3'],
        ['MigrateBook 2', 'Genesis 2'],
    ]
    for ref_pair in seed_links:
        Link({'refs': ref_pair, 'type': 'None'}).save()

    VersionState("MigrateBook").refresh()

    # --- complex schema and migration --------------------------------------
    part_depths = [2, 1, 2, 1, 1, 1, 1, 1, 1, 1]
    complex_root = SchemaNode()
    complex_root.key = "MigrateBook"
    complex_root.add_title("MigrateBook", "en", primary=True)
    complex_root.add_title("הספר", "he", primary=True)
    for position, depth in enumerate(part_depths):
        part = JaggedArrayNode()
        part.add_title('Part {}'.format(position + 1), 'en', primary=True)
        part.add_title('חלק {}'.format(position + 1), 'he', primary=True)
        part.key = str(position)
        part.depth = depth
        part.addressTypes = ["Integer"] * depth
        part.sectionNames = ["Paragraph"] * depth
        complex_root.append(part)
    complex_root.validate()

    # Old simple refs -> new complex refs.
    mappings = {
        "MigrateBook 1-2": "MigrateBook, Part 1",
        "MigrateBook 3:1-3": "MigrateBook, Part 2",
        "MigrateBook 4": "MigrateBook, Part 3",
    }
    schema.migrate_to_complex_structure("MigrateBook", complex_root.serialize(), mappings)

    # --- verification ------------------------------------------------------
    children = library.get_index("MigrateBook").nodes.children
    for position, expected_title in enumerate(["MigrateBook, Part 1", "MigrateBook, Part 2"]):
        assert children[position].full_title() == expected_title

    expected_texts = [
        [[p1, p7, p7, p7, p7], ["", p2]],
        ["", "", p3],
        [[p4]],
    ]
    for position, expected_text in enumerate(expected_texts):
        assert TextChunk(children[position].ref(), "en", "Schema Test").text == expected_text

    for ref_pair in [
        ['MigrateBook, Part 1 1:1', 'Guide for the Perplexed, Part 1'],
        ['MigrateBook, Part 1 2:2', 'Guide for the Perplexed, Part 1 2'],
        ['MigrateBook, Part 2 3', 'Guide for the Perplexed, Part 2 4-8'],
        ['MigrateBook, Part 3 1', 'Guide for the Perplexed, Part 3 1'],
    ]:
        assert isinstance(Link().load({'refs': ref_pair}), Link)

    unmapped_query = {
        'refs': [
            'MigrateBook 5:4',
            'Guide for the Perplexed, Introduction, Introduction, 3',
        ],
    }
    assert Link().load(unmapped_query) is None

    for ref_pair in [
        ['MigrateBook, Part 1 1:2-5', 'Genesis 3'],
        ['MigrateBook, Part 1 2', 'Genesis 2'],
    ]:
        assert isinstance(Link().load({'refs': ref_pair}), Link)

    library.get_index("MigrateBook").delete()
if __name__ == "__main__":
    # One-off script: build a "Subject" alt-struct for Arukh HaShulchan from the
    # CSV files in the current directory, then migrate the text to a complex
    # structure.
    files = [f for f in os.listdir('.') if f.endswith(".csv")]
    nodes = []
    for f in files:
        schema = SchemaNode()
        # The CSV file name (minus extension) is used as the English title of
        # an existing index, from which the Hebrew title is looked up.
        en_title = f.replace(".csv", "")
        he_title = library.get_index(en_title).get_title('he')
        schema.add_primary_titles(en_title, he_title)

        # Use a context manager so the CSV handle is closed once its rows have
        # been consumed (previously the file object was never closed).
        with open(f) as csv_file:
            for row in UnicodeReader(csv_file):
                # Each row: Hebrew title, English title, first and last section.
                he, en, first, last = row
                node = ArrayMapNode()
                node.add_primary_titles(en, he)
                node.depth = 0
                node.wholeRef = "Arukh HaShulchan, {} {}-{}".format(en_title, first, last)
                node.refs = []
                schema.append(node)
        nodes.append(schema.serialize())

    index = get_index_api("Arukh HaShulchan", server="http://draft.sefaria.org")
    index['alt_structs'] = {"Subject": {"nodes": nodes}}
    # Posting is disabled, so the alt_structs built above are not sent anywhere.
    # post_index(index, server="http://draft.sefaria.org")

    mapping = {
        "Arukh HaShulchan 1": "Arukh HaShulchan, Orach Chaim",
        "Arukh HaShulchan 2": "Arukh HaShulchan, Yoreh Deah",
        "Arukh HaShulchan 3": "Arukh HaShulchan, Even HaEzer",
        "Arukh HaShulchan 4": "Arukh HaShulchan, Choshen Mishpat"
    }
    # NOTE(review): called with two arguments (title, mapping) - confirm this
    # matches the signature of the migrate_to_complex_structure in scope.
    migrate_to_complex_structure("Arukh HaShulchan", mapping)
def test_migrate_to_complex_structure():
    """Migrate the two-level "Crazy" index to a complex schema and verify it.

    The test removes leftovers from earlier runs, saves a depth-2
    JaggedArrayNode index named "Crazy" with two text segments and two links,
    migrates it with schema.migrate_to_complex_structure onto a SchemaNode
    with children "Trump" and "Americans", and asserts that the result is
    reachable as "Complex Crazy" with the expected titles, text and links.
    Both indexes are deleted at the end.
    """
    # Clear leftovers from a previous run. Each title gets its own try block:
    # with a single shared block, a missing "Crazy" index raised BookNameError
    # before "Complex Crazy" was attempted, leaving a stale migrated index.
    for stale_title in ("Crazy", "Complex Crazy"):
        try:
            library.get_index(stale_title).delete()
        except BookNameError:
            pass

    index = Index().load({'title': 'Crazy'})
    if index is not None:
        LinkSet(Ref("Crazy")).delete()
        NoteSet({"ref": {"$regex": "Crazy.*"}}).delete()
        index.delete()

    # Build the simple (single jagged array) index.
    root = JaggedArrayNode()
    root.add_title('Crazy', 'en', primary=True)
    root.add_title(u'משוגע', 'he', primary=True)
    root.key = 'Crazy'
    root.depth = 2
    root.addressTypes = ["Integer", "Integer"]
    root.sectionNames = ["Siman", "Paragraph"]
    root.validate()

    index = Index({
        'schema': root.serialize(),
        'title': 'Crazy',
        'categories': ['Craziness'],
    })
    index.save()

    p1 = "Gonna be a Trump tower in every city?"
    p2 = "Maybe re-naming every organ of the body to Trump?"

    for tref, content in [('Crazy 1:1', p1), ("Crazy 2:2", p2)]:
        chunk = TextChunk(Ref(tref), 'en', 'Schema Test')
        chunk.text = content
        chunk.save()

    for ref_pair in [
        ['Crazy 1:1', 'Guide for the Perplexed, Part 1'],
        ['Crazy 2:2', 'Guide for the Perplexed, Part 2'],
    ]:
        Link({'refs': ref_pair, 'type': 'None'}).save()

    # Build the complex schema: a root with two depth-1 children.
    new_schema = SchemaNode()
    new_schema.key = "Crazy"
    new_schema.add_title("Crazy", "en", primary=True)
    new_schema.add_title(u"משוגע", "he", primary=True)

    child_nodes = []
    for en_name, he_name in [('Trump', u'טראמפ'), ('Americans', u'אמרקיים')]:
        child = JaggedArrayNode()
        child.add_title(en_name, 'en', primary=True)
        child.add_title(he_name, 'he', primary=True)
        child.key = en_name
        child.depth = 1
        child.addressTypes = ["Integer"]
        child.sectionNames = ["Paragraph"]
        child_nodes.append(child)
    for child in child_nodes:
        new_schema.append(child)
    new_schema.validate()

    # Old simple refs -> new complex refs.
    mappings = {
        "Crazy 1": "Crazy, Trump",
        "Crazy 2": "Crazy, Americans",
    }

    schema.migrate_to_complex_structure("Crazy", new_schema.serialize(), mappings)

    # The migrated index must have two children named Trump and Americans;
    # check their titles, text and links.
    children = library.get_index("Complex Crazy").nodes.children
    assert children[0].full_title() == "Complex Crazy, Trump"
    assert children[1].full_title() == "Complex Crazy, Americans"

    assert TextChunk(children[0].ref(), "en", 'Schema Test').text == [p1]
    assert TextChunk(children[1].ref(), "en", "Schema Test").text == ["", p2]

    assert isinstance(Link().load({'refs': ['Complex Crazy, Trump 1', 'Guide for the Perplexed, Part 1'],}), Link)
    assert isinstance(Link().load({'refs': ['Complex Crazy, Americans 2', 'Guide for the Perplexed, Part 2'],}), Link)

    library.get_index("Complex Crazy").delete()
    library.get_index("Crazy").delete()
def test_migrate_to_complex_structure():
    """Exercise schema.migrate_to_complex_structure on a freshly built MigrateBook.

    A depth-2 simple index is saved with text and links, migrated onto a
    ten-part complex schema through a three-entry ref mapping, and the
    resulting titles, text and links are asserted. The migrated index is
    deleted at the end.
    """
    def store_segment(tref, content):
        # Save one English 'Schema Test' segment at the given ref.
        segment = TextChunk(Ref(tref), 'en', 'Schema Test')
        segment.text = content
        segment.save()

    def store_link(first_ref, second_ref):
        # Save a link of type 'None' between the two refs.
        Link({'refs': [first_ref, second_ref], 'type': 'None'}).save()

    def assert_link_exists(first_ref, second_ref):
        # The link must be loadable by exactly these two refs.
        assert isinstance(Link().load({'refs': [first_ref, second_ref]}), Link)

    # Remove whatever an earlier run may have left behind.
    try:
        library.get_index("Complex MigrateBook").delete()
        library.get_index("MigrateBook").delete()
    except BookNameError:
        pass
    existing = Index().load({'title': 'MigrateBook'})
    if existing is not None:
        LinkSet(Ref("MigrateBook")).delete()
        NoteSet({"ref": {"$regex": "MigrateBook.*"}}).delete()
        existing.delete()

    # Simple index: a single depth-2 jagged array.
    flat_root = JaggedArrayNode()
    flat_root.add_title('MigrateBook', 'en', primary=True)
    flat_root.add_title(u'הספר', 'he', primary=True)
    flat_root.key = 'MigrateBook'
    flat_root.depth = 2
    flat_root.addressTypes = ["Integer", "Integer"]
    flat_root.sectionNames = ["Siman", "Paragraph"]
    flat_root.validate()
    Index({
        'schema': flat_root.serialize(),
        'title': 'MigrateBook',
        'categories': ['Other'],
    }).save()

    p1 = "This should eventually end up in MigrateBook, Part 1, 1:1"
    p2 = "This should eventually end up in MigrateBook, Part 1, 2:2"
    p3 = "This should eventually end up in MigrateBook, Part 2, 3"
    p4 = "This should eventually end up in MigrateBook, Part 3, 1"
    p5 = "This will eventually go nowhere"
    p6 = "This text is just to allow for range 3:1-5"
    p7 = "This text is for 1:2-5"

    store_segment('MigrateBook 1:1', p1)
    for paragraph in range(2, 6):
        store_segment("MigrateBook 1:{}".format(paragraph), p7)
    store_segment("MigrateBook 2:2", p2)
    store_segment("MigrateBook 3:3", p3)
    store_segment("MigrateBook 3:5", p6)
    store_segment("MigrateBook 4:1", p4)
    store_segment("MigrateBook 5:4", p5)

    store_link('MigrateBook 1:1', 'Guide for the Perplexed, Part 1')
    store_link('MigrateBook 2:2', 'Guide for the Perplexed, Part 1 2')
    store_link('MigrateBook 3:3', 'Guide for the Perplexed, Part 2 4-8')
    store_link('MigrateBook 4:1', 'Guide for the Perplexed, Part 3 1')
    store_link('MigrateBook 5:4', 'Guide for the Perplexed, Introduction, Introduction 3')
    store_link('MigrateBook 1:2-5', 'Genesis 3')
    store_link('MigrateBook 2', 'Genesis 2')

    VersionState("MigrateBook").refresh()

    # Complex schema: ten parts with the depths listed below.
    part_depths = [2, 1, 2, 1, 1, 1, 1, 1, 1, 1]
    target_root = SchemaNode()
    target_root.key = "MigrateBook"
    target_root.add_title("MigrateBook", "en", primary=True)
    target_root.add_title(u"הספר", "he", primary=True)
    for slot, depth in enumerate(part_depths):
        part_node = JaggedArrayNode()
        part_node.add_title('Part {}'.format(slot + 1), 'en', primary=True)
        part_node.add_title(u'חלק {}'.format(slot + 1), 'he', primary=True)
        part_node.key = str(slot)
        part_node.depth = depth
        part_node.addressTypes = ["Integer"] * depth
        part_node.sectionNames = ["Paragraph"] * depth
        target_root.append(part_node)
    target_root.validate()

    # Old simple refs -> new complex refs.
    mappings = {
        "MigrateBook 1-2": "MigrateBook, Part 1",
        "MigrateBook 3:1-3": "MigrateBook, Part 2",
        "MigrateBook 4": "MigrateBook, Part 3",
    }
    schema.migrate_to_complex_structure("MigrateBook", target_root.serialize(), mappings)

    children = library.get_index("MigrateBook").nodes.children
    first_part, second_part, third_part = children[0], children[1], children[2]
    assert first_part.full_title() == "MigrateBook, Part 1"
    assert second_part.full_title() == "MigrateBook, Part 2"

    assert TextChunk(first_part.ref(), "en", 'Schema Test').text == [[p1, p7, p7, p7, p7], ["", p2]]
    assert TextChunk(second_part.ref(), "en", "Schema Test").text == ["", "", p3]
    assert TextChunk(third_part.ref(), "en", "Schema Test").text == [[p4]]

    assert_link_exists('MigrateBook, Part 1 1:1', 'Guide for the Perplexed, Part 1')
    assert_link_exists('MigrateBook, Part 1 2:2', 'Guide for the Perplexed, Part 1 2')
    assert_link_exists('MigrateBook, Part 2 3', 'Guide for the Perplexed, Part 2 4-8')
    assert_link_exists('MigrateBook, Part 3 1', 'Guide for the Perplexed, Part 3 1')
    assert Link().load({'refs': ['MigrateBook 5:4', 'Guide for the Perplexed, Introduction, Introduction, 3'],}) is None
    assert_link_exists('MigrateBook, Part 1 1:2-5', 'Genesis 3')
    assert_link_exists('MigrateBook, Part 1 2', 'Genesis 2')

    library.get_index("MigrateBook").delete()
""" old_bh_index = library.get_index(u"Ba'er Hetev on Shulchan Arukh, Even HaEzer") old_bh_alt_titles = [x for x in old_bh_index.nodes.get_titles_object() if not x.get(u'primary', False)] bh_schema_root = SchemaNode() bh_schema_root.add_primary_titles(old_bh_index.nodes.primary_title('en'), old_bh_index.nodes.primary_title('he')) default_node = JaggedArrayNode() default_node.default = True default_node.key = u'default' default_node.add_structure([u'Siman', u'Seif Katan']) bh_schema_root.append(default_node) halitzah_node = JaggedArrayNode() halitzah_node.add_primary_titles(u"Seder Halitzah", u"סדר חליצה") halitzah_node.add_structure([u"Seif Katan"]) bh_schema_root.append(halitzah_node) bh_schema_root.validate() # move all refs from 169:52 onward to seder halitzah reflist = Ref(u"Ba'er Hetev on Shulchan Arukh, Even HaEzer 169").all_segment_refs()[51:] conversion_map = {r.normal(): u"Ba'er Hetev on Shulchan Arukh, Even HaEzer, Seder Halitzah {}".format(i+1) for i, r in enumerate(reflist)} all_refs = [r.normal() for r in old_bh_index.all_segment_refs()] reg_refs = {r: r for r in all_refs if not conversion_map.get(r)} conversion_map.update(reg_refs) migrate_to_complex_structure(u"Ba'er Hetev on Shulchan Arukh, Even HaEzer", bh_schema_root.serialize(), conversion_map) new_bh_index = library.get_index(u"Ba'er Hetev on Shulchan Arukh, Even HaEzer") for title in old_bh_alt_titles: new_bh_index.nodes.add_title(**title) if os.path.exists("./output_Ba'er Hetev on Shulchan Arukh, Even HaEzer_.txt"): os.remove("./output_Ba'er Hetev on Shulchan Arukh, Even HaEzer_.txt")
mapping[orig_ref] = "{}, {}, {}".format( title, node_en, child_en) child_node.add_primary_titles(child_en, child_he) child_node.add_structure(new_structure[2:]) new_node.append(child_node) else: new_node = JaggedArrayNode() node_he, node_en = main_tag['text'].split(" / ") node_en = fix(node_en) orig_ref = "{} {}:1".format(title, main_count) mapping[orig_ref] = "{}, {}".format(title, node_en) new_node.add_primary_titles(node_en, node_he) new_node.add_structure(new_structure[2:]) root.append(new_node) root.validate() return root.serialize(), mapping if __name__ == "__main__": title = "Siddur, Edot HaMizrach" i = library.get_index(title) i.set_title("Siddur Edot HaMizrach") i.save() title = title.replace(",", "") he_title = i.get_title('he') filename = "siddur.xml" new_structure = i.schema['sectionNames'] schema, mapping = create_schema_and_map(title, he_title, filename, new_structure) migrate_to_complex_structure(title, schema, mapping)
even_haezer.key = node[0] default_node = JaggedArrayNode() default_node.key = "default" default_node.default = True default_node.add_structure(["Siman", "Paragraph"]) even_haezer.append(default_node) children = ["סדר הגט / Seder HaGet", "סדר חליצה / Seder Chalitza"] for child in children: he, en = child.split(" / ") he = he.decode('utf-8') child = JaggedArrayNode() child.add_primary_titles(en, he) child.add_structure(["Paragraph"]) even_haezer.append(child) root.append(even_haezer) else: ja_node = JaggedArrayNode() ja_node.add_primary_titles(node[0], node[1]) ja_node.key = node[0] ja_node.add_structure(["Siman", "Paragraph"]) ja_node.depth = 2 root.append(ja_node) schema = root.serialize() index = {"title": "Aruch HaShulchan", "schema": schema, "categories": ["Halakhah"]} migrate_to_complex_structure("Aruch HaShulchan", schema, mapping) aruch = library.get_index("Aruch HaShulchan") aruch.set_title("Arukh HaShulchan", "en") aruch.save()
child_en = fix(child_en) orig_ref = "{} {}:{}".format(title, main_count, child_count) mapping[orig_ref] = "{}, {}, {}".format(title, node_en, child_en) child_node.add_primary_titles(child_en, child_he) child_node.add_structure(new_structure[2:]) new_node.append(child_node) else: new_node = JaggedArrayNode() node_he, node_en = main_tag['text'].split(" / ") node_en = fix(node_en) orig_ref = "{} {}:1".format(title, main_count) mapping[orig_ref] = "{}, {}".format(title, node_en) new_node.add_primary_titles(node_en, node_he) new_node.add_structure(new_structure[2:]) root.append(new_node) root.validate() return root.serialize(), mapping if __name__ == "__main__": title = "Siddur, Edot HaMizrach" i = library.get_index(title) i.set_title("Siddur Edot HaMizrach") i.save() title = title.replace(",", "") he_title = i.get_title('he') filename = "siddur.xml" new_structure = i.schema['sectionNames'] schema, mapping = create_schema_and_map(title, he_title, filename, new_structure) migrate_to_complex_structure(title, schema, mapping)