class Command(BaseCommand):
    """Load the SBL Greek New Testament (SBLGNT) into the database.

    Downloads the zipped source from SOURCE_URL, (re)creates the SBLGNT
    License and Work records, then SAX-parses "sblgnt.xml" from the zip
    with SBLGNTParser to create the tokens and structures.
    """

    # Not implementing selecting books right now
    # args = '<Jude John ...>'
    # help = 'Limits the scope of the load to just to the books specified.'
    option_list = BaseCommand.option_list + (
        make_option(
            "--force",
            action="store_true",
            dest="force",
            default=False,
            help="Force load despite it already being loaded",
        ),
    )

    ### Main command handle below
    def handle(self, *args, **options):
        self.importer = OpenScripturesImport()

        # Abort if MS has already been added (or --force not supplied)
        self.importer.abort_if_imported("SBLGNT", options["force"])

        # Download the source file
        self.importer.download_resource(SOURCE_URL)

        # Create license. Use QuerySet.exists() instead of len(filter(...)):
        # exists() issues a cheap EXISTS query rather than fetching every row.
        if not License.objects.filter(url="http://www.sblgnt.com/license/").exists():
            License.objects.create(
                name="SBLGNT License",
                abbreviation="SBLGNT",
                url="http://www.sblgnt.com/license/",
            )

        # Create Works: drop any previous import of the same work first
        if Work.objects.filter(osis_slug="SBLGNT").exists():
            self.importer.delete_work(Work.objects.get(osis_slug="SBLGNT"))
        self.importer.work1 = Work(
            # id = WORK1_ID,
            title="SBL Greek New Testament",
            language=Language("grc"),
            type="Bible",
            osis_slug="SBLGNT",
            publisher="Logos",
            publish_date=datetime.date(2010, 10, 28),
            import_date=datetime.datetime.now(),
            creator="Michael W. Holmes",
            source_url=SOURCE_URL,
            license=License.objects.get(url="http://www.sblgnt.com/license/"),
        )
        self.importer.work1.save()
        WorkServer.objects.create(work=self.importer.work1, server=Server.objects.get(is_self=True))

        # Get the subset of OSIS book codes provided on command line
        # limited_book_codes = []
        # for arg in args:
        #     id_parts = arg.split(".")
        #     if id_parts[0] in osis.BOOK_ORDERS["Bible"]["KJV"]:
        #         limited_book_codes.append(id_parts[0])
        # book_codes = osis.BOOK_ORDERS["Bible"]["KJV"]
        # if len(limited_book_codes) > 0:
        #     book_codes = limited_book_codes
        # self.importer.book_codes = book_codes
        self.importer.book_codes = osis.BOOK_ORDERS["Bible"]["KJV"]

        # Initialize the parser and set it up
        self.parser = xml.sax.make_parser()
        self.parser.setContentHandler(SBLGNTParser(self.importer))

        # NOTE(review): assumes download_resource() saved the zip into the CWD
        _zip = zipfile.ZipFile(os.path.basename(SOURCE_URL))
        self.parser.parse(StringIO.StringIO(_zip.read("sblgnt.xml")))

        # Single-argument print(...) call form prints identically on Python 2
        # and stays valid under Python 3.
        print("Total tokens %d" % self.importer.tokenCount)
        print("Total structures: %d" % self.importer.structCount)
def handle(self, *args, **options):
    """Import the Tischendorf 8th ed. v2.6 Greek NT from the downloaded zip.

    Deletes any previously imported "Tischendorf" Work, recreates it, then
    parses each book file line-by-line with LINE_PARSER, emitting book /
    chapter / verse / paragraph structures plus word, whitespace and
    punctuation tokens via the OpenScripturesImport helper.

    Positional args may be OSIS ids (e.g. "Jude" or "John.1"); only their
    book-code prefix is used to limit which books are imported.
    """
    importer = OpenScripturesImport()
    # Abort if MS has already been added (or --force not supplied)
    importer.abort_if_imported("Tischendorf", options["force"])

    # Download the source file
    importer.download_resource(SOURCE_URL)

    # Create Works
    # Delete existing works
    if len(Work.objects.filter(osis_slug="Tischendorf")) > 0:
        importer.delete_work(Work.objects.get(osis_slug="Tischendorf"))

    # Work for Qere edition (Kethiv is base text)
    importer.work1 = Work(
        title = "Tischendorf 8th ed. v2.6 Qere (Corrected)",
        language = Language('grc'),
        type = 'Bible',
        osis_slug = 'Tischendorf',
        publish_date = datetime.date(2010, 7, 4),
        import_date = datetime.datetime.now(),
        #variant_bit = WORK2_VARIANT_BIT,
        #variants_for_work = work1,
        creator = "<a href='http://en.wikipedia.org/wiki/Constantin_von_Tischendorf' title='Constantin von Tischendorf @ Wikipedia'>Constantin von Tischendorf</a>. Based on G. Clint Yale's Tischendorf text and on Dr. Maurice A. Robinson's Public Domain Westcott-Hort text. Edited by <a href='http://www.hum.aau.dk/~ulrikp/'>Ulrik Sandborg-Petersen</a>.",
        source_url = SOURCE_URL,
        license = License.objects.get(url="http://creativecommons.org/licenses/publicdomain/")
    )
    importer.work1.save()
    WorkServer.objects.create(
        work = importer.work1,
        server = Server.objects.get(is_self = True)
    )

    # Get the subset of OSIS book codes provided on command line
    limited_book_codes = []
    for arg in args:
        id_parts = arg.split(".")
        if id_parts[0] in osis.BOOK_ORDERS["Bible"]["KJV"]:
            limited_book_codes.append(id_parts[0])
    # Default to the full KJV book order; restrict only if args matched
    importer.book_codes = osis.BOOK_ORDERS["Bible"]["KJV"]
    if len(limited_book_codes) > 0:
        importer.book_codes = limited_book_codes

    # Read each of the Book files
    # NOTE(review): assumes download_resource() saved the zip into the CWD
    _zip = zipfile.ZipFile(os.path.basename(SOURCE_URL))
    for book_code in importer.book_codes:
        if not BOOK_FILENAME_LOOKUP.has_key(book_code):
            continue
        importer.current_book = book_code
        importer.create_book_struct()
        lineNumber = -1
        importer.create_paragraph()
        for line in StringIO.StringIO(_zip.read("Tischendorf-2.6/Unicode/" + BOOK_FILENAME_LOOKUP[book_code])):
            in_paragraph = 0
            lineNumber += 1
            # Normalize to NFC before matching; each data line yields one
            # word token (plus optional punctuation token below)
            lineMatches = LINE_PARSER.match(unicodedata.normalize("NFC", unicode(line, 'utf-8')))
            if lineMatches is None:
                print(" -- Warning: Unable to parse line: %s" % line)
                continue

            # Skip verses we're not importing right now
            #verse_osisid = book_code + "." + lineMatches.group('chapter') + "." + lineMatches.group('verse')
            #if len(limited_osis_ids) and len(grep(verse_osisid, limited_osis_ids)) != 0:
            #    continue

            # New Chapter start
            if lineMatches.group('chapter') != importer.current_chapter:
                # End the previous chapter
                importer.close_structure('chapter')
                # Start the next chapter
                importer.current_chapter = lineMatches.group('chapter')
                importer.create_chapter_struct()

            # New Verse start
            if lineMatches.group('verse') != importer.current_verse:
                # End the previous verse
                importer.close_structure('verse')
                # Start the next verse
                importer.current_verse = lineMatches.group('verse')
                importer.create_verse_struct()

            # End paragraph
            if lineMatches.group('break') == 'P':
                importer.create_paragraph()
                in_paragraph = 1

            # Separate consecutive tokens with whitespace unless a paragraph
            # break was just emitted or this is the book's first token
            if not in_paragraph and len(importer.bookTokens) > 0:
                importer.create_whitespace_token()

            #assert(lineMatches.group('kethivPunc') == lineMatches.group('qerePunc'))
            #assert(lineMatches.group('kethivStartBracket') == lineMatches.group('qereStartBracket'))
            #assert(lineMatches.group('kethivEndBracket') == lineMatches.group('qereEndBracket'))
            #if string.find(line, '[') != -1 or string.find(line, ']') != -1 or lineMatches.group('kethiv') != lineMatches.group('qere'):
            #    print line
            #continue

            # Open UNCERTAIN1 bracket
            if lineMatches.group("qereStartBracket"):
                importer.create_uncertain()

            importer.create_token(lineMatches.group('qere'))
            # Make sure that structures only start on words
            importer.link_start_tokens()

            # Make this token the start of the UNCERTAIN structure
            if lineMatches.group('qereStartBracket'):
                importer.structs['doubted'].start_token = importer.bookTokens[-1]

            # Qere token
            #if lineMatches.group('kethiv') != lineMatches.group('qere'):
            #    print("%s != %s" % (lineMatches.group('kethiv'), lineMatches.group('qere')))
            #    token_work2 = Token(
            #        id = str(tokenCount),
            #        data = lineMatches.group('qere'),
            #        type = Token.WORD,
            #        work = work,
            #        position = tokenCount, #token_work1.position #should this be the same!?
            #        variant_bits = WORK2_VARIANT_BIT,
            #        relative_source_url = "#line(%d)" % lineNumber
            #        # What will happen with range?? end_token = work1, but then work2?
            #        # Having two tokens at the same position could mean that they are
            #        # co-variants at that one spot. But then we can't reliably get
            #        # tokens by a range? Also, the position can indicate transposition?
            #    )
            #    tokenCount += 1
            #    token_work2.save()
            #    lineTokens.append(token_work2)

            # Punctuation token
            #assert(lineMatches.group('kethivPunc') == lineMatches.group('qerePunc'))
            if lineMatches.group('qerePunc'):
                importer.create_punct_token(lineMatches.group('qerePunc'))

            # Close UNCERTAIN1 bracket
            #assert(lineMatches.group('kethivEndBracket') == lineMatches.group('qereEndBracket'))
            if lineMatches.group('qereEndBracket'):
                assert(importer.structs.has_key('doubted'))
                print("### CLOSE BRACKET")
                importer.structs['doubted'].end_token = importer.bookTokens[-1]
                # Make end_marker for UNCERTAIN1
                importer.create_punct_token("]")
                # Close the UNCERTAIN1 structure
                importer.structs['doubted'].end_marker = importer.bookTokens[-1]
                importer.close_structure('doubted')

        # End of book: close any structures still open, reset token buffer
        for structElement in importer.structs.keys():
            importer.close_structure(structElement)
        importer.bookTokens = []

    print("structCount: %s" % str(importer.structCount))
    print("tokenCount: %s" % str(importer.tokenCount))