Example #1
0
    def handle(self, *args, **options):
        self.importer = OpenScripturesImport()

        # Abort if MS has already been added (or --force not supplied)
        self.importer.abort_if_imported("SBLGNT", options["force"])

        # Download the source file
        self.importer.download_resource(SOURCE_URL)

        # Create license
        if len(License.objects.filter(url="http://www.sblgnt.com/license/")) == 0:
            License.objects.create(name="SBLGNT License", abbreviation="SBLGNT", url="http://www.sblgnt.com/license/")

        # Create Works
        if len(Work.objects.filter(osis_slug="SBLGNT")) > 0:
            self.importer.delete_work(Work.objects.get(osis_slug="SBLGNT"))
        self.importer.work1 = Work(
            # id           = WORK1_ID,
            title="SBL Greek New Testament",
            language=Language("grc"),
            type="Bible",
            osis_slug="SBLGNT",
            publisher="Logos",
            publish_date=datetime.date(2010, 10, 28),
            import_date=datetime.datetime.now(),
            creator="Michael W. Holmes",
            source_url=SOURCE_URL,
            license=License.objects.get(url="http://www.sblgnt.com/license/"),
        )
        self.importer.work1.save()

        WorkServer.objects.create(work=self.importer.work1, server=Server.objects.get(is_self=True))

        # Get the subset of OSIS book codes provided on command line
        # limited_book_codes = []
        # for arg in args:
        # id_parts = arg.split(".")
        # if id_parts[0] in osis.BOOK_ORDERS["Bible"]["KJV"]:
        # limited_book_codes.append(id_parts[0])
        # book_codes = osis.BOOK_ORDERS["Bible"]["KJV"]
        # if len(limited_book_codes) > 0:
        # book_codes = limited_book_codes
        # self.importer.book_codes = book_codes
        self.importer.book_codes = osis.BOOK_ORDERS["Bible"]["KJV"]

        # Initialize the parser and set it up
        self.parser = xml.sax.make_parser()
        self.parser.setContentHandler(SBLGNTParser(self.importer))
        _zip = zipfile.ZipFile(os.path.basename(SOURCE_URL))
        self.parser.parse(StringIO.StringIO(_zip.read("sblgnt.xml")))
        print "Total tokens %d" % self.importer.tokenCount
        print "Total structures: %d" % self.importer.structCount
Example #2
0
    def handle(self, *args, **options):
        self.importer = OpenScripturesImport()

        # Abort if MS has already been added (or --force not supplied)
        self.importer.abort_if_imported("KJV", options["force"])

        # Download the source file
        self.importer.download_resource(SOURCE_URL)

        # Create Works
        if len(Work.objects.filter(osis_slug="KJV")) > 0:
            self.importer.delete_work(Work.objects.get(osis_slug="KJV"))
        self.importer.work1 = Work(
            # id           = WORK1_ID,
            title="King James Version (1769)",
            language=Language("eng"),
            type="Bible",
            osis_slug="KJV",
            publisher="Crosswire",
            publish_date=datetime.date(2006, 01, 01),
            import_date=datetime.datetime.now(),
            # creator      = "Michael W. Holmes",
            source_url=SOURCE_URL,
            license=License.objects.get(url="http://creativecommons.org/licenses/publicdomain/"),
        )
        self.importer.work1.save()

        WorkServer.objects.create(work=self.importer.work1, server=Server.objects.get(is_self=True))

        # Get the subset of OSIS book codes provided on command line
        # limited_book_codes = []
        # for arg in args:
        # id_parts = arg.split(".")
        # if id_parts[0] in osis.BOOK_ORDERS["Bible"]["KJV"]:
        # limited_book_codes.append(id_parts[0])
        # book_codes = osis.BOOK_ORDERS["Bible"]["KJV"]
        # if len(limited_book_codes) > 0:
        # book_codes = limited_book_codes
        self.importer.book_codes = osis.BOOK_ORDERS["Bible"]["KJV"]

        # Initialize the parser and set it up
        self.parser = xml.sax.make_parser()
        self.parser.setContentHandler(KJVParser(self.importer))
        _zip = zipfile.ZipFile(os.path.basename(SOURCE_URL))
        self.parser.parse(StringIO.StringIO(_zip.read("kjvlite.xml")))
        print "Total tokens %d" % self.importer.tokenCount
        print "Total structures: %d" % self.importer.structCount
Example #3
0
    def handle(self, *args, **options):
        """Import the Tischendorf 8th ed. Greek NT (Qere edition) as a Work.

        Unlike the XML-based importers, this reads one plain-text file per
        book out of the downloaded zip and parses it line by line with
        LINE_PARSER, emitting chapter/verse/paragraph structures and word,
        punctuation, and whitespace tokens through the importer.
        Positional args may name OSIS ids (e.g. "Matt.1.1") to limit the
        import to those books; pass --force to re-import.
        """
        importer = OpenScripturesImport()

        # Abort if MS has already been added (or --force not supplied)
        importer.abort_if_imported("Tischendorf", options["force"])

        # Download the source file
        importer.download_resource(SOURCE_URL)

        # Delete any previously imported copy of this work.
        # .exists() issues a cheap EXISTS query instead of materializing rows.
        if Work.objects.filter(osis_slug="Tischendorf").exists():
            importer.delete_work(Work.objects.get(osis_slug="Tischendorf"))

        # Work for Qere edition (Kethiv is base text)
        importer.work1 = Work(
            title        = "Tischendorf 8th ed. v2.6 Qere (Corrected)",
            language     = Language('grc'),
            type         = 'Bible',
            osis_slug    = 'Tischendorf',
            publish_date = datetime.date(2010, 7, 4),
            import_date  = datetime.datetime.now(),
            creator      = "<a href='http://en.wikipedia.org/wiki/Constantin_von_Tischendorf' title='Constantin von Tischendorf @ Wikipedia'>Constantin von Tischendorf</a>. Based on G. Clint Yale's Tischendorf text and on Dr. Maurice A. Robinson's Public Domain Westcott-Hort text. Edited by <a href='http://www.hum.aau.dk/~ulrikp/'>Ulrik Sandborg-Petersen</a>.",
            source_url   = SOURCE_URL,
            license      = License.objects.get(url="http://creativecommons.org/licenses/publicdomain/")
        )
        importer.work1.save()
        WorkServer.objects.create(
            work = importer.work1,
            server = Server.objects.get(is_self = True)
        )

        # Get the subset of OSIS book codes provided on command line;
        # only the book part of each OSIS id is honored.
        limited_book_codes = []
        for arg in args:
            id_parts = arg.split(".")
            if id_parts[0] in osis.BOOK_ORDERS["Bible"]["KJV"]:
                limited_book_codes.append(id_parts[0])
        importer.book_codes = osis.BOOK_ORDERS["Bible"]["KJV"]
        if len(limited_book_codes) > 0:
            importer.book_codes = limited_book_codes

        # Read each of the Book files out of the zip (sits in CWD by basename).
        _zip = zipfile.ZipFile(os.path.basename(SOURCE_URL))
        for book_code in importer.book_codes:
            # Skip books the source archive has no file for
            # (has_key is deprecated; `in` is the modern spelling).
            if book_code not in BOOK_FILENAME_LOOKUP:
                continue

            importer.current_book = book_code
            importer.create_book_struct()

            lineNumber = -1

            # Each book opens with an initial paragraph structure.
            importer.create_paragraph()

            for line in StringIO.StringIO(_zip.read("Tischendorf-2.6/Unicode/" + BOOK_FILENAME_LOOKUP[book_code])):
                in_paragraph = 0
                lineNumber += 1
                # NFC-normalize so combining Greek diacritics compare
                # consistently before regex matching.
                lineMatches = LINE_PARSER.match(unicodedata.normalize("NFC", unicode(line, 'utf-8')))
                if lineMatches is None:
                    print(" -- Warning: Unable to parse line: %s" % line)
                    continue

                # New Chapter start
                if lineMatches.group('chapter') != importer.current_chapter:
                    # End the previous chapter
                    importer.close_structure('chapter')

                    # Start the next chapter
                    importer.current_chapter = lineMatches.group('chapter')
                    importer.create_chapter_struct()

                # New Verse start
                if lineMatches.group('verse') != importer.current_verse:
                    # End the previous verse
                    importer.close_structure('verse')

                    # Start the next verse
                    importer.current_verse = lineMatches.group('verse')
                    importer.create_verse_struct()

                # A 'P' break ends the current paragraph and opens a new one.
                if lineMatches.group('break') == 'P':
                    importer.create_paragraph()
                    in_paragraph = 1

                # Separate consecutive tokens within a paragraph with
                # whitespace (but not at a paragraph start or book start).
                if not in_paragraph and len(importer.bookTokens) > 0:
                    importer.create_whitespace_token()

                # Open UNCERTAIN1 bracket ("[" in the source text)
                if lineMatches.group("qereStartBracket"):
                    importer.create_uncertain()

                importer.create_token(lineMatches.group('qere'))
                # Make sure that structures only start on words
                importer.link_start_tokens()

                # Make this token the start of the UNCERTAIN structure
                if lineMatches.group('qereStartBracket'):
                    importer.structs['doubted'].start_token = importer.bookTokens[-1]

                # Punctuation token
                if lineMatches.group('qerePunc'):
                    importer.create_punct_token(lineMatches.group('qerePunc'))

                # Close UNCERTAIN1 bracket
                if lineMatches.group('qereEndBracket'):
                    # Internal-consistency check only (stripped under -O);
                    # a close bracket must follow an open bracket.
                    assert 'doubted' in importer.structs
                    print("### CLOSE BRACKET")

                    importer.structs['doubted'].end_token = importer.bookTokens[-1]

                    # Make end_marker for UNCERTAIN1
                    importer.create_punct_token("]")
                    # Close the UNCERTAIN1 structure
                    importer.structs['doubted'].end_marker = importer.bookTokens[-1]
                    importer.close_structure('doubted')

            # Close every structure still open at end of book. Snapshot
            # the keys with list() since close_structure presumably
            # removes entries from importer.structs — TODO confirm.
            for structElement in list(importer.structs.keys()):
                importer.close_structure(structElement)

            importer.bookTokens = []

        print("structCount: %s" % str(importer.structCount))
        print("tokenCount:  %s" % str(importer.tokenCount))