def __init__(self, config):
    """Connect to the production DBP database and record, per table,
    the primary-key columns and data columns used for comparison."""
    self.config = config
    #self.prod_db = "dbp_only"
    self.prod_db = "dbp"
    self.test_db = "valid_dbp"
    self.db = SQLUtility("localhost", 3306, "root", self.prod_db)
    # {table_name: [primary_key_columns, data_columns]}
    self.tables = {
        "bibles": [["id"],
                   ["language_id", "versification", "numeral_system_id",
                    "date", "scope", "script", "derived", "copyright",
                    "priority", "reviewed", "notes"]],
        #"bible_filesets": [["hash_id"], ["id", "asset_id", "set_type_code", "hidden"]]
        # check set_size_code after bible_files
        "bible_fileset_connections": [[], []],
        "bible_translations": [[], []],
        "bible_books": [["bible_id", "book_id"],
                        ["name", "name_short", "chapters"]],
        "bible_fileset_copyrights": [[], []],
        "bible_fileset_copyright_organizations": [[], []],
        "bible_fileset_tags": [[], []],
        "access_group_filesets": [["access_group_id", "hash_id"], []],
        "bible_files": [["hash_id", "book_id", "chapter_start", "verse_start"],
                        ["chapter_end", "verse_end", "file_name",
                         "file_size", "duration"]],
        "bible_file_tags": (("file_id", "tag"), ("value", "admin_only")),
    }
def __init__(self):
    """Connect to the timing database and pre-compile the regexes used
    to extract timing data from the source files."""
    self.db = SQLUtility(TIM_HOST, TIM_PORT, TIM_USER, TIM_DB_NAME)
    # captures the payload of a <filename src ...>...</filename> element
    self.xmlRegex = re.compile(r"<filename src.*>(.*)</filename>")
    # captures (book-code, chapter) from names like XXX-01-GEN-001-timing.txt
    self.filenameRegex = re.compile(
        r"[A-Z0-9]+-[0-9]+-([A-Z1-4]+)-([0-9]+)-timing.txt")
    # captures (begin, end, verse-number, optional verse-letter) per line
    self.timingRegex = re.compile(
        r"([0-9\.]+)\t([0-9\.]+)\t([0-9]+)([a-z]?)")
class BibleBooksTable:
    """Derives bible_books column values from the filenames found in the bucket."""

    def __init__(self, config):
        self.config = config
        self.validDB = SQLUtility(config.database_host, config.database_port,
                                  config.database_user, config.database_output_db_name)
        self.filesetList = self.validDB.select(
            "SELECT id, set_type_code, hash_id FROM bible_filesets", None)
        self.bookIdList = self.validDB.select("SELECT id FROM books", None)
        bucket = BucketReader(config)
        self.filenames = bucket.filenames2()

    def bibleId(self):
        """Always None; the bible id is not derived from filenames here."""
        return None

    def bookId(self, typeCode, fileName):
        """Return the book code for an audio filename; None otherwise.
        The tex/vid handling below is disabled (dead code inside a string)."""
        bookCode = None
        if typeCode == "aud":
            bookCode = fileName[0:5].strip('_')
        """
        elif typeCode == "tex":
            parts = fileName.split('.')[0].split('_')
            if len(parts) > 2:
                bookCode = parts[-2]
        elif typeCode == "vid":
            parts = filename.split("_")
            for part in parts:
                if part.count('-') == 2:
                    ### this looks really bad, copied from main.py
                    parts = part.split("-")
                    if part in ["MAT", "MRK", "LUK", "JHN"]:
                        bookCode = part
        else:
            sys.exit()
        """
        return bookCode

    def name(self, typeCode, filename):
        """Return the chapter field parsed from the filename for the
        given type code; exits on an unknown type code."""
        chapter = None
        if typeCode == "aud":
            chapter = filename[5:9].strip('_')
        elif typeCode == "tex":
            parts = filename.split('.')[0].split('_')
            if len(parts) > 2:
                if parts[-1].isdigit():
                    chapter = parts[-1]
        elif typeCode == "vid":
            chapter = None  # TBD
        else:
            sys.exit()
        return chapter

    def nameShort(self, typeCode, filename):
        """Not derivable from filenames; always None."""
        return None

    def chapters(self, typeCode, filename):
        """Not derivable from filenames; always None."""
        return None
def __init__(self, config):
    """Cache the OT and NT book-id sets and start an empty statement list."""
    self.db = SQLUtility(config)
    self.OT = self.db.selectSet(
        "SELECT id FROM books WHERE book_testament = 'OT'", ())
    self.NT = self.db.selectSet(
        "SELECT id FROM books WHERE book_testament = 'NT'", ())
    self.statements = []
def __init__(self):
    """Connect to the production views database and record, per view,
    its key columns and data columns as a (keys, columns) tuple pair."""
    #self.config = config
    self.prod_db = "dbp_views"
    self.test_db = "valid_dbp_views"
    self.db = SQLUtility("localhost", 3306, "root", self.prod_db)
    # {view_name: (key_columns, data_columns)}
    self.tables = {
        "bibles_view": (
            ("fileset_id", "bible_id"),
            ("language_id", "versification", "numeral_system_id", "date",
             "scope", "script", "derived", "copyright", "priority",
             "reviewed", "notes")),
        "bible_filesets_view": (
            ("fileset_id", "asset_id", "set_type_code"),
            ("set_size_code", "hidden")),
        "bible_fileset_copyrights_view": (
            ("fileset_id", "asset_id", "set_type_code"),
            ("copyright_date", "copyright", "copyright_description",
             "open_access")),
        "bible_fileset_copyright_organizations_view": (
            ("fileset_id", "asset_id", "set_type_code", "organization_id"),
            ("organization_role",)),
        "bible_fileset_tags_view": (
            ("fileset_id", "asset_id", "set_type_code", "name", "language_id"),
            ("description", "admin_only", "notes", "iso")),
        "access_group_filesets_view": (
            ("fileset_id", "asset_id", "set_type_code", "access_group_id"),
            ()),
        "bible_files_view": (
            ("fileset_id", "asset_id", "set_type_code", "book_id",
             "chapter_start", "verse_start"),
            ("chapter_end", "verse_end", "file_name", "file_size",
             "duration")),
        "bible_file_tags_view": (
            ("fileset_id", "asset_id", "set_type_code", "book_id",
             "chapter_start", "verse_start", "tag"),
            ("value", "admin_only")),
        "bible_file_video_resolutions_view": (
            ("fileset_id", "asset_id", "set_type_code", "file_name"),
            ("bandwidth", "resolution_width", "resolution_height",
             "codec", "stream")),
        "bible_file_video_transport_stream_view": (
            ("fileset_id", "asset_id", "set_type_code", "file_name"),
            ("runtime",)),
    }
def __init__(self, config):
    """Load fileset and book-id lists from the validation database and
    the filename list from the bucket."""
    self.config = config
    self.validDB = SQLUtility(config.database_host, config.database_port,
                              config.database_user, config.database_output_db_name)
    self.filesetList = self.validDB.select(
        "SELECT id, set_type_code, hash_id FROM bible_filesets", None)
    self.bookIdList = self.validDB.select("SELECT id FROM books", None)
    bucket = BucketReader(config)
    self.filenames = bucket.filenames2()
def getBookNameEnglish(self, bookId):
    """Return the English name of *bookId*, loading the name map lazily
    on the first call; "Unknown" when the book has no English entry."""
    if self.bookNamesEnglish is None:  # was ==; identity test for None
        db = SQLUtility(self.config)
        sql = (
            "SELECT book_id, name FROM book_translations WHERE language_id IN"
            " (SELECT id FROM languages WHERE iso=%s)")
        # ("eng",) is a 1-tuple; the original ("eng") was a bare string
        self.bookNamesEnglish = db.selectMap(sql, ("eng",))
    return self.bookNamesEnglish.get(bookId, "Unknown")
def __init__(self, config, dbOut, lptsReader):
    """Keep shared connections/readers plus per-run bookkeeping state."""
    self.config = config
    self.db = SQLUtility(config)
    self.dbOut = dbOut
    self.lptsReader = lptsReader
    self.updateCounts = {}
    self.hashIdMap = None  # populated lazily elsewhere
    self.sqlLog = []
    # captures (uppercase/digit id, lowercase word, trailing digits)
    self.audioRegex = re.compile("([A-Z0-9]+)-([a-z]+)([0-9]+)")
def filesetIds(self, typeCode):
    """Return the distinct fileset ids in bucket_listing for *typeCode*."""
    db = SQLUtility(self.config.database_host, self.config.database_port,
                    self.config.database_user, self.config.database_output_db_name)
    ids = db.selectList(
        "SELECT distinct fileset_id FROM bucket_listing WHERE type_code = %s",
        (typeCode,))  # 1-tuple; the original (typeCode) was a bare string
    db.close()
    return ids
def __init__(self, config):
    """Connect to dbp, print a sanity-check bibles row count, and prepare
    the accumulator sets used while scanning copyright text."""
    self.db = SQLUtility(config)
    self.db.execute("use dbp", ())
    count = self.db.selectScalar("select count(*) from bibles", ())
    print("count", count)
    self.textCopyrightSet = set()
    self.audioCopyrightSet = set()
    self.videoCopyrightSet = set()
    self.licensorSet = set()
    self.coLicensorSet = set()
    # regex character class of punctuation/digits stripped from copyright text
    self.stripChars = r"[.,©℗\d/\-_\(\)]"
def legacyFilesetMap(self):
    """Lazily load and cache the map of fileset id + set_type_code to
    asset ids from the production bible_filesets table."""
    if self.filesetMap is None:  # was ==; identity test for None
        prodDB = SQLUtility(self.config.database_host, self.config.database_port,
                            self.config.database_user, self.config.database_input_db_name)
        self.filesetMap = prodDB.selectMapList(
            "SELECT concat(id, set_type_code), asset_id FROM dbp.bible_filesets",
            None)
        prodDB.close()
    return self.filesetMap
def filenames(self, typeCode):
    """Return {fileset_id: [file_name, ...]} for all bucket_listing rows
    of the given type_code."""
    db = SQLUtility(self.config.database_host, self.config.database_port,
                    self.config.database_user, self.config.database_output_db_name)
    #Can I control the order of the files?
    #filenameList = db.selectMapList("SELECT fileset_id, file_name FROM bucket_listing WHERE length(file_name) > 16 and type_code = %s", (typeCode))
    filenameList = db.selectMapList(
        "SELECT fileset_id, file_name FROM bucket_listing WHERE type_code = %s",
        (typeCode,))  # 1-tuple; the original (typeCode) was a bare string
    db.close()
    print("num in bucket %s" % (len(filenameList.keys())))
    return filenameList
def createAnalysisDBP(self):
    """Populate the 'dbp' analysis table from the production fileset tables."""
    self.privateCreateTable("dbp")
    sql = ("SELECT f.id, c.bible_id, f.set_type_code, f.asset_id"
           " FROM bible_filesets f, bible_fileset_connections c"
           " WHERE f.hash_id = c.hash_id")
    # was the bare module global `config`; use the instance's config as
    # the rest of this class does
    db = SQLUtility(self.config.database_host, self.config.database_port,
                    self.config.database_user, "dbp")
    results = db.select(sql, None)
    db.close()
    print("num %d insert in dbp" % (len(results)))
    self.out.executeBatch("INSERT INTO dbp VALUES (%s, %s, %s, %s)", results)
def __init__(self, typeCode):
    """Load every bucket file name of the given type_code from valid_dbp."""
    db = SQLUtility("localhost", 3306, "root", "valid_dbp")
    #self.chapterMap = db.selectMap("SELECT id, chapters FROM books", None)
    #self.usfx2Map = db.selectMap("SELECT id_usfx, id FROM books", None)
    #self.usfx2Map['J1'] = '1JN' ## fix incorrect entry in books table
    #sql = ("SELECT concat(type_code, '/', bible_id, '/', fileset_id), file_name"
    #       " FROM bucket_listing where type_code=%s limit 1000000000")
    #sqlTest = ("SELECT concat(type_code, '/', bible_id, '/', fileset_id), file_name"
    #           " FROM bucket_listing where type_code=%s AND bible_id='PORERV'")
    #filenamesMap = db.selectMapList(sql, (typeCode))
    self.filenameList = db.selectList(
        "SELECT file_name FROM bucket_listing WHERE type_code=%s limit 10000000",
        (typeCode,))  # 1-tuple; the original (typeCode) was a bare string
    db.close()
def process(self):
    """Dump bible_verses coordinates for dbp-prod filesets into the
    file bible_verses.text, one 'verse/...' path per line."""
    db = SQLUtility("localhost", 3306, "root", "dbp")
    sql = (
        "SELECT c.bible_id, f.id, v.book_id, v.chapter, v.verse_start, v.verse_end"
        + " FROM bible_verses v, bible_filesets f, bible_fileset_connections c"
        + " WHERE v.hash_id = f.hash_id"
        + " AND v.hash_id = c.hash_id"
        + " AND f.hash_id = c.hash_id"
        + " AND f.asset_id = 'dbp-prod'")
    resultSet = db.select(sql, None)
    db.close()
    print("num verses %d" % (len(resultSet)))
    # `with` guarantees the file is closed even if a write fails
    with io.open("bible_verses.text", mode="w", encoding="utf-8") as out:
        for row in resultSet:
            out.write("verse/%s/%s/%s_%s_%s_%s.text\n" %
                      (row[0], row[1], row[2], row[3], row[4], row[5]))
def processFileset(self, inputFileset):
    """Insert or refresh the DBP tables for one input fileset and return
    the hash_id of the fileset row that was written."""
    inp = inputFileset
    print(inp.typeCode, inp.bibleId, inp.filesetId)
    dbConn = SQLUtility(self.config)
    bookIdSet = self.getBibleBooks(inp.typeCode, inp.csvFilename, inp.databasePath)
    secondaryUpdater = UpdateDBPBibleFilesSecondary(self.config, dbConn, self.dbOut)
    if inp.typeCode in {"audio", "video"}:
        setTypeCode = UpdateDBPFilesetTables.getSetTypeCode(inp.typeCode, inp.filesetId)
        hashId = self.insertBibleFileset(dbConn, inp.typeCode, setTypeCode,
                                         inp.bibleId, inp.filesetId, bookIdSet)
        self.insertFilesetConnections(dbConn, hashId, inp.bibleId)
        self.insertBibleFiles(dbConn, hashId, inputFileset, bookIdSet)
        secondaryUpdater.updateBibleFilesSecondary(hashId, inp)
    elif inp.typeCode == "text":
        if inp.subTypeCode() == "text_usx":
            hashId = self.insertBibleFileset(dbConn, inp.typeCode, "text_usx",
                                             inp.bibleId, inp.filesetId, bookIdSet)
            self.insertFilesetConnections(dbConn, hashId, inp.bibleId)
            self.insertBibleFiles(dbConn, hashId, inputFileset, bookIdSet)
            secondaryUpdater.updateBibleFilesSecondary(hashId, inp)
        elif inp.subTypeCode() == "text_format":
            hashId = self.insertBibleFileset(dbConn, inp.typeCode, "text_plain",
                                             inp.bibleId, inp.filesetId, bookIdSet)
            self.insertFilesetConnections(dbConn, hashId, inp.bibleId)
            self.textUpdater.updateFileset(inp.bibleId, inp.filesetId, hashId,
                                           bookIdSet, inp.databasePath)
        ## Future code for text_html
        #hashId = self.insertBibleFileset(inp.typeCode, "text_format", inp.bibleId, inp.filesetId, bookIdSet)
        #self.insertFilesetConnections(hashId, inp.bibleId)
        #self.textUpdater.updateFileset(inp.bibleId, inp.filesetId, hashId, bookIdSet, inp.databasePath)
        if inp.subTypeCode() != "text_usx":
            tocBooks = self.booksUpdater.getTableOfContents(
                inp.typeCode, inp.bibleId, inp.filesetId,
                inp.csvFilename, inp.databasePath)
            self.booksUpdater.updateBibleBooks(inp.typeCode, inp.bibleId, tocBooks)
    dbConn.close()
    return hashId
def process(self, typeCode):
    """Parse every bucket filename of *typeCode* against the known
    templates, collecting parsed and unparsed fileset prefixes."""
    db = SQLUtility("localhost", 3306, "root", "valid_dbp")
    self.chapterMap = db.selectMap("SELECT id, chapters FROM books", None)
    self.usfx2Map = db.selectMap("SELECT id_usfx, id FROM books", None)
    self.usfx2Map['J1'] = '1JN'  ## fix incorrect entry in books table
    sql = (
        "SELECT concat(type_code, '/', bible_id, '/', fileset_id), file_name"
        + " FROM bucket_listing where type_code=%s limit 1000000000")
    # debug variant restricted to a single bible, kept for reference:
    #sqlTest = ("SELECT concat(type_code, '/', bible_id, '/', fileset_id), file_name"
    #           " FROM bucket_listing where type_code=%s AND bible_id='PORERV'")
    filenamesMap = db.selectMapList(sql, (typeCode,))  # 1-tuple, was a bare string
    db.close()
    if typeCode == "audio":
        templates = self.audioTemplates
    elif typeCode == "text":
        templates = self.textTemplates
    elif typeCode == "video":
        templates = self.videoTemplates
    else:
        print("ERROR: unknown type_code: %s" % (typeCode))
        return  # was falling through to a NameError on `templates`
    for prefix in filenamesMap.keys():
        filenames = filenamesMap[prefix]
        (numErrors, template, files) = self.parseFileset(templates, prefix, filenames)
        if numErrors == 0:
            self.parsedList.append((template.name, prefix))
        else:
            self.unparsedList.append((numErrors, template.name, prefix))
        for file in files:
            if len(file.errors) > 0:
                print(prefix, file.file, ", ".join(file.errors))
def createAnalysisBucket(self):
    """Populate the 'buckets' analysis table from bucket_listing plus
    the text_plain verse filesets found by VersesReader."""
    self.privateCreateTable("buckets")
    sql = ("SELECT distinct fileset_id, bible_id, type_code, bucket FROM bucket_listing")
    # was the bare module global `config`; this method already uses
    # self.config below, so use it consistently here
    db = SQLUtility(self.config.database_host, self.config.database_port,
                    self.config.database_user, "valid_dbp")
    resultSet = db.select(sql, None)
    results = self.privateTransformType(resultSet)
    db.close()
    verses = VersesReader(self.config)
    verseIds = verses.bibleIdFilesetId()
    for verseId in verseIds:
        parts = verseId.split("/")
        results.append([parts[1], parts[0], "text_plain", "verses"])
    print("num %d insert in buckets" % (len(results)))
    self.out.executeBatch("INSERT INTO buckets VALUES (%s, %s, %s, %s)", results)
def add(inputFileset):
    """Register a fileset as NOT_DONE and record it in run_batch, opening
    the shared connection and creating the run_history row on first use."""
    RunStatus.statusMap[inputFileset.filesetId] = RunStatus.NOT_DONE
    if RunStatus.dbConn is None:  # was ==; identity test for None
        config = Config.shared()
        RunStatus.dbConn = SQLUtility(config)
        sql = "INSERT INTO run_history (username, location, directory) VALUES (%s, %s, %s)"
        username = pwd.getpwuid(os.getuid()).pw_name
        RunStatus.runId = RunStatus.dbConn.executeInsert(
            sql, (username, inputFileset.locationForS3(), inputFileset.filesetPath))
    sql = "INSERT INTO run_batch (run_id, batch, status) VALUES (%s, %s, -1)"
    RunStatus.dbConn.execute(sql, (RunStatus.runId, inputFileset.filesetId))
class CreateDAFileset:
    """Clones an existing bible fileset under a new id carrying a bitrate
    suffix, duplicating its bible_files rows under a new hash id."""

    def __init__(self):
        self.db = SQLUtility("localhost", 3306, "root", "hls_dbp")

    def process(self, filesetId, bitrate):
        """Copy fileset *filesetId* to a new fileset named filesetId+bitrate."""
        resultSet = self.db.select(
            "SELECT hash_id, asset_id, set_type_code, set_size_code, hidden"
            " FROM bible_filesets WHERE id = %s",
            (filesetId,))  # 1-tuple; the original (filesetId) was a bare string
        row = resultSet[0]
        hashId = row[0]
        assetId = row[1]
        setTypeCode = row[2]
        setSizeCode = row[3]
        hidden = row[4]
        newFilesetId = filesetId + bitrate
        newHashId = self.getHashId(assetId, newFilesetId, setTypeCode)
        sql = (
            "INSERT INTO bible_filesets (id, hash_id, asset_id, set_type_code, set_size_code, hidden)"
            " VALUES (%s, %s, %s, %s, %s, %s)")
        self.db.execute(sql, (newFilesetId, newHashId, assetId, setTypeCode,
                              setSizeCode, hidden))
        sql = (
            "INSERT INTO bible_files (hash_id, book_id, chapter_start, chapter_end, verse_start, verse_end,"
            " file_name, file_size, duration)"
            " SELECT %s, book_id, chapter_start, chapter_end, verse_start, verse_end,"
            " file_name, file_size, duration FROM bible_files WHERE hash_id = %s")
        self.db.execute(sql, (newHashId, hashId))

    def getHashId(self, bucket, filesetId, setTypeCode):
        """Return the 12-char md5-based hash id keying bible_filesets rows.
        Update order (filesetId, bucket, setTypeCode) must not change."""
        md5 = hashlib.md5()
        md5.update(filesetId.encode("latin1"))
        md5.update(bucket.encode("latin1"))
        md5.update(setTypeCode.encode("latin1"))
        hash_id = md5.hexdigest()
        return hash_id[:12]
self.dbOut.updateCol(tableName, pkeyNames, updateRows) self.dbOut.delete(tableName, pkeyNames, deleteRows) def biblesVolumneName(self, bibleId, lptsRecords): final = set() for (lptsIndex, lptsRecord) in lptsRecords: volName = lptsRecord.Volumne_Name() if volName != None: final.add(volName) if len(final) == 0: return None elif len(final) > 1: #print("WARN: bible_id %s has multiple volumne_name |%s|" % (bibleId, "|".join(final))) return max(final, key=len) return list(final)[0] ## Unit Test if (__name__ == '__main__'): config = Config() db = SQLUtility(config) dbOut = SQLBatchExec(config) lptsReader = LPTSExtractReader(config) bibles = UpdateDBPBibleTranslations(config, db, dbOut, lptsReader) bibles.insertEngVolumeName() db.close() dbOut.displayStatements() dbOut.displayCounts() #dbOut.execute("test-bibles")
def process(self):
    """Insert one bible_filesets row per distinct fileset found in
    bucket_verse_summary.  (Method of BibleFilesetsTable, whose class
    header precedes this chunk; the script below drives it.)"""
    self.filesets = self.db.select(
        "SELECT distinct asset_id, fileset_id, set_type_code, hash_id, legacy_asset_id FROM bucket_verse_summary",
        None)
    print("num filesets %d" % (len(self.filesets)))
    results = []
    # was `bible.filesets`, reaching for the module-level global below;
    # iterate the rows this instance just loaded
    for fileset in self.filesets:
        assetId = fileset[0]
        filesetId = fileset[1]
        setTypeCode = fileset[2]
        hashId = fileset[3]
        legacyAssetId = fileset[4]
        setSizeCode = "S"  # initial setting updated later
        hidden = "0"
        results.append((filesetId, hashId, assetId, setTypeCode,
                        setSizeCode, hidden, legacyAssetId))
    print("num records to insert %d" % (len(results)))
    self.db.executeBatch(
        "INSERT INTO bible_filesets (id, hash_id, asset_id, set_type_code, set_size_code, hidden, legacy_asset_id) VALUES (%s, %s, %s, %s, %s, %s, %s)",
        results)

config = Config()
db = SQLUtility(config.database_host, config.database_port,
                config.database_user, config.database_output_db_name)
bible = BibleFilesetsTable(config, db)
bible.process()
db.close()
" WHERE bf.id IN (SELECT bf.id FROM bible_files bf" " WHERE bf.hash_id='%s' AND bf.duration IS NULL)" " GROUP BY bf.id, bfvr.id) bfu" " ON bf.id=bfu.ID SET bf.duration=bfu.Duration;") self.dbOut.rawStatement(sql % (hashId,)) if (__name__ == '__main__'): from LPTSExtractReader import * from InputFileset import * from DBPLoadController import * config = Config.shared() lptsReader = LPTSExtractReader(config.filename_lpts_xml) filesets = InputFileset.filesetCommandLineParser(config, lptsReader) db = SQLUtility(config) ctrl = DBPLoadController(config, db, lptsReader) ctrl.validate(filesets) dbOut = SQLBatchExec(config) update = UpdateDBPFilesetTables(config, db, dbOut) for inp in InputFileset.upload: hashId = update.processFileset(inp.typeCode, inp.bibleId, inp.filesetId, inp.fullPath(), inp.csvFilename, inp.databasePath) if inp.typeCode == "video": video = UpdateDBPVideoTables(config, db, dbOut) video.processFileset(inp.filesetPrefix, inp.filenames(), hashId) dbOut.displayStatements() dbOut.displayCounts() dbOut.execute("test-" + inp.filesetId)
# Dead validation code retained from an earlier revision:
# for damId in lptsDamIds:
#     totalDamIdSet.add(damId)
#     sql = ("SELECT hash_id FROM bible_filesets WHERE id='%s' AND LEFT(set_type_code,4) = '%s'")
#     stmt = sql % (damId, typeCode[:4])
#     hashIds = self.db.selectList(stmt, ())
#     for hashId in hashIds:
#         sql = ("SELECT organization_id FROM bible_fileset_copyright_organizations"
#                " WHERE hash_id=%s AND organization_role = 1")
#         orgSet = self.db.selectSet(sql, (hashId))
#         dbpOrgSet = dbpOrgSet.union(orgSet)
#     if hashIds != [] and dbpOrgSet != copyrightOrgSet:
#         print("ERROR:", hashId, damId, copyrightName, " LPTS:", copyrightOrgSet, " DBP:", dbpOrgSet)

if (__name__ == '__main__'):
    config = Config()
    db = SQLUtility(config)
    dbOut = SQLBatchExec(config)
    lptsReader = LPTSExtractReader(config)
    orgs = LoadOrganizations(config, db, dbOut, lptsReader)
    orgs.changeCopyrightOrganizationsPrimaryKey()
    sql = (
        "SELECT b.bible_id, bf.id, bf.set_type_code, bf.set_size_code, bf.asset_id, bf.hash_id"
        " FROM bible_filesets bf JOIN bible_fileset_connections b ON bf.hash_id = b.hash_id"
        " ORDER BY b.bible_id, bf.id, bf.set_type_code"
        " LOCK IN SHARE MODE")
    filesetList = db.select(sql, ())
    print("num filelists", len(filesetList))
    orgs.updateLicensors(filesetList)
    orgs.updateCopyrightHolders(filesetList)
    dbOut.displayStatements()
    dbOut.displayCounts()
class CompareView:
    """Compares each *_view table between the production and test view
    databases: row counts, key matches, and column-level differences."""

    def __init__(self):
        #self.config = config
        self.prod_db = "dbp_views"
        self.test_db = "valid_dbp_views"
        self.db = SQLUtility("localhost", 3306, "root", self.prod_db)
        # {view_name: (key_columns, data_columns)}
        self.tables = {
            "bibles_view": (
                ("fileset_id", "bible_id"),
                ("language_id", "versification", "numeral_system_id", "date",
                 "scope", "script", "derived", "copyright", "priority",
                 "reviewed", "notes")),
            "bible_filesets_view": (
                ("fileset_id", "asset_id", "set_type_code"),
                ("set_size_code", "hidden")),
            "bible_fileset_copyrights_view": (
                ("fileset_id", "asset_id", "set_type_code"),
                ("copyright_date", "copyright", "copyright_description",
                 "open_access")),
            "bible_fileset_copyright_organizations_view": (
                ("fileset_id", "asset_id", "set_type_code", "organization_id"),
                ("organization_role",)),
            "bible_fileset_tags_view": (
                ("fileset_id", "asset_id", "set_type_code", "name", "language_id"),
                ("description", "admin_only", "notes", "iso")),
            "access_group_filesets_view": (
                ("fileset_id", "asset_id", "set_type_code", "access_group_id"),
                ()),
            "bible_files_view": (
                ("fileset_id", "asset_id", "set_type_code", "book_id",
                 "chapter_start", "verse_start"),
                ("chapter_end", "verse_end", "file_name", "file_size",
                 "duration")),
            "bible_file_tags_view": (
                ("fileset_id", "asset_id", "set_type_code", "book_id",
                 "chapter_start", "verse_start", "tag"),
                ("value", "admin_only")),
            "bible_file_video_resolutions_view": (
                ("fileset_id", "asset_id", "set_type_code", "file_name"),
                ("bandwidth", "resolution_width", "resolution_height",
                 "codec", "stream")),
            "bible_file_video_transport_stream_view": (
                ("fileset_id", "asset_id", "set_type_code", "file_name"),
                ("runtime",)),
        }

    def comparePkey(self, table):
        """Compare a view between prod and test using its own key columns."""
        mismatchCount = ("SELECT count(*) FROM %s.%s p WHERE NOT EXISTS"
                         " (SELECT 1 FROM %s.%s t WHERE %s)")
        mismatch = ("SELECT %s FROM %s.%s p WHERE NOT EXISTS"
                    " (SELECT 1 FROM %s.%s t WHERE %s) limit 500")
        self.genericComparePkey(table, mismatchCount, mismatch)

    #def comparePkeyAndFileset(self, table):
    #    mismatchCount = "SELECT count(*) FROM %s.%s p, valid_dbp.bible_filesets f WHERE p.hash_id=f.hash_id AND NOT EXISTS (SELECT 1 FROM %s.%s t WHERE %s)"
    #    mismatch = "SELECT f.id, f.set_type_code, %s FROM %s.%s p, valid_dbp.bible_filesets f WHERE p.hash_id=f.hash_id AND NOT EXISTS (SELECT 1 FROM %s.%s t WHERE %s) limit 500"
    #    self.genericComparePkey(table, mismatchCount, mismatch)

    def genericComparePkey(self, table, sqlMismatchCount, sqlMismatch):
        """Report row counts, key matches, and key mismatches (both
        directions) for one view, using the supplied SQL templates."""
        print(table.upper())
        tableDef = self.tables[table]
        pkey = tableDef[0]
        print("pKey: %s" % (",".join(pkey)))
        columns = tableDef[1]
        sqlCount = "SELECT count(*) FROM %s.%s"
        prodCount = self.db.selectScalar(sqlCount % (self.prod_db, table), None)
        testCount = self.db.selectScalar(sqlCount % (self.test_db, table), None)
        countRatio = int(round(100.0 * testCount / prodCount))
        print("table %s counts: production=%d, test=%d, ratio=%d"
              % (table, prodCount, testCount, countRatio))
        onClause = self.privateOnPhrase(pkey)
        sqlMatchCount = "SELECT count(*) FROM %s.%s p JOIN %s.%s t ON %s"
        matchCount = self.db.selectScalar(
            sqlMatchCount % (self.prod_db, table, self.test_db, table, onClause),
            None)
        #sqlMismatchCount = "SELECT count(*) FROM %s.%s p WHERE NOT EXISTS (SELECT 1 FROM %s.%s t WHERE %s)"
        prodMismatchCount = self.db.selectScalar(
            sqlMismatchCount % (self.prod_db, table, self.test_db, table, onClause),
            None)
        testMismatchCount = self.db.selectScalar(
            sqlMismatchCount % (self.test_db, table, self.prod_db, table, onClause),
            None)
        print("num match = %d, prod mismatch = %d, test mismatch = %d"
              % (matchCount, prodMismatchCount, testMismatchCount))
        selectCols = []
        for key in pkey:
            selectCols.append("p." + key)
        for col in columns:
            selectCols.append("p." + col)
        selectList = ", ".join(selectCols)
        # I think this is the only line is this method that is different
        #sqlMismatch = "SELECT f.id, f.set_type_code, %s FROM %s.%s p, bible_filesets f WHERE p.hash_id=f.hash_id AND NOT EXISTS (SELECT 1 FROM %s.%s t WHERE %s) limit 10"
        prodMismatches = self.db.select(
            sqlMismatch % (selectList, self.prod_db, table, self.test_db, table, onClause),
            None)
        testMismatches = self.db.select(
            sqlMismatch % (selectList, self.test_db, table, self.prod_db, table, onClause),
            None)
        for prodMismatch in prodMismatches:
            print("In prod not test: ", prodMismatch)
        for testMismatch in testMismatches:
            print("In test not prod: ", testMismatch)

    def compareBibleFilesetPKey(self, table):
        """Compare a view's keys against bucket_verse_summary, switching
        this instance to the valid_dbp databases as a side effect."""
        print(table.upper())
        self.prod_db = "valid_dbp"
        self.test_db = "valid_dbp_views"
        prod_table = "bucket_verse_summary"
        tableDef = self.tables[table]
        pkey = ("fileset_id", "asset_id", "set_type_code")
        print("pKey: %s" % (",".join(pkey)))
        columns = pkey
        sqlCount = "SELECT count(*) FROM %s.%s"
        prodCount = self.db.selectScalar(sqlCount % (self.prod_db, prod_table), None)
        testCount = self.db.selectScalar(sqlCount % (self.test_db, table), None)
        countRatio = int(round(100.0 * testCount / prodCount))
        print("table %s counts: production=%d, test=%d, ratio=%d"
              % (table, prodCount, testCount, countRatio))
        onClause = self.privateOnPhrase(pkey)
        sqlMatchCount = "SELECT count(*) FROM %s.%s p JOIN %s.%s t ON %s"
        matchCount = self.db.selectScalar(
            sqlMatchCount % (self.prod_db, prod_table, self.test_db, table, onClause),
            None)
        if table == "bible_files_view":
            sqlMismatchCount = ("SELECT count(*) FROM %s.%s p WHERE NOT EXISTS"
                                " (SELECT 1 FROM %s.%s t WHERE %s)"
                                " AND p.set_type_code NOT IN ('text_plain', 'app')")
        else:
            sqlMismatchCount = ("SELECT count(*) FROM %s.%s p WHERE NOT EXISTS"
                                " (SELECT 1 FROM %s.%s t WHERE %s)")
        prodMismatchCount = self.db.selectScalar(
            sqlMismatchCount % (self.prod_db, prod_table, self.test_db, table, onClause),
            None)
        testMismatchCount = self.db.selectScalar(
            sqlMismatchCount % (self.test_db, table, self.prod_db, prod_table, onClause),
            None)
        print("num match = %d, prod mismatch = %d, test mismatch = %d"
              % (matchCount, prodMismatchCount, testMismatchCount))
        selectCols = []
        for key in pkey:
            selectCols.append("p." + key)
        for col in columns:
            selectCols.append("p." + col)
        selectList = ", ".join(selectCols)
        if table == "bible_files_view":
            sqlMismatch = ("SELECT %s FROM %s.%s p WHERE NOT EXISTS"
                           " (SELECT 1 FROM %s.%s t WHERE %s)"
                           " AND p.set_type_code NOT IN ('text_plain', 'app') limit 100")
        else:
            sqlMismatch = ("SELECT %s FROM %s.%s p WHERE NOT EXISTS"
                           " (SELECT 1 FROM %s.%s t WHERE %s) limit 100")
        prodMismatches = self.db.select(
            sqlMismatch % (selectList, self.prod_db, prod_table, self.test_db, table, onClause),
            None)
        testMismatches = self.db.select(
            sqlMismatch % (selectList, self.test_db, table, self.prod_db, prod_table, onClause),
            None)
        for prodMismatch in prodMismatches:
            print("In prod not test: ", prodMismatch)
        for testMismatch in testMismatches:
            print("In test not prod: ", testMismatch)

    def compareColumns(self, table):
        """For rows matched on the key, diff each data column and report
        prod-empty versus genuinely different values."""
        tableDef = self.tables[table]
        pkey = tableDef[0]
        columns = tableDef[1]
        fields = []
        for col in columns:
            fields.append("p.%s" % (col))
        selectList = ", ".join(fields)
        onClause = self.privateOnPhrase(pkey)
        fields = []
        for key in pkey:
            fields.append("p.%s" % (key))
        orderBy = ", ".join(fields)
        sql = "SELECT %s FROM %s.%s p JOIN %s.%s t ON %s ORDER BY %s"
        prodMatch = self.db.select(
            sql % (selectList, self.prod_db, table, self.test_db, table, onClause, orderBy),
            None)
        testMatch = self.db.select(
            sql % (selectList, self.test_db, table, self.prod_db, table, onClause, orderBy),
            None)
        print("num matches: prod: %d, test: %d" % (len(prodMatch), len(testMatch)))
        for col in range(len(columns)):
            prodEmptyCount = 0
            prodTestDiffCount = 0
            for rowIndex in range(len(prodMatch)):
                prodRow = prodMatch[rowIndex]
                testRow = testMatch[rowIndex]
                #print("compare %s %s %s" % (columns[col], prodRow[col], testRow[col]))
                if prodRow[col] != testRow[col]:
                    if prodRow[col] == None or prodRow[col] == '':
                        prodEmptyCount += 1
                    else:
                        print("DIFF: %s prod: %s test: %s At Prod Row: %s"
                              % (columns[col], prodRow[col], testRow[col], prodRow))
                        prodTestDiffCount += 1
            numMatch = len(prodMatch) - prodEmptyCount - prodTestDiffCount
            print("COUNTS: %s match: %d prod empty: %d different: %d\n"
                  % (columns[col], numMatch, prodEmptyCount, prodTestDiffCount))

    def privateOnPhrase(self, pkey):
        """Build the join condition 'p.k1=t.k1 AND p.k2=t.k2 ...'."""
        onPhases = []
        for key in pkey:
            onPhases.append("p.%s=t.%s" % (key, key))
        return " AND ".join(onPhases)

    #def biblesPkey(self):
    #    sqlMismatch = ("SELECT id FROM dbp_only.bibles WHERE id IN " +
    #                   " (select bible_id from dbp_only.bible_fileset_connections) " +
    #                   " AND id NOT IN (SELECT id FROM valid_dbp.bibles)")
    #    prodMismatches = self.db.selectList(sqlMismatch, None)
    #    for prodMismatch in prodMismatches:
    #        print("prod mismatch2: %s" % (prodMismatch))

    #def filesetId(self):
    #    reader = LPTSExtractReader(self.config)
    #    audioSet = set(reader.getAudioMap().keys())
    #    textSet = set(reader.getTextMap().keys())
    #    videoSet = set(reader.getVideoMap().keys())
    #    allSet = audioSet.union(textSet, videoSet)
    #    sqlMismatch = ("SELECT id FROM dbp_only.bible_filesets WHERE id NOT IN" +
    #                   " (select id from valid_dbp.bible_filesets)")
    #    prodMismatches = self.db.selectList(sqlMismatch, None)
    #    for prodMismatch in prodMismatches:
    #        if prodMismatch[:10] in allSet:
    #            print("Found in fileId %s" % (prodMismatch))

    def close(self):
        self.db.close()
def __init__(self):
    """Build the shared Config, database connection, and LPTS reader."""
    self.config = Config()
    self.db = SQLUtility(self.config)
    self.lptsReader = LPTSExtractReader(self.config)
class OneTimeUpdateDBSWEB: def __init__(self): self.config = Config() self.db = SQLUtility(self.config) self.lptsReader = LPTSExtractReader(self.config) def close(self): self.db.close() def process(self): count = 0 dbpProdMap = self.getDBPProdFiles() for (bibleId, filesetId) in self.getBibleFilesetList(): filesetList = self.getFileset(bibleId, filesetId) filesetList.prodFiles = dbpProdMap.get(bibleId + "/" + filesetId, 0) if filesetList.hasDBSWEBContent(): print(bibleId, filesetId) self.getNumVerses(filesetList) print(filesetList.toString()) self.processTextFormat(bibleId, filesetId, filesetList) self.processTextPlain(bibleId, filesetId, filesetList) #self.updateFilesetGroup(bibleId, filesetId, filesetList) count += 1 #if count > 100: # break def getDBPProdFiles(self): fileCountMap = {} fp = open(self.config.directory_bucket_list + "dbp-prod.txt") for line in fp: if line.startswith("text/"): parts = line.split("/") if len(parts) > 3 and len(parts[1]) < 20 and len(parts[2]) < 20: key = parts[1] + "/" + parts[2] count = fileCountMap.get(key, 0) count += 1 fileCountMap[key] = count fp.close() return fileCountMap def getBibleFilesetList(self): sql = ("SELECT distinct bfc.bible_id, bs.id" " FROM bible_filesets bs" " JOIN bible_fileset_connections bfc ON bfc.hash_id = bs.hash_id") resultSet = self.db.select(sql, ()) return resultSet def getFileset(self, bibleId, filesetId): sql = ("SELECT bs.asset_id, bs.set_type_code, bs.hash_id, count(*)" " FROM bible_filesets bs" " JOIN bible_fileset_connections bfc ON bfc.hash_id=bs.hash_id" " LEFT JOIN bible_files bf ON bs.hash_id=bf.hash_id" " WHERE bfc.bible_id = %s" " AND bs.id = %s" " GROUP BY bs.asset_id, bs.set_type_code, bs.hash_id") resultSet = self.db.select(sql, (bibleId, filesetId)) #for row in resultSet: # print(row) bibleFileset = BibleFileset(resultSet) (lptsRecord, index) = self.lptsReader.getLPTSRecordLoose("text", bibleId, filesetId) if lptsRecord != None: bibleFileset.stockNo = lptsRecord.Reg_StockNumber() return 
bibleFileset def getNumVerses(self, fileset): sql = "SELECT count(*) FROM bible_verses WHERE hash_id = %s" if fileset.dbsFormat != None: fileset.dbsFormat.numVerses = self.db.selectScalar(sql, (fileset.dbsFormat.hashId)) if fileset.dbsPlain != None: fileset.dbsPlain.numVerses = self.db.selectScalar(sql, (fileset.dbsPlain.hashId)) if fileset.fcbhFormat != None: fileset.fcbhFormat.numVerses = self.db.selectScalar(sql, (fileset.fcbhFormat.hashId)) if fileset.fcbhPlain != None: fileset.fcbhPlain.numVerses = self.db.selectScalar(sql, (fileset.fcbhPlain.hashId)) def processTextFormat(self, bibleId, filesetId, fileset): if fileset.dbsFormat != None: hashId = fileset.dbsFormat.hashId if fileset.dbsFormat.numFiles > 10 and fileset.prodFiles > 10: if fileset.fcbhFormat == None: fileset.dbsFormat2fcbhFormat = True if fileset.stockNo != None: print("WARN1 %s %s/%s dbs-web has LPTS record %s and %d files in dbp-prod." % (hashId, bibleId, filesetId, fileset.stockNo, fileset.dbsFormat.numFiles)) else: print("WARN2 %s %s/%s dbs-web has %d files in dbp-prod, but no LPTS record." % (hashId, bibleId, filesetId, fileset.dbsFormat.numFiles)) elif fileset.fcbhFormat != None and (fileset.fcbhFormat.numFiles == 0 or fileset.prodFiles == 0): fileset.dbpFormatDelete = True fileset.dbsFormat2fcbhFormat = True #print("DELETE2 %s %s/%s dbp-prod text_format" % (fileset.fcbhFormat.hashId, bibleId, filesetId)) print("WARN99 %s %s/%s dbs-web has files in dbp-prod and identical dbp-prod fileset with no files." % (hashId, bibleId, filesetId)) else: if fileset.stockNo != None: print("WARN3 %s %s/%s dbs-web LPTS record %s, but no files in dbp-prod." % (hashId, bibleId, filesetId, fileset.stockNo)) else: print("WARN4: %s %s/%s dbs-web text_format has no files in dbp-prod and no LPTS record." % (hashId, bibleId, filesetId)) if fileset.dbsFormat.numVerses > 100: print("WARN98: %s %s/%s dbs-web text_format has %d verses." 
% (hashId, fileset.dbsFormat.numFiles, bibleId, filesetId)) def processTextPlain(self, bibleId, filesetId, fileset): if fileset.dbsPlain != None: hashId = fileset.dbsPlain.hashId if fileset.fcbhFormat != None or fileset.dbsFormat2fcbhFormat: if fileset.dbsPlain.numVerses > 100: if fileset.fcbhPlain == None: fileset.dbsPlain2fcbhPlain = True print("CHANGE3 %s %s/%s dbs-web text_plain to db" % (hashId, bibleId, filesetId)) elif fileset.fcbhPlain != None and fileset.fcbhPlain.numVerses == 0: fileset.dbpPlainDelete = True fileset.dbsPlain2fcbhPlain = True print("DELETE4 %s %s/%s dbp-prod text_plain" % (fileset.fcbhFormat.hashId, bibleId, filesetId)) print("CHANGE3 %s %s/%s dbs-web text_plain to db" % (hashId, bibleId, filesetId)) if fileset.dbsPlain.numVerses == 0: print("WARn: %s %s/%s dbs-web text_plain has no verses" % (hashId, bibleId, filesetId)) if fileset.dbsPlain.numFiles > 0: print("WARn: %s %s/%s dbs-web text_plain has %d files" % (hashId, bibleId, filesetId, fileset.dbsPlain.numFiles)) def updateFilesetGroup(self, bibleId, filesetId, fileset): self.statements = [] if fileset.dbpFormatDelete: self.deleteFileset(bibleId, filesetId, "text_format", fileset) if fileset.dbpPlainDelete: self.deleteFileset(bibleId, filesetId, "text_plain", fileset) if fileset.dbsFormat2fcbhFormat: self.replaceFileset(bibleId, filesetId, "text_format", fileset.dbsFormat) if fileset.dbsPlain2fcbhPlain: self.replaceFileset(bibleId, filesetId, "text_plain", fileset.dbsPlain) #self.db.executeTransaction(self.statements) self.db.displayTransaction(self.statements) def deleteFileset(self, bibleId, filesetId, typeCode, fileset): stmt = "DELETE bible_fileset WHERE hash_id = %s" self.statements.append((stmt, [(fileset.hashId)])) def replaceFileset(self, bibleId, filesetId, typeCode, fileset): bucket = "db" if typeCode == "text_plain" else "dbp-prod" newHashId = self.getHashId(bucket, filesetId, typeCode) stmt = "UPDATE bible_filesets SET hash_id = %s, asset_id = %s WHERE hash_id = %s" 
self.statements.append((stmt, [(newHashId, bucket, fileset.hashId)])) tableNames = ("access_group_filesets", "bible_fileset_connections", "bible_fileset_tags", "bible_fileset_copyrights", "bible_fileset_copyright_organizations", "bible_files", "bible_verses") for table in tableNames: stmt = "UPDATE %s" % (table) + " SET hash_id = %s WHERE hash_id = %s" self.statements.append((stmt, [(newHashId, fileset.hashId)])) #attrNames = ("hash_id", "id", "asset_id", "set_type_code", "set_size_code", "hidden", "created_at", "updated_at") #attrNames = ("hash_id", "access_group_id", "created_at", "updated_at") #attrNames = ("hash_id", "bible_id", "created_at", "updated_at") #attrNames = ("hash_id", "name", "description", "admin_only", "notes", "iso", "language_id", "created_at", "updated_at") #attrNames = ("hash_id", "copyright_date", "copyright", "copyright_description", "created_at", "updated_at", "open_access") #attrNames = ("hash_id", "organization_id", "organization_role", "created_at", "updated_at") #attrNames = ("hash_id", "book_id", "chapter_start", "chapter_end", "verse_start", "verse_end", # "file_name", "file_size", "duration", "created_at", "updated_at") #attrNames = ("hash_id", "book_id", "chapter", "verse_start", "verse_end", "verse_text") def getHashId(self, bucket, filesetId, setTypeCode): md5 = hashlib.md5() md5.update(filesetId.encode("latin1")) md5.update(bucket.encode("latin1")) md5.update(setTypeCode.encode("latin1")) hash_id = md5.hexdigest() return hash_id[:12] """
def __init__(self): self.db = SQLUtility("localhost", 3306, "root", "hls_dbp")
# Command-line driver: regenerate secondary files for audio filesets in a
# range of bible ids.  NOTE(review): this block is truncated in this chunk --
# the statement beginning "(dataList," continues beyond the visible source.
if (__name__ == '__main__'):
	from PreValidate import *
	from InputFileset import *
	# argv[1] is presumably the profile consumed by Config.shared() -- confirm.
	if len(sys.argv) < 4:
		print(
			"Usage: UpdateDBPBibleFilesSecondary.py your_profile starting_bible_id ending_bible_id"
		)
		sys.exit()
	startingBibleId = sys.argv[2]
	endingBibleId = sys.argv[3]
	config = Config.shared()
	db = SQLUtility(config)
	dbOut = SQLBatchExec(config)
	update = UpdateDBPBibleFilesSecondary(config, db, dbOut)
	s3Client = AWSSession.shared().s3Client
	lptsReader = LPTSExtractReader(config.filename_lpts_xml)
	# Select audio filesets (10-char ids only) within the requested bible-id range.
	sql = (
		"SELECT c.bible_id, f.id, f.hash_id FROM bible_filesets f, bible_fileset_connections c"
		" WHERE f.hash_id = c.hash_id AND set_type_code in ('audio', 'audio_drama') AND length(f.id) = 10"
		" AND c.bible_id >= %s AND c.bible_id <= %s")
	resultSet = db.select(sql, (startingBibleId, endingBibleId))
	for (bibleId, filesetId, hashId) in resultSet:
		print(bibleId, filesetId, hashId)
		location = "s3://%s" % (config.s3_bucket, )
		filesetPath = "audio/%s/%s" % (bibleId, filesetId)
		(dataList,
class UpdateDBPVerseTable:
	"""Copies text_plain verse filesets from SOURCE_DATABASE into the target
	bible_filesets / bible_verses tables, recomputing each fileset's hash id
	and set-size code."""

	def __init__(self, config):
		"""Open the database and cache the OT / NT book-id sets used for set-size classification."""
		self.db = SQLUtility(config)
		self.OT = self.db.selectSet(
			"SELECT id FROM books WHERE book_testament = 'OT'", ())
		self.NT = self.db.selectSet(
			"SELECT id FROM books WHERE book_testament = 'NT'", ())
		self.statements = []

	def loadVerseTable(self):
		"""Copy every text_plain fileset's verses into the target tables,
		executing one transaction per fileset."""
		query = "SELECT id, hash_id FROM %s.bible_filesets WHERE set_type_code = 'text_plain'" % (
			SOURCE_DATABASE)
		filesetIdMap = self.db.selectMap(query, ())
		print(len(filesetIdMap.keys()), "verse filesets found")
		# Process filesets in deterministic (sorted-by-id) order.
		for filesetId, hashId in sorted(filesetIdMap.items()):
			print(filesetId)
			self.statements = []
			query = "SELECT distinct book_id FROM " + SOURCE_DATABASE + ".bible_verses WHERE hash_id = %s"
			bookIdList = self.db.selectSet(query, (hashId, ))
			if len(bookIdList) == 0:
				print("WARNING: plain_text filesetId %s has no verses." % (filesetId))
			else:
				# Classify the fileset's size from which testaments its books cover.
				otBooks = bookIdList.intersection(self.OT)
				ntBooks = bookIdList.intersection(self.NT)
				setSizeCode = UpdateDBPDatabase.getSetSizeCode(ntBooks, otBooks)
				bucket = TARGET_ASSET_ID
				setTypeCode = 'text_plain'
				newHashId = UpdateDBPDatabase.getHashId(bucket, filesetId, setTypeCode)
				insertFileset = (
					"INSERT INTO bible_filesets(id, hash_id, asset_id, set_type_code,"
					" set_size_code, hidden) VALUES (%s, %s, %s, %s, %s, 0)")
				filesetRow = (filesetId, newHashId, bucket, setTypeCode, setSizeCode)
				self.statements.append((insertFileset, [filesetRow]))
				selectVerses = (
					"SELECT book_id, chapter, verse_start, verse_end, verse_text FROM " +
					SOURCE_DATABASE + ".bible_verses WHERE hash_id = %s")
				# Re-key every verse row onto the newly computed hash id.
				verseRows = [(newHashId, ) + row
					for row in self.db.select(selectVerses, (hashId, ))]
				insertVerses = (
					"INSERT INTO bible_verses (hash_id, book_id, chapter, verse_start, verse_end, verse_text)"
					" VALUES (%s, %s, %s, %s, %s, %s)")
				self.statements.append((insertVerses, verseRows))
				# NOTE(review): transaction executed per fileset once its statements are built.
				self.db.executeTransaction(self.statements)

	def dropIndex(self):
		"""Drop the fulltext index before bulk loading (speeds up the inserts)."""
		self.db.execute("ALTER TABLE bible_verses DROP INDEX verse_text", ())

	def addIndex(self):
		"""Recreate the fulltext index after loading completes."""
		self.db.execute("ALTER TABLE bible_verses ADD FULLTEXT INDEX verse_text (verse_text)", ())