def check_dump_integrity(hostname, port, dbname, username, password, authdb,
                         dump_dir, check_type=COUNT_COLLECTIONS):
    """Compare a MongoDB database with its on-disk dump and report the result.

    The function connects to ``hostname:port``, keeps the collection names of
    ``dbname`` that match ``DB_RELEVANT_COLLECTIONS_REGEX`` (capture group 1),
    lists the directory ``<dump_dir>/<dbname>`` and keeps the file names that
    match ``DUMP_RELEVANT_COLLECTIONS_REGEX`` (capture group 1).  Depending on
    ``check_type`` it then compares either the number of collections or the
    number of documents on both sides.

    Every result line is written with ``logging.info`` and ``print``; nothing
    is returned.

    Args:
        hostname: MongoDB host.
        port: MongoDB port; converted with ``int()``.
        dbname: Database to check; also the dump sub-directory name.
        username: User to authenticate as.  Authentication is skipped unless
            both ``username`` and ``password`` are truthy.
        password: Password for ``username``.
        authdb: Database to authenticate against.
        dump_dir: Directory that contains one sub-directory per dumped
            database.
        check_type: ``COUNT_COLLECTIONS`` (default) or ``COUNT_DOCUMENTS``.
            Any other value performs no comparison.

    Returns:
        None.
    """
    # Local import: only needed to name the "no such file or directory" code.
    import errno

    def report(message):
        """Emit one message to both the log and stdout."""
        logging.info(message)
        print(message)

    def relevant_names(names, pattern):
        """Return group(1) of every matching name, shortest first, then A-Z."""
        matches = (pattern.search(name) for name in names)
        return sorted((m.group(1) for m in matches if m),
                      key=lambda name: (len(name), name))

    def percent(part, whole):
        """Return ``part`` as a percentage of ``whole`` (0 when whole is 0)."""
        # 100.0 forces true division on interpreters where int / int truncates.
        return 100.0 * part / whole if whole else 0

    mongo = pymongo.MongoClient(hostname, int(port))
    try:
        if username and password:
            mongo[authdb].authenticate(username, password)
        db = mongo[dbname]

        # Relevant collections in the live database.
        db_relevant_collections = relevant_names(
            db.collection_names(), DB_RELEVANT_COLLECTIONS_REGEX)

        # Relevant collection files in the dump directory.
        dump_exists = True
        dump_path = os.path.join(dump_dir, dbname)
        dump_files = []
        try:
            dump_files = os.listdir(dump_path)
        except OSError as e:
            if e.errno == errno.ENOENT:
                dump_exists = False
            else:
                # Best effort: report the problem and carry on with an empty
                # file list.
                logging.error(e)
                print(e)
        dump_relevant_collections = relevant_names(
            dump_files, DUMP_RELEVANT_COLLECTIONS_REGEX)

        # Dump format (.json or .bson).  The emptiness guard matters: with a
        # missing or empty dump there is no first element to inspect.
        dump_format = 'bson'
        if (dump_relevant_collections
                and dump_relevant_collections[0].endswith('.json')):
            dump_format = 'json'

        def collection_name(dump_file):
            """Strip the dump-format extension from a dump file name."""
            return dump_file.split('.' + dump_format, 1)[0]

        # CHECK NUMBER OF COLLECTIONS
        if check_type == COUNT_COLLECTIONS:
            num_collections_in_db = len(db_relevant_collections)
            num_collections_dumped = len(dump_relevant_collections)

            if not dump_exists:
                verdict = 'DOES NOT EXIST. '
            elif num_collections_dumped < num_collections_in_db:
                verdict = 'IS MISSING COLLECTIONS. '
            elif num_collections_dumped > num_collections_in_db:
                verdict = 'HAS TOO MANY COLLECTIONS. '
            else:
                verdict = 'IS OK ON COLLECTIONS. '

            report('\n' + 'DUMP FOR {} '.format(dbname) + verdict
                   + 'Number of collections in database: {}, Number of collections dumped: {}'.format(
                       num_collections_in_db, num_collections_dumped))

            # List the collections that are absent from the dump.
            if dump_exists and num_collections_dumped < num_collections_in_db:
                dumped_names = {collection_name(dump_file)
                                for dump_file in dump_relevant_collections}
                # db_relevant_collections is already sorted, so the filtered
                # list keeps the length-then-name order.
                missing_collections = [
                    coll for coll in db_relevant_collections
                    if coll not in dumped_names
                ]
                report('\n' + 'Missing Collections: {}'.format(
                    missing_collections))

        # CHECK NUMBER OF DOCUMENTS
        elif check_type == COUNT_DOCUMENTS:
            report('\n' + 'Counting number of documents in {} database'.format(
                dbname))

            total_documents_in_db = 0
            db_collection_doc_counts = {}
            for coll in db_relevant_collections:
                num_docs_in_coll = db[coll].count()
                total_documents_in_db += num_docs_in_coll
                db_collection_doc_counts[coll] = num_docs_in_coll
                report("Database {} {} document count: {}".format(
                    dbname, coll, num_docs_in_coll))

            report('\n' + 'Counting number of documents in {} dump'.format(
                dbname))

            total_documents_dumped = 0
            dump_collection_doc_counts = {}
            for dump_file in dump_relevant_collections:
                collection = SmappCollection(
                    dump_format, os.path.join(dump_path, dump_file))
                num_docs_in_coll = collection.count_tweets()
                total_documents_dumped += num_docs_in_coll

                name = collection_name(dump_file)
                dump_collection_doc_counts[name] = num_docs_in_coll

                # A dumped collection may be unknown to the database; treat
                # its database count as 0 instead of failing on the lookup.
                num_docs_in_db_coll = db_collection_doc_counts.get(name, 0)
                num_docs_missing = num_docs_in_db_coll - num_docs_in_coll
                percentage_docs_missing = percent(
                    num_docs_missing, num_docs_in_db_coll)

                logging.info("Dump {} {} document count: {}".format(
                    dbname, dump_file, num_docs_in_coll))
                print("Dump {0} {1} document count: {2} (Missing {3}, {4:.2f}%)".format(
                    dbname, dump_file, num_docs_in_coll, num_docs_missing,
                    percentage_docs_missing))

            # Overall number and percentage of missing documents.
            total_docs_missing = total_documents_in_db - total_documents_dumped
            percentage_total_docs_missing = percent(
                total_docs_missing, total_documents_in_db)

            if not dump_exists:
                verdict = 'DOES NOT EXIST. '
            elif total_documents_dumped < total_documents_in_db:
                verdict = 'IS MISSING DOCUMENTS. '
            elif total_documents_dumped > total_documents_in_db:
                verdict = 'HAS TOO MANY DOCUMENTS. '
            else:
                verdict = 'IS OK ON DOCUMENTS. '

            report('\n' + 'DUMP FOR {} '.format(dbname) + verdict
                   + 'Total documents in database: {0}, Total documents dumped: {1} (Missing {2}, {3:.2f}%)'.format(
                       total_documents_in_db, total_documents_dumped,
                       total_docs_missing, percentage_total_docs_missing))

            # List database collections whose dump count is absent or differs.
            if dump_exists and total_documents_dumped < total_documents_in_db:
                collections_missing_docs = sorted(
                    (coll for coll, count in db_collection_doc_counts.items()
                     if dump_collection_doc_counts.get(coll) != count),
                    key=lambda name: (len(name), name))
                report('\n' + 'Collections Missing Documents: {}'.format(
                    collections_missing_docs))
    finally:
        # Release the connection on every exit path.
        mongo.close()
def test_count_tweets(self):
    """Counting the tweets in the configured valid BSON file gives 1187."""
    # The fixture path is resolved relative to this test file's directory.
    here = os.path.dirname(os.path.realpath(__file__))
    fixture_path = '{}/{}'.format(here, config['bson']['valid'])

    tweet_total = SmappCollection('bson', fixture_path).count_tweets()

    self.assertEqual(1187, tweet_total)
def check_dump_integrity(hostname, port, dbname, username, password, authdb, dump_dir, check_type=COUNT_COLLECTIONS):
    """Check a database dump against the live MongoDB database.

    Collection names in ``dbname`` are filtered through
    ``DB_RELEVANT_COLLECTIONS_REGEX`` and the file names found in
    ``<dump_dir>/<dbname>`` through ``DUMP_RELEVANT_COLLECTIONS_REGEX``; in
    both cases capture group 1 of each match is kept.  With
    ``check_type == COUNT_COLLECTIONS`` the two lists are compared by size and
    the collections absent from the dump are listed.  With
    ``check_type == COUNT_DOCUMENTS`` the documents of every collection are
    counted on both sides and the differences are listed.

    Output goes to ``logging`` and to stdout via ``print``.

    Args:
        hostname: MongoDB host to connect to.
        port: MongoDB port (anything ``int()`` accepts).
        dbname: Name of the database and of its dump sub-directory.
        username: User name; authentication happens only when both
            ``username`` and ``password`` are truthy.
        password: Password for ``username``.
        authdb: Database used for authentication.
        dump_dir: Parent directory of the per-database dump directories.
        check_type: ``COUNT_COLLECTIONS`` or ``COUNT_DOCUMENTS``; other values
            skip the comparison.

    Returns:
        None.
    """
    # Function-scope import so the block does not rely on a module-level one.
    import errno

    # Sort key used throughout: shorter names first, ties broken A-Z.
    length_then_name = lambda name: (len(name), name)

    mongo = pymongo.MongoClient(hostname, int(port))
    try:
        if username and password:
            mongo[authdb].authenticate(username, password)
        db = mongo[dbname]

        # Get a list of relevant collections from the database
        db_relevant_collections = []
        for coll in db.collection_names():
            match = DB_RELEVANT_COLLECTIONS_REGEX.search(coll)
            if match:
                db_relevant_collections.append(match.group(1))
        db_relevant_collections.sort(key=length_then_name)

        # Get a list of relevant collections from the dump
        dump_path = os.path.join(dump_dir, dbname)
        dump_exists = True
        try:
            dump_collections = os.listdir(dump_path)
        except OSError as e:
            dump_collections = []
            if e.errno == errno.ENOENT:
                dump_exists = False
            else:
                # Any other failure is reported, then treated as an empty dump.
                logging.error(e)
                print(e)

        dump_relevant_collections = []
        for dump_file in dump_collections:
            match = DUMP_RELEVANT_COLLECTIONS_REGEX.search(dump_file)
            if match:
                dump_relevant_collections.append(match.group(1))
        dump_relevant_collections.sort(key=length_then_name)

        # Find out format of dump (.json or .bson).  Only look at the first
        # file when there is one; a missing or empty dump keeps the default.
        if dump_relevant_collections and dump_relevant_collections[0].endswith('.json'):
            dump_format = 'json'
        else:
            dump_format = 'bson'
        extension = '.' + dump_format

        log_output = 'DUMP FOR {} '.format(dbname)

        # CHECK NUMBER OF COLLECTIONS
        if check_type == COUNT_COLLECTIONS:
            num_collections_in_db = len(db_relevant_collections)
            num_collections_dumped = len(dump_relevant_collections)

            # Print integrity of number of collections in dump
            if not dump_exists:
                log_output += 'DOES NOT EXIST. '
            elif num_collections_dumped < num_collections_in_db:
                log_output += 'IS MISSING COLLECTIONS. '
            elif num_collections_dumped > num_collections_in_db:
                log_output += 'HAS TOO MANY COLLECTIONS. '
            else:
                log_output += 'IS OK ON COLLECTIONS. '
            log_output += 'Number of collections in database: {}, Number of collections dumped: {}'.format(
                num_collections_in_db, num_collections_dumped)
            logging.info('\n' + log_output)
            print('\n' + log_output)

            # Print list of any collections missing from dump
            if dump_exists and num_collections_dumped < num_collections_in_db:
                # A set gives constant-time membership tests below.
                dumped_names = set(
                    dump_coll.split(extension, 1)[0]
                    for dump_coll in dump_relevant_collections)
                missing_collections = sorted(
                    (coll for coll in db_relevant_collections if coll not in dumped_names),
                    key=length_then_name)
                missing_output = '\n' + 'Missing Collections: {}'.format(missing_collections)
                logging.info(missing_output)
                print(missing_output)

        # CHECK NUMBER OF DOCUMENTS
        elif check_type == COUNT_DOCUMENTS:
            heading = '\n' + 'Counting number of documents in {} database'.format(dbname)
            logging.info(heading)
            print(heading)

            # Sum total documents in db, remembering each collection's count
            total_documents_in_db = 0
            db_collection_doc_counts = {}
            for coll in db_relevant_collections:
                num_docs_in_coll = db[coll].count()
                total_documents_in_db += num_docs_in_coll
                db_collection_doc_counts[coll] = num_docs_in_coll
                line = "Database {} {} document count: {}".format(dbname, coll, num_docs_in_coll)
                logging.info(line)
                print(line)

            heading = '\n' + 'Counting number of documents in {} dump'.format(dbname)
            logging.info(heading)
            print(heading)

            # Sum up total number of documents in dump
            total_documents_dumped = 0
            dump_collection_doc_counts = {}
            for coll in dump_relevant_collections:
                collection = SmappCollection(dump_format, os.path.join(dump_path, coll))
                num_docs_in_coll = collection.count_tweets()
                total_documents_dumped += num_docs_in_coll

                # Save document count for dump collection
                coll_name = coll.split(extension, 1)[0]
                dump_collection_doc_counts[coll_name] = num_docs_in_coll

                # Number and percentage of missing documents for this
                # collection.  A collection present only in the dump counts as
                # 0 documents in the database rather than raising KeyError.
                num_docs_in_db_coll = db_collection_doc_counts.get(coll_name, 0)
                num_docs_missing = num_docs_in_db_coll - num_docs_in_coll
                percentage_docs_missing = 0
                if num_docs_in_db_coll != 0:
                    # 100.0 keeps this a float division on every interpreter.
                    percentage_docs_missing = 100.0 * num_docs_missing / num_docs_in_db_coll

                logging.info("Dump {} {} document count: {}".format(dbname, coll, num_docs_in_coll))
                print("Dump {0} {1} document count: {2} (Missing {3}, {4:.2f}%)".format(
                    dbname, coll, num_docs_in_coll, num_docs_missing, percentage_docs_missing))

            # Calculate number and percentage of missing documents overall
            total_docs_missing = total_documents_in_db - total_documents_dumped
            percentage_total_docs_missing = 0
            if total_documents_in_db != 0:
                percentage_total_docs_missing = 100.0 * total_docs_missing / total_documents_in_db

            # Print integrity of number of documents in dump
            if not dump_exists:
                log_output += 'DOES NOT EXIST. '
            elif total_documents_dumped < total_documents_in_db:
                log_output += 'IS MISSING DOCUMENTS. '
            elif total_documents_dumped > total_documents_in_db:
                log_output += 'HAS TOO MANY DOCUMENTS. '
            else:
                log_output += 'IS OK ON DOCUMENTS. '
            log_output += 'Total documents in database: {0}, Total documents dumped: {1} (Missing {2}, {3:.2f}%)'.format(
                total_documents_in_db, total_documents_dumped, total_docs_missing, percentage_total_docs_missing)
            logging.info('\n' + log_output)
            print('\n' + log_output)

            # Print list of any collections from dump missing documents
            if dump_exists and total_documents_dumped < total_documents_in_db:
                collections_missing_docs = [
                    coll for coll, count in db_collection_doc_counts.items()
                    if coll not in dump_collection_doc_counts
                    or dump_collection_doc_counts[coll] != count
                ]
                collections_missing_docs.sort(key=length_then_name)
                missing_output = '\n' + 'Collections Missing Documents: {}'.format(collections_missing_docs)
                logging.info(missing_output)
                print(missing_output)
    finally:
        # Always close the client so the connection is not leaked.
        mongo.close()