Example #1
0
def check_dump_integrity(hostname,
                         port,
                         dbname,
                         username,
                         password,
                         authdb,
                         dump_dir,
                         check_type=COUNT_COLLECTIONS):
    """Compare a MongoDB database with its on-disk dump and report gaps.

    The collections of ``dbname`` whose names match
    ``DB_RELEVANT_COLLECTIONS_REGEX`` are compared with the files found in
    ``<dump_dir>/<dbname>`` whose names match
    ``DUMP_RELEVANT_COLLECTIONS_REGEX``.  Every message is sent to the
    ``logging`` module and printed to stdout; nothing is returned.

    Args:
        hostname: MongoDB host passed to ``pymongo.MongoClient``.
        port: MongoDB port; converted with ``int()``.
        dbname: Name of the database, also the name of the dump
            sub-directory inside ``dump_dir``.
        username: User to authenticate as (skipped when falsy).
        password: Password for ``username`` (skipped when falsy).
        authdb: Database against which the credentials are checked.
        dump_dir: Directory containing one sub-directory per dumped
            database; a trailing ``/`` is optional.
        check_type: ``COUNT_COLLECTIONS`` compares the number of
            collections, ``COUNT_DOCUMENTS`` compares document counts.
            Any other value produces no report.
    """
    mongo = pymongo.MongoClient(hostname, int(port))
    try:
        if username and password:
            mongo[authdb].authenticate(username, password)
        db = mongo[dbname]

        # Relevant collections on the database side.
        db_relevant_collections = _relevant_names(
            DB_RELEVANT_COLLECTIONS_REGEX, db.collection_names())

        # Relevant collections on the dump side.
        dump_exists = True
        dump_path = dump_dir + dbname if dump_dir[
            -1:] == '/' else dump_dir + '/' + dbname
        dump_collections = []
        try:
            dump_collections = os.listdir(dump_path)
        except OSError as e:
            # errno 2 is ENOENT: the dump directory is simply not there,
            # which is reported below instead of being treated as an error.
            if e.errno == 2:
                dump_exists = False
            else:
                logging.error(e)
                print(e)
        dump_relevant_collections = _relevant_names(
            DUMP_RELEVANT_COLLECTIONS_REGEX, dump_collections)

        # Find out the format of the dump (.json or .bson) from its first
        # file.  A missing or empty dump keeps the 'bson' default instead
        # of raising IndexError, so the "DOES NOT EXIST" report below is
        # actually reached.
        dump_format = 'bson'
        if (dump_relevant_collections and
                dump_relevant_collections[0].partition('.')[2] == 'json'):
            dump_format = 'json'

        if check_type == COUNT_COLLECTIONS:
            _report_collection_counts(dbname, dump_exists, dump_format,
                                      db_relevant_collections,
                                      dump_relevant_collections)
        elif check_type == COUNT_DOCUMENTS:
            _report_document_counts(db, dbname, dump_exists, dump_path,
                                    dump_format, db_relevant_collections,
                                    dump_relevant_collections)
    finally:
        # Release the connection even when a check raises.
        mongo.close()


def _by_length_then_name(names):
    """Return ``names`` sorted by length, ties broken alphabetically."""
    return sorted(names, key=lambda name: (len(name), name))


def _relevant_names(regex, names):
    """Return group 1 of every name matched by ``regex``, sorted."""
    matches = (regex.search(name) for name in names)
    return _by_length_then_name(
        match.group(1) for match in matches if match)


def _log_and_print(message):
    """Send ``message`` to the log at INFO level and to stdout."""
    logging.info(message)
    print(message)


def _percentage(part, whole):
    """Return ``part`` as a float percentage of ``whole`` (0.0 if empty)."""
    if whole == 0:
        return 0.0
    # 100.0 forces true division on Python 2 as well.
    return 100.0 * part / whole


def _report_collection_counts(dbname, dump_exists, dump_format, db_names,
                              dump_names):
    """Report whether the dump holds as many collections as the database."""
    num_collections_in_db = len(db_names)
    num_collections_dumped = len(dump_names)

    log_output = 'DUMP FOR {} '.format(dbname)
    if not dump_exists:
        log_output += 'DOES NOT EXIST. '
    elif num_collections_dumped < num_collections_in_db:
        log_output += 'IS MISSING COLLECTIONS. '
    elif num_collections_dumped > num_collections_in_db:
        log_output += 'HAS TOO MANY COLLECTIONS. '
    else:
        log_output += 'IS OK ON COLLECTIONS. '
    log_output += 'Number of collections in database: {}, Number of collections dumped: {}'.format(
        num_collections_in_db, num_collections_dumped)
    _log_and_print('\n' + log_output)

    # List the collections that never made it into the dump.
    if dump_exists and num_collections_dumped < num_collections_in_db:
        dumped = {name.split('.' + dump_format, 1)[0] for name in dump_names}
        missing_collections = _by_length_then_name(
            coll for coll in db_names if coll not in dumped)
        _log_and_print('\n' +
                       'Missing Collections: {}'.format(missing_collections))


def _report_document_counts(db, dbname, dump_exists, dump_path, dump_format,
                            db_names, dump_names):
    """Report per-collection and total document counts, db versus dump."""
    _log_and_print(
        '\n' + 'Counting number of documents in {} database'.format(dbname))

    total_documents_in_db = 0
    db_collection_doc_counts = {}
    for coll in db_names:
        num_docs_in_coll = db[coll].count()
        total_documents_in_db += num_docs_in_coll
        db_collection_doc_counts[coll] = num_docs_in_coll
        _log_and_print("Database {} {} document count: {}".format(
            dbname, coll, num_docs_in_coll))

    _log_and_print(
        '\n' + 'Counting number of documents in {} dump'.format(dbname))

    total_documents_dumped = 0
    dump_collection_doc_counts = {}
    for coll in dump_names:
        collection = SmappCollection(dump_format, dump_path + '/' + coll)
        num_docs_in_coll = collection.count_tweets()
        total_documents_dumped += num_docs_in_coll

        coll_name = coll.split('.' + dump_format, 1)[0]
        dump_collection_doc_counts[coll_name] = num_docs_in_coll

        # A dumped collection that is absent from the database counts as
        # having zero documents there rather than raising KeyError.
        num_docs_in_db_coll = db_collection_doc_counts.get(coll_name, 0)
        num_docs_missing = num_docs_in_db_coll - num_docs_in_coll
        percentage_docs_missing = _percentage(num_docs_missing,
                                              num_docs_in_db_coll)

        # The log gets the bare count; stdout also shows what is missing.
        logging.info("Dump {} {} document count: {}".format(
            dbname, coll, num_docs_in_coll))
        print("Dump {0} {1} document count: {2} (Missing {3}, {4:.2f}%)".
              format(dbname, coll, num_docs_in_coll, num_docs_missing,
                     percentage_docs_missing))

    total_docs_missing = total_documents_in_db - total_documents_dumped
    percentage_total_docs_missing = _percentage(total_docs_missing,
                                                total_documents_in_db)

    log_output = 'DUMP FOR {} '.format(dbname)
    if not dump_exists:
        log_output += 'DOES NOT EXIST. '
    elif total_documents_dumped < total_documents_in_db:
        log_output += 'IS MISSING DOCUMENTS. '
    elif total_documents_dumped > total_documents_in_db:
        log_output += 'HAS TOO MANY DOCUMENTS. '
    else:
        log_output += 'IS OK ON DOCUMENTS. '
    log_output += 'Total documents in database: {0}, Total documents dumped: {1} (Missing {2}, {3:.2f}%)'.format(
        total_documents_in_db, total_documents_dumped, total_docs_missing,
        percentage_total_docs_missing)
    _log_and_print('\n' + log_output)

    # List every database collection whose dump count differs or is absent.
    if dump_exists and total_documents_dumped < total_documents_in_db:
        collections_missing_docs = _by_length_then_name(
            coll for coll, count in db_collection_doc_counts.items()
            if dump_collection_doc_counts.get(coll) != count)
        _log_and_print('\n' + 'Collections Missing Documents: {}'.format(
            collections_missing_docs))
 def test_count_tweets(self):
     """count_tweets() on the configured valid BSON fixture yields 1187."""
     # The fixture path in the config is taken relative to this test file.
     here = os.path.dirname(os.path.realpath(__file__))
     fixture = '{}/{}'.format(here, config['bson']['valid'])
     tweet_total = SmappCollection('bson', fixture).count_tweets()
     self.assertEqual(1187, tweet_total)
def check_dump_integrity(hostname, port, dbname, username, password, authdb, dump_dir, check_type=COUNT_COLLECTIONS):
	"""Compare a MongoDB database with its on-disk dump and report gaps.

	The collections of ``dbname`` whose names match
	``DB_RELEVANT_COLLECTIONS_REGEX`` are compared with the files found in
	``<dump_dir>/<dbname>`` whose names match
	``DUMP_RELEVANT_COLLECTIONS_REGEX``.  Every message is sent to the
	``logging`` module and printed to stdout; nothing is returned.

	Args:
		hostname: MongoDB host passed to ``pymongo.MongoClient``.
		port: MongoDB port; converted with ``int()``.
		dbname: Name of the database, also the name of the dump
			sub-directory inside ``dump_dir``.
		username: User to authenticate as (skipped when falsy).
		password: Password for ``username`` (skipped when falsy).
		authdb: Database against which the credentials are checked.
		dump_dir: Directory containing one sub-directory per dumped
			database; a trailing ``/`` is optional.
		check_type: ``COUNT_COLLECTIONS`` compares the number of
			collections, ``COUNT_DOCUMENTS`` compares document counts.
			Any other value produces no report.
	"""
	mongo = pymongo.MongoClient(hostname, int(port))
	try:
		if username and password:
			mongo[authdb].authenticate(username, password)
		db = mongo[dbname]

		# Relevant collections on the database side.
		db_relevant_collections = _relevant_names(DB_RELEVANT_COLLECTIONS_REGEX, db.collection_names())

		# Relevant collections on the dump side.
		dump_exists = True
		dump_path = dump_dir + dbname if dump_dir[-1:] == '/' else dump_dir + '/' + dbname
		dump_collections = []
		try:
			dump_collections = os.listdir(dump_path)
		except OSError as e:
			# errno 2 is ENOENT: the dump directory is simply not there,
			# which is reported below instead of being treated as an error.
			if e.errno == 2:
				dump_exists = False
			else:
				logging.error(e)
				print(e)
		dump_relevant_collections = _relevant_names(DUMP_RELEVANT_COLLECTIONS_REGEX, dump_collections)

		# Find out the format of the dump (.json or .bson) from its first
		# file.  A missing or empty dump keeps the 'bson' default instead
		# of raising IndexError, so the "DOES NOT EXIST" report below is
		# actually reached.
		dump_format = 'bson'
		if dump_relevant_collections and dump_relevant_collections[0].partition('.')[2] == 'json':
			dump_format = 'json'

		if check_type == COUNT_COLLECTIONS:
			_report_collection_counts(dbname, dump_exists, dump_format, db_relevant_collections, dump_relevant_collections)
		elif check_type == COUNT_DOCUMENTS:
			_report_document_counts(db, dbname, dump_exists, dump_path, dump_format, db_relevant_collections, dump_relevant_collections)
	finally:
		# Release the connection even when a check raises.
		mongo.close()


def _by_length_then_name(names):
	"""Return ``names`` sorted by length, ties broken alphabetically."""
	return sorted(names, key=lambda name: (len(name), name))


def _relevant_names(regex, names):
	"""Return group 1 of every name matched by ``regex``, sorted."""
	matches = (regex.search(name) for name in names)
	return _by_length_then_name(match.group(1) for match in matches if match)


def _log_and_print(message):
	"""Send ``message`` to the log at INFO level and to stdout."""
	logging.info(message)
	print(message)


def _percentage(part, whole):
	"""Return ``part`` as a float percentage of ``whole`` (0.0 if empty)."""
	if whole == 0:
		return 0.0
	# 100.0 forces true division on Python 2 as well.
	return 100.0 * part / whole


def _report_collection_counts(dbname, dump_exists, dump_format, db_names, dump_names):
	"""Report whether the dump holds as many collections as the database."""
	num_collections_in_db = len(db_names)
	num_collections_dumped = len(dump_names)

	log_output = 'DUMP FOR {} '.format(dbname)
	if not dump_exists:
		log_output += 'DOES NOT EXIST. '
	elif num_collections_dumped < num_collections_in_db:
		log_output += 'IS MISSING COLLECTIONS. '
	elif num_collections_dumped > num_collections_in_db:
		log_output += 'HAS TOO MANY COLLECTIONS. '
	else:
		log_output += 'IS OK ON COLLECTIONS. '
	log_output += 'Number of collections in database: {}, Number of collections dumped: {}'.format(num_collections_in_db, num_collections_dumped)
	_log_and_print('\n' + log_output)

	# List the collections that never made it into the dump.
	if dump_exists and num_collections_dumped < num_collections_in_db:
		dumped = {name.split('.' + dump_format, 1)[0] for name in dump_names}
		missing_collections = _by_length_then_name(coll for coll in db_names if coll not in dumped)
		_log_and_print('\n' + 'Missing Collections: {}'.format(missing_collections))


def _report_document_counts(db, dbname, dump_exists, dump_path, dump_format, db_names, dump_names):
	"""Report per-collection and total document counts, db versus dump."""
	_log_and_print('\n' + 'Counting number of documents in {} database'.format(dbname))

	total_documents_in_db = 0
	db_collection_doc_counts = {}
	for coll in db_names:
		num_docs_in_coll = db[coll].count()
		total_documents_in_db += num_docs_in_coll
		db_collection_doc_counts[coll] = num_docs_in_coll
		_log_and_print("Database {} {} document count: {}".format(dbname, coll, num_docs_in_coll))

	_log_and_print('\n' + 'Counting number of documents in {} dump'.format(dbname))

	total_documents_dumped = 0
	dump_collection_doc_counts = {}
	for coll in dump_names:
		collection = SmappCollection(dump_format, dump_path + '/' + coll)
		num_docs_in_coll = collection.count_tweets()
		total_documents_dumped += num_docs_in_coll

		coll_name = coll.split('.' + dump_format, 1)[0]
		dump_collection_doc_counts[coll_name] = num_docs_in_coll

		# A dumped collection that is absent from the database counts as
		# having zero documents there rather than raising KeyError.
		num_docs_in_db_coll = db_collection_doc_counts.get(coll_name, 0)
		num_docs_missing = num_docs_in_db_coll - num_docs_in_coll
		percentage_docs_missing = _percentage(num_docs_missing, num_docs_in_db_coll)

		# The log gets the bare count; stdout also shows what is missing.
		logging.info("Dump {} {} document count: {}".format(dbname, coll, num_docs_in_coll))
		print("Dump {0} {1} document count: {2} (Missing {3}, {4:.2f}%)".format(dbname, coll, num_docs_in_coll, num_docs_missing, percentage_docs_missing))

	total_docs_missing = total_documents_in_db - total_documents_dumped
	percentage_total_docs_missing = _percentage(total_docs_missing, total_documents_in_db)

	log_output = 'DUMP FOR {} '.format(dbname)
	if not dump_exists:
		log_output += 'DOES NOT EXIST. '
	elif total_documents_dumped < total_documents_in_db:
		log_output += 'IS MISSING DOCUMENTS. '
	elif total_documents_dumped > total_documents_in_db:
		log_output += 'HAS TOO MANY DOCUMENTS. '
	else:
		log_output += 'IS OK ON DOCUMENTS. '
	log_output += 'Total documents in database: {0}, Total documents dumped: {1} (Missing {2}, {3:.2f}%)'.format(total_documents_in_db, total_documents_dumped, total_docs_missing, percentage_total_docs_missing)
	_log_and_print('\n' + log_output)

	# List every database collection whose dump count differs or is absent.
	if dump_exists and total_documents_dumped < total_documents_in_db:
		collections_missing_docs = _by_length_then_name(coll for coll, count in db_collection_doc_counts.items() if dump_collection_doc_counts.get(coll) != count)
		_log_and_print('\n' + 'Collections Missing Documents: {}'.format(collections_missing_docs))