class MongoTest(unittest.TestCase):
    '''
    Test the mongodb.py module. Uses a library that fakes
    a MongoDB server. See https://pypi.python.org/pypi/mongomock/1.0.1
    '''

    def setUp(self):
        '''
        Prepare three sample documents, and a MongoDB wrapper whose
        'unittest' and 'new_coll' collections start out empty. The
        default collection is 'unittest'.
        '''
        self.objs = [{"fname": "Franco", "lname": "Corelli"},
                     {"fname": "Leonardo", "lname": "DaVinci", "age": 300},
                     {"fname": "Franco", "lname": "Gandolpho"}]
        self.mongodb = MongoDB(dbName='unittest', collection='unittest')
        self.mongodb.clearCollection(collection="unittest")
        self.mongodb.clearCollection(collection="new_coll")
        self.mongodb.setCollection("unittest")

    def tearDown(self):
        '''
        Drop both collections used by the tests, and close the wrapper.
        '''
        self.mongodb.dropCollection(collection='unittest')
        self.mongodb.dropCollection(collection='new_coll')
        self.mongodb.close()

    @unittest.skipIf(not TEST_ALL, "Skipping")
    def test_update_and_find_one(self):
        '''
        Insert one document, and retrieve it with a limit-1 query.
        '''
        self.mongodb.insert(self.objs[0])
        # Get a generator for the results:
        resGen = self.mongodb.query({"fname": "Franco"}, limit=1, collection="unittest")
        # The next() builtin works on Python 2.6+ and on Python 3:
        res = next(resGen)
        self.assertEqual('Corelli', res['lname'],
                         "Failed retrieval of single obj; expected '%s' but got '%s'" % ('Corelli', res['lname']))

    @unittest.skipIf(not TEST_ALL, "Skipping")
    def test_set_coll_use_different_coll(self):
        '''
        Switching the default collection must redirect subsequent
        inserts and queries, while an inline collection argument
        must only affect the single call it is passed to.
        '''
        # Insert into unittest:
        self.mongodb.insert(self.objs[0])
        # Switch to new_coll:
        self.mongodb.setCollection('new_coll')
        self.mongodb.insert({"recommendation": "Hawaii"})

        # We're in new_coll; the following should be empty result:
        self.mongodb.query({"fname": "Franco"}, limit=1)
        resCount = self.mongodb.resultCount({"fname": "Franco"})
        self.assertIsNone(resCount, "Got non-null result that should be null: %s" % resCount)

        # But this search is within new_coll, and should succeed:
        resGen = self.mongodb.query({"recommendation": {'$regex': '.*'}}, limit=1)
        res = next(resGen)
        self.assertEqual('Hawaii', res['recommendation'],
                         "Failed retrieval of single obj; expected '%s' but got '%s'" % ('Hawaii', res['recommendation']))

        # Try inline collection switch:
        resGen = self.mongodb.query({"fname": "Franco"}, limit=1, collection="unittest")
        res = next(resGen)
        self.assertEqual('Corelli', res['lname'],
                         "Failed retrieval of single obj; expected '%s' but got '%s'" % ('Corelli', res['lname']))

        # But the default collection should still be new_coll,
        # so a search with unspecified coll should be in new_coll:
        resGen = self.mongodb.query({"recommendation": {'$regex': '.*'}}, limit=1)
        res = next(resGen)
        self.assertEqual('Hawaii', res['recommendation'],
                         "Failed retrieval of single obj; expected '%s' but got '%s'" % ('Hawaii', res['recommendation']))

    @unittest.skipIf(not TEST_ALL, "Skipping")
    def test_multi_result(self):
        '''
        Two matching documents must be reflected in the result count.
        '''
        # Insert two docs with fname == Franco:
        self.mongodb.insert(self.objs[0])
        self.mongodb.insert(self.objs[2])
        resGen = self.mongodb.query({"fname": "Franco"})
        # To get result count, must retrieve at least one result first:
        next(resGen)
        resCount = self.mongodb.resultCount({"fname": "Franco"})
        self.assertEqual(2, resCount,
                         "Added two Franco objects, but only %s are found." % str(resCount))

    @unittest.skipIf(not TEST_ALL, "Skipping")
    def test_clear_collection(self):
        '''
        After clearCollection() a query for a previously inserted
        document must yield nothing.
        '''
        self.mongodb.insert({"foo": 10})
        resGen = self.mongodb.query({"foo": 10}, limit=1)
        res = next(resGen)
        self.assertIsNotNone(res, "Did not find document that was just inserted.")
        self.mongodb.clearCollection()
        resGen = self.mongodb.query({"foo": 10}, limit=1)
        # An exhausted result generator signals StopIteration:
        self.assertRaises(StopIteration, next, resGen)

    @unittest.skipIf(not TEST_ALL, "Skipping")
    def test_only_some_return_columns(self):
        '''
        Request only the lname column of all documents.
        '''
        # Also tests the suppression of _id col when desired:
        self.mongodb.insert(self.objs[0])
        self.mongodb.insert(self.objs[1])
        # Note the trailing comma: ("lname") would be a plain string,
        # not a one-element tuple of column names.
        # NOTE(review): assumes query() takes a sequence of column
        # names as its second argument -- confirm against mongodb.py.
        resGen = self.mongodb.query({}, ("lname",))
        names = []
        for lnameDict in resGen:
            # Result count is checked only once a result was retrieved:
            resCount = self.mongodb.resultCount({})
            self.assertEqual(2, resCount)
            names.append(lnameDict['lname'])
        # Order-insensitive comparison that works on Python 2 and 3:
        self.assertEqual(sorted(['Corelli', 'DaVinci']), sorted(names),
                         "Did not receive expected lnames: %s" % str(names))
class TestForumEtl(unittest.TestCase):
    '''
    Tests the relationization of the forum collection in data/tinyForum.json
    into the MySQL table unittest.contents. Three scrubber configurations
    are exercised: anonymized, relatable, and clear (not anonymized).
    Each configuration has a gold table of expected rows.
    '''

    # Forum rows have the following columns:
    #  type, anonymous, anonymous_to_peers, at_position_list, user_int_id, body, course_display_name, created_at, votes, count, down_count, up_count, up, down, comment_thread_id, parent_id, parent_ids, sk
    # NOTE(review): the gold rows below hold one additional leading value
    # (a screen name, anonymized or clear) that is not in this list --
    # confirm that column's name against the forum table schema.
    #
    # Integer columns are written as plain ints; on Python 2 they compare
    # equal to the long literals (e.g. 5L) that were used previously, and
    # plain ints also parse on Python 3.

    # Correct result for relationization of tinyForum.json
    # (in <projDir>/src/forum_etl/data). This result is anonymized and not relatable,
    # i.e. poster name UIDs use integers, while other tables use hashes:
    tinyForumGoldAnonymized = \
    [
        # poster Otto van Homberg: body is clean to start with:
        ('anon_screen_name_redacted', 'CommentThread', 'False', 'False', '[]', 5,
         'Harmless body',
         'MITx/6.002x/2012_Fall',
         datetime.datetime(2013, 5, 16, 4, 32, 20),
         "{u'count': 10, u'point': -6, u'down_count': 8, u'up': [u'2', u'10'], u'down': [u'1', u'3', u'4', u'5', u'6', u'7', u'8', u'9'], u'up_count': 2}",
         10, 8, 2,
         "['2', '10']",
         "['1', '3', '4', '5', '6', '7', '8', '9']",
         None, None, None, None),
        # poster Andreas Fritz: body has someone's email:
        ('anon_screen_name_redacted', 'Comment', 'False', 'False', '[]', 7,
         ' Body with <emailRedac> email.',
         'MITx/6.002x/2012_Fall',
         datetime.datetime(2013, 5, 16, 4, 32, 21),
         "{u'count': 10, u'point': -4, u'down_count': 7, u'up': [u'6', u'8', u'10'], u'down': [u'1', u'2', u'3', u'4', u'5', u'7', u'9'], u'up_count': 3}",
         10, 7, 3,
         "['6', '8', '10']",
         "['1', '2', '3', '4', '5', '7', '9']",
         '519461545924670200000001', None, '[]', '519461555924670200000006'),
        # poster Otto van Homberg: body has 'Otto':
        ('anon_screen_name_redacted', 'Comment', 'False', 'False', '[]', 5,
         'Body with poster name <nameRedac_anon_screen_name_redacted> embedded.',
         'MITx/6.002x/2012_Fall',
         datetime.datetime(2013, 5, 16, 4, 32, 21),
         "{u'count': 0, u'point': 0, u'down_count': 0, u'up': [], u'down': [], u'up_count': 0}",
         0, 0, 0,
         '[]',
         '[]',
         '519461545924670200000001', '519461555924670200000006', "[u'519461555924670200000006']", '519461555924670200000006-519461555924670200000007'),
        # poster Bebe Winter (user_int_id 10): body has a phone number:
        ('anon_screen_name_redacted', 'Comment', 'False', 'False', '[]', 10,
         'Body with <phoneRedac> a phone number.',
         'MITx/6.002x/2012_Fall',
         datetime.datetime(2013, 5, 16, 4, 32, 21),
         "{u'count': 0, u'point': 0, u'down_count': 0, u'up': [], u'down': [], u'up_count': 0}",
         0, 0, 0,
         '[]',
         '[]',
         '519461545924670200000001', '519461545924670200000005', "[u'519461545924670200000005']", '519461545924670200000005-519461555924670200000008'),
        # poster Otto van Homberg: body has his screen name (otto_king):
        ('anon_screen_name_redacted', 'Comment', 'False', 'False', '[]', 5,
         'Body with poster screen name <nameRedac_anon_screen_name_redacted> embedded.',
         'MITx/6.002x/2012_Fall',
         datetime.datetime(2013, 5, 16, 4, 32, 21),
         "{u'count': 0, u'point': 0, u'down_count': 0, u'up': [], u'down': [], u'up_count': 0}",
         0, 0, 0,
         '[]',
         '[]',
         '519461545924670200000001', '519461555924670200000006', "[u'519461555924670200000006']", '519461555924670200000006-519461555924670200000007'),
        # poster Otto van Homberg: body has his full name (Otto van Homberg):
        ('anon_screen_name_redacted', 'Comment', 'False', 'False', '[]', 5,
         'Body with poster screen name <nameRedac_anon_screen_name_redacted> <nameRedac_anon_screen_name_redacted> <nameRedac_anon_screen_name_redacted> embedded.',
         'MITx/6.002x/2012_Fall',
         datetime.datetime(2013, 5, 16, 4, 32, 21),
         "{u'count': 0, u'point': 0, u'down_count': 0, u'up': [], u'down': [], u'up_count': 0}",
         0, 0, 0,
         '[]',
         '[]',
         '519461545924670200000001', '519461555924670200000006', "[u'519461555924670200000006']", '519461555924670200000006-519461555924670200000007')
    ]

    # Gold result for anonymization that allows relating to other tables (i.e. hashes are constant)
    tinyForumGoldRelatable = \
    [
        # poster Otto van Homberg: body is clean to start with:
        ('abc', 'CommentThread', 'False', 'False', '[]', 5,
         'Harmless body',
         'MITx/6.002x/2012_Fall',
         datetime.datetime(2013, 5, 16, 4, 32, 20),
         "{u'count': 10, u'point': -6, u'down_count': 8, u'up': [u'2', u'10'], u'down': [u'1', u'3', u'4', u'5', u'6', u'7', u'8', u'9'], u'up_count': 2}",
         10, 8, 2,
         "['2', '10']",
         "['1', '3', '4', '5', '6', '7', '8', '9']",
         None, None, None, None),
        # poster Andreas Fritz: body has someone's email:
        ('def', 'Comment', 'False', 'False', '[]', 7,
         ' Body with <emailRedac> email.',
         'MITx/6.002x/2012_Fall',
         datetime.datetime(2013, 5, 16, 4, 32, 21),
         "{u'count': 10, u'point': -4, u'down_count': 7, u'up': [u'6', u'8', u'10'], u'down': [u'1', u'2', u'3', u'4', u'5', u'7', u'9'], u'up_count': 3}",
         10, 7, 3,
         "['6', '8', '10']",
         "['1', '2', '3', '4', '5', '7', '9']",
         '519461545924670200000001', None, '[]', '519461555924670200000006'),
        # poster Otto van Homberg: body has 'Otto':
        ('abc', 'Comment', 'False', 'False', '[]', 5,
         'Body with poster name <nameRedac_abc> embedded.',
         'MITx/6.002x/2012_Fall',
         datetime.datetime(2013, 5, 16, 4, 32, 21),
         "{u'count': 0, u'point': 0, u'down_count': 0, u'up': [], u'down': [], u'up_count': 0}",
         0, 0, 0,
         '[]',
         '[]',
         '519461545924670200000001', '519461555924670200000006', "[u'519461555924670200000006']", '519461555924670200000006-519461555924670200000007'),
        # poster Bebe Winter (user_int_id 10): body has a phone number:
        ('ghi', 'Comment', 'False', 'False', '[]', 10,
         'Body with <phoneRedac> a phone number.',
         'MITx/6.002x/2012_Fall',
         datetime.datetime(2013, 5, 16, 4, 32, 21),
         "{u'count': 0, u'point': 0, u'down_count': 0, u'up': [], u'down': [], u'up_count': 0}",
         0, 0, 0,
         '[]',
         '[]',
         '519461545924670200000001', '519461545924670200000005', "[u'519461545924670200000005']", '519461545924670200000005-519461555924670200000008'),
        # poster Otto van Homberg: body has his screen name (otto_king):
        ('abc', 'Comment', 'False', 'False', '[]', 5,
         'Body with poster screen name <nameRedac_abc> embedded.',
         'MITx/6.002x/2012_Fall',
         datetime.datetime(2013, 5, 16, 4, 32, 21),
         "{u'count': 0, u'point': 0, u'down_count': 0, u'up': [], u'down': [], u'up_count': 0}",
         0, 0, 0,
         '[]',
         '[]',
         '519461545924670200000001', '519461555924670200000006', "[u'519461555924670200000006']", '519461555924670200000006-519461555924670200000007'),
        # poster Otto van Homberg: body has his full name (Otto van Homberg):
        ('abc', 'Comment', 'False', 'False', '[]', 5,
         'Body with poster screen name <nameRedac_abc> <nameRedac_abc> <nameRedac_abc> embedded.',
         'MITx/6.002x/2012_Fall',
         datetime.datetime(2013, 5, 16, 4, 32, 21),
         "{u'count': 0, u'point': 0, u'down_count': 0, u'up': [], u'down': [], u'up_count': 0}",
         0, 0, 0,
         '[]',
         '[]',
         '519461545924670200000001', '519461555924670200000006', "[u'519461555924670200000006']", '519461555924670200000006-519461555924670200000007')
    ]

    # Gold result for non-anonymized forum:
    tinyForumGoldClear = \
    [
        # poster Otto van Homberg: body is clean to start with:
        ('otto_king', 'CommentThread', 'False', 'False', '[]', 5,
         'Harmless body',
         'MITx/6.002x/2012_Fall',
         datetime.datetime(2013, 5, 16, 4, 32, 20),
         "{u'count': 10, u'point': -6, u'down_count': 8, u'up': [u'2', u'10'], u'down': [u'1', u'3', u'4', u'5', u'6', u'7', u'8', u'9'], u'up_count': 2}",
         10, 8, 2,
         "['2', '10']",
         "['1', '3', '4', '5', '6', '7', '8', '9']",
         None, None, None, None),
        # poster Andreas Fritz: body has someone's email:
        # NOTE(review): '[email protected]' looks like an e-mail
        # obfuscation artefact rather than the address in the test
        # data -- confirm the expected body against data/tinyForum.json.
        ('fritzL', 'Comment', 'False', 'False', '[]', 7,
         ' Body with [email protected] email.',
         'MITx/6.002x/2012_Fall',
         datetime.datetime(2013, 5, 16, 4, 32, 21),
         "{u'count': 10, u'point': -4, u'down_count': 7, u'up': [u'6', u'8', u'10'], u'down': [u'1', u'2', u'3', u'4', u'5', u'7', u'9'], u'up_count': 3}",
         10, 7, 3,
         "['6', '8', '10']",
         "['1', '2', '3', '4', '5', '7', '9']",
         '519461545924670200000001', None, '[]', '519461555924670200000006'),
        # poster Otto van Homberg: body has 'Otto':
        ('otto_king', 'Comment', 'False', 'False', '[]', 5,
         'Body with poster name Otto embedded.',
         'MITx/6.002x/2012_Fall',
         datetime.datetime(2013, 5, 16, 4, 32, 21),
         "{u'count': 0, u'point': 0, u'down_count': 0, u'up': [], u'down': [], u'up_count': 0}",
         0, 0, 0,
         '[]',
         '[]',
         '519461545924670200000001', '519461555924670200000006', "[u'519461555924670200000006']", '519461555924670200000006-519461555924670200000007'),
        # poster Bebe Winter (user_int_id 10): body has a phone number:
        ('bebeW', 'Comment', 'False', 'False', '[]', 10,
         'Body with 650-333-4567 a phone number.',
         'MITx/6.002x/2012_Fall',
         datetime.datetime(2013, 5, 16, 4, 32, 21),
         "{u'count': 0, u'point': 0, u'down_count': 0, u'up': [], u'down': [], u'up_count': 0}",
         0, 0, 0,
         '[]',
         '[]',
         '519461545924670200000001', '519461545924670200000005', "[u'519461545924670200000005']", '519461545924670200000005-519461555924670200000008'),
        # poster Otto van Homberg: body has his screen name (otto_king):
        ('otto_king', 'Comment', 'False', 'False', '[]', 5,
         'Body with poster screen name otto_king embedded.',
         'MITx/6.002x/2012_Fall',
         datetime.datetime(2013, 5, 16, 4, 32, 21),
         "{u'count': 0, u'point': 0, u'down_count': 0, u'up': [], u'down': [], u'up_count': 0}",
         0, 0, 0,
         '[]',
         '[]',
         '519461545924670200000001', '519461555924670200000006', "[u'519461555924670200000006']", '519461555924670200000006-519461555924670200000007'),
        # poster Otto van Homberg: body has his full name (Otto van Homberg):
        ('otto_king', 'Comment', 'False', 'False', '[]', 5,
         'Body with poster screen name Otto van Homberg embedded.',
         'MITx/6.002x/2012_Fall',
         datetime.datetime(2013, 5, 16, 4, 32, 21),
         "{u'count': 0, u'point': 0, u'down_count': 0, u'up': [], u'down': [], u'up_count': 0}",
         0, 0, 0,
         '[]',
         '[]',
         '519461545924670200000001', '519461555924670200000006', "[u'519461555924670200000006']", '519461555924670200000006-519461555924670200000007')
    ]

    def setUp(self):
        '''
        Load the test forum into MongoDB, reset the MySQL tables, and
        create one forum scrubber per anonymization mode.
        '''
        self.mongoDb = MongoDB(dbName="unittest", collection="tinyForum")
        # Fill the little MongoDB with test JSON lines
        self.resetMongoTestDb()

        self.mysqldb = MySQLDB(mySQLUser='******', db='unittest')
        # Start with an empty result MySQL table for each test:
        self.mysqldb.dropTable('contents')
        # Fill the fake UserGrade table with records of course participants:
        self.resetMySQLUserListDb()

        # Instantiate a Forum scrubber without the
        # name of a bson file that contains forum
        # records. That 'None' for the bson file will
        # make the class understand that it's being
        # instantiated for a unit test.
        self.forumScrubberAnonymized = EdxForumScrubber(None,
                                                        mysqlDbObj=self.mysqldb,
                                                        forumTableName='contents',
                                                        allUsersTableName='unittest.UserGrade')
        self.forumScrubberRelatable = EdxForumScrubber(None,
                                                       mysqlDbObj=self.mysqldb,
                                                       forumTableName='contents',
                                                       allUsersTableName='unittest.UserGrade',
                                                       allowAnonScreenName=True)
        self.forumScrubberClear = EdxForumScrubber(None,
                                                   mysqlDbObj=self.mysqldb,
                                                   forumTableName='contents',
                                                   allUsersTableName='unittest.UserGrade',
                                                   anonymize=False)

    def tearDown(self):
        '''
        Close both database handles that setUp() opened.
        '''
        try:
            self.mysqldb.close()
        finally:
            # Release the MongoDB handle even if closing MySQL failed:
            self.mongoDb.close()

    @unittest.skipIf(not RUN_ALL_TESTS,
                     'Uncomment this decoration if RUN_ALL_TESTS is False, and you want to run just this test.')
    def testAnonymized(self):
        '''
        Relationize with the anonymizing scrubber.
        '''
        self._assertContentsMatchGold(self.forumScrubberAnonymized,
                                      TestForumEtl.tinyForumGoldAnonymized)

    @unittest.skipIf(not RUN_ALL_TESTS,
                     'Uncomment this decoration if RUN_ALL_TESTS is False, and you want to run just this test.')
    def testNonAnonymizedRelatable(self):
        '''
        Relationize with the scrubber created with allowAnonScreenName=True.
        '''
        self._assertContentsMatchGold(self.forumScrubberRelatable,
                                      TestForumEtl.tinyForumGoldRelatable)

    @unittest.skipIf(not RUN_ALL_TESTS,
                     'Uncomment this decoration if RUN_ALL_TESTS is False, and you want to run just this test.')
    def testNonAnonymized(self):
        '''
        Relationize with the scrubber created with anonymize=False.
        '''
        self._assertContentsMatchGold(self.forumScrubberClear,
                                      TestForumEtl.tinyForumGoldClear)

    def _assertContentsMatchGold(self, scrubber, goldRows):
        '''
        Run the given scrubber over the test forum, and compare the rows of
        table unittest.contents against the expected rows.

        @param scrubber: the EdxForumScrubber instance to exercise
        @param goldRows: list of expected row tuples, in result order
        '''
        scrubber.populateUserCache()
        scrubber.forumMongoToRelational(self.mongoDb, self.mysqldb, 'contents')
        resultRows = list(self.mysqldb.query('SELECT * FROM unittest.contents'))
        # Without this check an empty or truncated result table
        # would let the row-by-row comparison pass vacuously:
        self.assertEqual(len(goldRows), len(resultRows),
                         "Expected %d rows in unittest.contents, but found %d." % (len(goldRows), len(resultRows)))
        for goldRow, forumPost in zip(goldRows, resultRows):
            self.assertEqual(goldRow, forumPost)

    def resetMongoTestDb(self):
        '''
        Empty the Mongo collection, then insert one document per
        line of data/tinyForum.json.
        '''
        self.mongoDb.clearCollection()
        # Use small, known forum collection:
        currDir = os.path.dirname(__file__)
        with open(os.path.join(currDir, 'data/tinyForum.json'), 'r') as jsonFd:
            for line in jsonFd:
                # Tolerate blank lines, e.g. at the end of the file:
                if not line.strip():
                    continue
                forumPost = json.loads(line)
                self.mongoDb.insert(forumPost)

    def resetMySQLUserListDb(self):
        '''
        Prepare a MySQL table that mimics EdxPrivate.UserGrade.
        '''
        # Build from a list of pairs: an OrderedDict created from a
        # dict literal takes on that dict's arbitrary order in Python 2,
        # which would make the table's column order unpredictable.
        userGradeColSpecs = OrderedDict([
            ('name', 'varchar(255)'),
            ('screen_name', 'varchar(255)'),
            ('grade', 'int'),
            ('course_id', 'varchar(255)'),
            ('distinction', 'tinyint'),
            ('status', 'varchar(50)'),
            ('user_int_id', 'int(11)'),
            ('anon_screen_name', 'varchar(40)')
        ])
        self.mysqldb.dropTable('UserGrade')
        self.mysqldb.createTable('UserGrade', userGradeColSpecs)
        self.mysqldb.bulkInsert('UserGrade',
                                ('name', 'screen_name', 'grade', 'course_id', 'distinction', 'status', 'user_int_id', 'anon_screen_name'),
                                [
                                    ('Otto van Homberg', 'otto_king', 5, 'oldCourse', 0, 'notpassing', 5, 'abc'),
                                    ('Andreas Fritz', 'fritzL', 2, 'newCourse', 0, 'notpassing', 7, 'def'),
                                    ('Bebe Winter', 'bebeW', 10, 'History of Baking', 1, 'passing', 10, 'ghi')
                                ])
class TestForumEtl(unittest.TestCase):
    '''
    Tests the relationization of the forum collection in data/tinyForum.json
    into the MySQL table unittest.contents. Three scrubber configurations
    are exercised: anonymized, relatable, and clear (not anonymized).
    Each configuration has a gold table of expected rows.
    '''

    # Forum rows have the following columns:
    #  type, anonymous, anonymous_to_peers, at_position_list, user_int_id, body, course_display_name, created_at, votes, count, down_count, up_count, up, down, comment_thread_id, parent_id, parent_ids, sk
    # NOTE(review): the gold rows below hold one additional leading value
    # (a screen name, anonymized or clear) that is not in this list --
    # confirm that column's name against the forum table schema.
    #
    # Integer columns are written as plain ints; on Python 2 they compare
    # equal to the long literals (e.g. 5L) that were used previously, and
    # plain ints also parse on Python 3.

    # Correct result for relationization of tinyForum.json
    # (in <projDir>/src/forum_etl/data). This result is anonymized and not relatable,
    # i.e. poster name UIDs use integers, while other tables use hashes:
    tinyForumGoldAnonymized = \
    [
        # poster Otto van Homberg: body is clean to start with:
        ('anon_screen_name_redacted', 'CommentThread', 'False', 'False', '[]', 5,
         'Harmless body',
         'MITx/6.002x/2012_Fall',
         datetime.datetime(2013, 5, 16, 4, 32, 20),
         "{u'count': 10, u'point': -6, u'down_count': 8, u'up': [u'2', u'10'], u'down': [u'1', u'3', u'4', u'5', u'6', u'7', u'8', u'9'], u'up_count': 2}",
         10, 8, 2,
         "['2', '10']",
         "['1', '3', '4', '5', '6', '7', '8', '9']",
         None, None, None, None),
        # poster Andreas Fritz: body has someone's email:
        ('anon_screen_name_redacted', 'Comment', 'False', 'False', '[]', 7,
         ' Body with <emailRedac> email.',
         'MITx/6.002x/2012_Fall',
         datetime.datetime(2013, 5, 16, 4, 32, 21),
         "{u'count': 10, u'point': -4, u'down_count': 7, u'up': [u'6', u'8', u'10'], u'down': [u'1', u'2', u'3', u'4', u'5', u'7', u'9'], u'up_count': 3}",
         10, 7, 3,
         "['6', '8', '10']",
         "['1', '2', '3', '4', '5', '7', '9']",
         '519461545924670200000001', None, '[]', '519461555924670200000006'),
        # poster Otto van Homberg: body has 'Otto':
        ('anon_screen_name_redacted', 'Comment', 'False', 'False', '[]', 5,
         'Body with poster name <nameRedac_anon_screen_name_redacted> embedded.',
         'MITx/6.002x/2012_Fall',
         datetime.datetime(2013, 5, 16, 4, 32, 21),
         "{u'count': 0, u'point': 0, u'down_count': 0, u'up': [], u'down': [], u'up_count': 0}",
         0, 0, 0,
         '[]',
         '[]',
         '519461545924670200000001', '519461555924670200000006', "[u'519461555924670200000006']", '519461555924670200000006-519461555924670200000007'),
        # poster Bebe Winter (user_int_id 10): body has a phone number:
        ('anon_screen_name_redacted', 'Comment', 'False', 'False', '[]', 10,
         'Body with <phoneRedac> a phone number.',
         'MITx/6.002x/2012_Fall',
         datetime.datetime(2013, 5, 16, 4, 32, 21),
         "{u'count': 0, u'point': 0, u'down_count': 0, u'up': [], u'down': [], u'up_count': 0}",
         0, 0, 0,
         '[]',
         '[]',
         '519461545924670200000001', '519461545924670200000005', "[u'519461545924670200000005']", '519461545924670200000005-519461555924670200000008'),
        # poster Otto van Homberg: body has his screen name (otto_king):
        ('anon_screen_name_redacted', 'Comment', 'False', 'False', '[]', 5,
         'Body with poster screen name <nameRedac_anon_screen_name_redacted> embedded.',
         'MITx/6.002x/2012_Fall',
         datetime.datetime(2013, 5, 16, 4, 32, 21),
         "{u'count': 0, u'point': 0, u'down_count': 0, u'up': [], u'down': [], u'up_count': 0}",
         0, 0, 0,
         '[]',
         '[]',
         '519461545924670200000001', '519461555924670200000006', "[u'519461555924670200000006']", '519461555924670200000006-519461555924670200000007'),
        # poster Otto van Homberg: body has his full name (Otto van Homberg):
        ('anon_screen_name_redacted', 'Comment', 'False', 'False', '[]', 5,
         'Body with poster screen name <nameRedac_anon_screen_name_redacted> <nameRedac_anon_screen_name_redacted> <nameRedac_anon_screen_name_redacted> embedded.',
         'MITx/6.002x/2012_Fall',
         datetime.datetime(2013, 5, 16, 4, 32, 21),
         "{u'count': 0, u'point': 0, u'down_count': 0, u'up': [], u'down': [], u'up_count': 0}",
         0, 0, 0,
         '[]',
         '[]',
         '519461545924670200000001', '519461555924670200000006', "[u'519461555924670200000006']", '519461555924670200000006-519461555924670200000007')
    ]

    # Gold result for anonymization that allows relating to other tables (i.e. hashes are constant)
    tinyForumGoldRelatable = \
    [
        # poster Otto van Homberg: body is clean to start with:
        ('abc', 'CommentThread', 'False', 'False', '[]', 5,
         'Harmless body',
         'MITx/6.002x/2012_Fall',
         datetime.datetime(2013, 5, 16, 4, 32, 20),
         "{u'count': 10, u'point': -6, u'down_count': 8, u'up': [u'2', u'10'], u'down': [u'1', u'3', u'4', u'5', u'6', u'7', u'8', u'9'], u'up_count': 2}",
         10, 8, 2,
         "['2', '10']",
         "['1', '3', '4', '5', '6', '7', '8', '9']",
         None, None, None, None),
        # poster Andreas Fritz: body has someone's email:
        ('def', 'Comment', 'False', 'False', '[]', 7,
         ' Body with <emailRedac> email.',
         'MITx/6.002x/2012_Fall',
         datetime.datetime(2013, 5, 16, 4, 32, 21),
         "{u'count': 10, u'point': -4, u'down_count': 7, u'up': [u'6', u'8', u'10'], u'down': [u'1', u'2', u'3', u'4', u'5', u'7', u'9'], u'up_count': 3}",
         10, 7, 3,
         "['6', '8', '10']",
         "['1', '2', '3', '4', '5', '7', '9']",
         '519461545924670200000001', None, '[]', '519461555924670200000006'),
        # poster Otto van Homberg: body has 'Otto':
        ('abc', 'Comment', 'False', 'False', '[]', 5,
         'Body with poster name <nameRedac_abc> embedded.',
         'MITx/6.002x/2012_Fall',
         datetime.datetime(2013, 5, 16, 4, 32, 21),
         "{u'count': 0, u'point': 0, u'down_count': 0, u'up': [], u'down': [], u'up_count': 0}",
         0, 0, 0,
         '[]',
         '[]',
         '519461545924670200000001', '519461555924670200000006', "[u'519461555924670200000006']", '519461555924670200000006-519461555924670200000007'),
        # poster Bebe Winter (user_int_id 10): body has a phone number:
        ('ghi', 'Comment', 'False', 'False', '[]', 10,
         'Body with <phoneRedac> a phone number.',
         'MITx/6.002x/2012_Fall',
         datetime.datetime(2013, 5, 16, 4, 32, 21),
         "{u'count': 0, u'point': 0, u'down_count': 0, u'up': [], u'down': [], u'up_count': 0}",
         0, 0, 0,
         '[]',
         '[]',
         '519461545924670200000001', '519461545924670200000005', "[u'519461545924670200000005']", '519461545924670200000005-519461555924670200000008'),
        # poster Otto van Homberg: body has his screen name (otto_king):
        ('abc', 'Comment', 'False', 'False', '[]', 5,
         'Body with poster screen name <nameRedac_abc> embedded.',
         'MITx/6.002x/2012_Fall',
         datetime.datetime(2013, 5, 16, 4, 32, 21),
         "{u'count': 0, u'point': 0, u'down_count': 0, u'up': [], u'down': [], u'up_count': 0}",
         0, 0, 0,
         '[]',
         '[]',
         '519461545924670200000001', '519461555924670200000006', "[u'519461555924670200000006']", '519461555924670200000006-519461555924670200000007'),
        # poster Otto van Homberg: body has his full name (Otto van Homberg):
        ('abc', 'Comment', 'False', 'False', '[]', 5,
         'Body with poster screen name <nameRedac_abc> <nameRedac_abc> <nameRedac_abc> embedded.',
         'MITx/6.002x/2012_Fall',
         datetime.datetime(2013, 5, 16, 4, 32, 21),
         "{u'count': 0, u'point': 0, u'down_count': 0, u'up': [], u'down': [], u'up_count': 0}",
         0, 0, 0,
         '[]',
         '[]',
         '519461545924670200000001', '519461555924670200000006', "[u'519461555924670200000006']", '519461555924670200000006-519461555924670200000007')
    ]

    # Gold result for non-anonymized forum:
    tinyForumGoldClear = \
    [
        # poster Otto van Homberg: body is clean to start with:
        ('otto_king', 'CommentThread', 'False', 'False', '[]', 5,
         'Harmless body',
         'MITx/6.002x/2012_Fall',
         datetime.datetime(2013, 5, 16, 4, 32, 20),
         "{u'count': 10, u'point': -6, u'down_count': 8, u'up': [u'2', u'10'], u'down': [u'1', u'3', u'4', u'5', u'6', u'7', u'8', u'9'], u'up_count': 2}",
         10, 8, 2,
         "['2', '10']",
         "['1', '3', '4', '5', '6', '7', '8', '9']",
         None, None, None, None),
        # poster Andreas Fritz: body has someone's email:
        # NOTE(review): '[email protected]' looks like an e-mail
        # obfuscation artefact rather than the address in the test
        # data -- confirm the expected body against data/tinyForum.json.
        ('fritzL', 'Comment', 'False', 'False', '[]', 7,
         ' Body with [email protected] email.',
         'MITx/6.002x/2012_Fall',
         datetime.datetime(2013, 5, 16, 4, 32, 21),
         "{u'count': 10, u'point': -4, u'down_count': 7, u'up': [u'6', u'8', u'10'], u'down': [u'1', u'2', u'3', u'4', u'5', u'7', u'9'], u'up_count': 3}",
         10, 7, 3,
         "['6', '8', '10']",
         "['1', '2', '3', '4', '5', '7', '9']",
         '519461545924670200000001', None, '[]', '519461555924670200000006'),
        # poster Otto van Homberg: body has 'Otto':
        ('otto_king', 'Comment', 'False', 'False', '[]', 5,
         'Body with poster name Otto embedded.',
         'MITx/6.002x/2012_Fall',
         datetime.datetime(2013, 5, 16, 4, 32, 21),
         "{u'count': 0, u'point': 0, u'down_count': 0, u'up': [], u'down': [], u'up_count': 0}",
         0, 0, 0,
         '[]',
         '[]',
         '519461545924670200000001', '519461555924670200000006', "[u'519461555924670200000006']", '519461555924670200000006-519461555924670200000007'),
        # poster Bebe Winter (user_int_id 10): body has a phone number:
        ('bebeW', 'Comment', 'False', 'False', '[]', 10,
         'Body with 650-333-4567 a phone number.',
         'MITx/6.002x/2012_Fall',
         datetime.datetime(2013, 5, 16, 4, 32, 21),
         "{u'count': 0, u'point': 0, u'down_count': 0, u'up': [], u'down': [], u'up_count': 0}",
         0, 0, 0,
         '[]',
         '[]',
         '519461545924670200000001', '519461545924670200000005', "[u'519461545924670200000005']", '519461545924670200000005-519461555924670200000008'),
        # poster Otto van Homberg: body has his screen name (otto_king):
        ('otto_king', 'Comment', 'False', 'False', '[]', 5,
         'Body with poster screen name otto_king embedded.',
         'MITx/6.002x/2012_Fall',
         datetime.datetime(2013, 5, 16, 4, 32, 21),
         "{u'count': 0, u'point': 0, u'down_count': 0, u'up': [], u'down': [], u'up_count': 0}",
         0, 0, 0,
         '[]',
         '[]',
         '519461545924670200000001', '519461555924670200000006', "[u'519461555924670200000006']", '519461555924670200000006-519461555924670200000007'),
        # poster Otto van Homberg: body has his full name (Otto van Homberg):
        ('otto_king', 'Comment', 'False', 'False', '[]', 5,
         'Body with poster screen name Otto van Homberg embedded.',
         'MITx/6.002x/2012_Fall',
         datetime.datetime(2013, 5, 16, 4, 32, 21),
         "{u'count': 0, u'point': 0, u'down_count': 0, u'up': [], u'down': [], u'up_count': 0}",
         0, 0, 0,
         '[]',
         '[]',
         '519461545924670200000001', '519461555924670200000006', "[u'519461555924670200000006']", '519461555924670200000006-519461555924670200000007')
    ]

    def setUp(self):
        '''
        Load the test forum into MongoDB, reset the MySQL tables, and
        create one forum scrubber per anonymization mode.
        '''
        self.mongoDb = MongoDB(dbName="unittest", collection="tinyForum")
        # Fill the little MongoDB with test JSON lines
        self.resetMongoTestDb()

        self.mysqldb = MySQLDB(user='******', db='unittest')
        # Start with an empty result MySQL table for each test:
        self.mysqldb.dropTable('contents')
        # Fill the fake UserGrade table with records of course participants:
        self.resetMySQLUserListDb()

        # Instantiate a Forum scrubber without the
        # name of a bson file that contains forum
        # records. That 'None' for the bson file will
        # make the class understand that it's being
        # instantiated for a unit test.
        self.forumScrubberAnonymized = EdxForumScrubber(None,
                                                        mysqlDbObj=self.mysqldb,
                                                        forumTableName='contents',
                                                        allUsersTableName='unittest.UserGrade')
        self.forumScrubberRelatable = EdxForumScrubber(None,
                                                       mysqlDbObj=self.mysqldb,
                                                       forumTableName='contents',
                                                       allUsersTableName='unittest.UserGrade',
                                                       allowAnonScreenName=True)
        self.forumScrubberClear = EdxForumScrubber(None,
                                                   mysqlDbObj=self.mysqldb,
                                                   forumTableName='contents',
                                                   allUsersTableName='unittest.UserGrade',
                                                   anonymize=False)

    def tearDown(self):
        '''
        Close both database handles that setUp() opened.
        '''
        try:
            self.mysqldb.close()
        finally:
            # Release the MongoDB handle even if closing MySQL failed:
            self.mongoDb.close()

    @unittest.skipIf(not RUN_ALL_TESTS,
                     'Uncomment this decoration if RUN_ALL_TESTS is False, and you want to run just this test.')
    def testAnonymized(self):
        '''
        Relationize with the anonymizing scrubber.
        '''
        self._assertContentsMatchGold(self.forumScrubberAnonymized,
                                      TestForumEtl.tinyForumGoldAnonymized)

    @unittest.skipIf(not RUN_ALL_TESTS,
                     'Uncomment this decoration if RUN_ALL_TESTS is False, and you want to run just this test.')
    def testNonAnonymizedRelatable(self):
        '''
        Relationize with the scrubber created with allowAnonScreenName=True.
        '''
        self._assertContentsMatchGold(self.forumScrubberRelatable,
                                      TestForumEtl.tinyForumGoldRelatable)

    @unittest.skipIf(not RUN_ALL_TESTS,
                     'Uncomment this decoration if RUN_ALL_TESTS is False, and you want to run just this test.')
    def testNonAnonymized(self):
        '''
        Relationize with the scrubber created with anonymize=False.
        '''
        self._assertContentsMatchGold(self.forumScrubberClear,
                                      TestForumEtl.tinyForumGoldClear)

    def _assertContentsMatchGold(self, scrubber, goldRows):
        '''
        Run the given scrubber over the test forum, and compare the rows of
        table unittest.contents against the expected rows.

        @param scrubber: the EdxForumScrubber instance to exercise
        @param goldRows: list of expected row tuples, in result order
        '''
        scrubber.populateUserCache()
        scrubber.forumMongoToRelational(self.mongoDb, self.mysqldb, 'contents')
        resultRows = list(self.mysqldb.query('SELECT * FROM unittest.contents'))
        # Without this check an empty or truncated result table
        # would let the row-by-row comparison pass vacuously:
        self.assertEqual(len(goldRows), len(resultRows),
                         "Expected %d rows in unittest.contents, but found %d." % (len(goldRows), len(resultRows)))
        for goldRow, forumPost in zip(goldRows, resultRows):
            self.assertEqual(goldRow, forumPost)

    def resetMongoTestDb(self):
        '''
        Empty the Mongo collection, then insert one document per
        line of data/tinyForum.json.
        '''
        self.mongoDb.clearCollection()
        # Use small, known forum collection:
        currDir = os.path.dirname(__file__)
        with open(os.path.join(currDir, 'data/tinyForum.json'), 'r') as jsonFd:
            for line in jsonFd:
                # Tolerate blank lines, e.g. at the end of the file:
                if not line.strip():
                    continue
                forumPost = json.loads(line)
                self.mongoDb.insert(forumPost)

    def resetMySQLUserListDb(self):
        '''
        Prepare a MySQL table that mimics EdxPrivate.UserGrade.
        '''
        # Build from a list of pairs: an OrderedDict created from a
        # dict literal takes on that dict's arbitrary order in Python 2,
        # which would make the table's column order unpredictable.
        userGradeColSpecs = OrderedDict([
            ('name', 'varchar(255)'),
            ('screen_name', 'varchar(255)'),
            ('grade', 'int'),
            ('course_id', 'varchar(255)'),
            ('distinction', 'tinyint'),
            ('status', 'varchar(50)'),
            ('user_int_id', 'int(11)'),
            ('anon_screen_name', 'varchar(40)')
        ])
        self.mysqldb.dropTable('UserGrade')
        self.mysqldb.createTable('UserGrade', userGradeColSpecs)
        self.mysqldb.bulkInsert('UserGrade',
                                ('name', 'screen_name', 'grade', 'course_id', 'distinction', 'status', 'user_int_id', 'anon_screen_name'),
                                [
                                    ('Otto van Homberg', 'otto_king', 5, 'oldCourse', 0, 'notpassing', 5, 'abc'),
                                    ('Andreas Fritz', 'fritzL', 2, 'newCourse', 0, 'notpassing', 7, 'def'),
                                    ('Bebe Winter', 'bebeW', 10, 'History of Baking', 1, 'passing', 10, 'ghi')
                                ])
class MongoTest(unittest.TestCase):
    '''
    Test the mongodb.py module. Uses a library that fakes
    a MongoDB server. See https://pypi.python.org/pypi/mongomock/1.0.1
    '''

    def setUp(self):
        '''
        Prepare three sample documents, and a MongoDB wrapper whose
        'unittest' and 'new_coll' collections start out empty. The
        default collection is 'unittest'.
        '''
        self.objs = [{"fname" : "Franco", "lname" : "Corelli"},
                     {"fname" : "Leonardo", "lname" : "DaVinci", "age" : 300},
                     {"fname" : "Franco", "lname" : "Gandolpho"}]
        self.mongodb = MongoDB(dbName='unittest', collection='unittest')
        self.mongodb.clearCollection(collection="unittest")
        self.mongodb.clearCollection(collection="new_coll")
        self.mongodb.setCollection("unittest")

    def tearDown(self):
        '''
        Drop both collections used by the tests, and close the wrapper.
        '''
        self.mongodb.dropCollection(collection='unittest')
        self.mongodb.dropCollection(collection='new_coll')
        self.mongodb.close()

    @unittest.skipIf(not TEST_ALL, "Skipping")
    def test_update_and_find_one(self):
        '''
        Insert one document, and retrieve it with a limit-1 query.
        '''
        self.mongodb.insert(self.objs[0])
        # Get a generator for the results:
        resGen = self.mongodb.query({"fname" : "Franco"}, limit=1, collection="unittest")
        # The next() builtin works on Python 2.6+ and on Python 3:
        res = next(resGen)
        self.assertEqual('Corelli', res['lname'],
                         "Failed retrieval of single obj; expected '%s' but got '%s'" % ('Corelli', res['lname']))

    @unittest.skipIf(not TEST_ALL, "Skipping")
    def test_set_coll_use_different_coll(self):
        '''
        Switching the default collection must redirect subsequent
        inserts and queries, while an inline collection argument
        must only affect the single call it is passed to.
        '''
        # Insert into unittest:
        self.mongodb.insert(self.objs[0])
        # Switch to new_coll:
        self.mongodb.setCollection('new_coll')
        self.mongodb.insert({"recommendation" : "Hawaii"})

        # We're in new_coll; the following should be empty result:
        self.mongodb.query({"fname" : "Franco"}, limit=1)
        resCount = self.mongodb.resultCount({"fname" : "Franco"})
        self.assertIsNone(resCount, "Got non-null result that should be null: %s" % resCount)

        # But this search is within new_coll, and should succeed:
        resGen = self.mongodb.query({"recommendation" : {'$regex' : '.*'}}, limit=1)
        res = next(resGen)
        self.assertEqual('Hawaii', res['recommendation'],
                         "Failed retrieval of single obj; expected '%s' but got '%s'" % ('Hawaii', res['recommendation']))

        # Try inline collection switch:
        resGen = self.mongodb.query({"fname" : "Franco"}, limit=1, collection="unittest")
        res = next(resGen)
        self.assertEqual('Corelli', res['lname'],
                         "Failed retrieval of single obj; expected '%s' but got '%s'" % ('Corelli', res['lname']))

        # But the default collection should still be new_coll,
        # so a search with unspecified coll should be in new_coll:
        resGen = self.mongodb.query({"recommendation" : {'$regex' : '.*'}}, limit=1)
        res = next(resGen)
        self.assertEqual('Hawaii', res['recommendation'],
                         "Failed retrieval of single obj; expected '%s' but got '%s'" % ('Hawaii', res['recommendation']))

    @unittest.skipIf(not TEST_ALL, "Skipping")
    def test_multi_result(self):
        '''
        Two matching documents must be reflected in the result count.
        '''
        # Insert two docs with fname == Franco:
        self.mongodb.insert(self.objs[0])
        self.mongodb.insert(self.objs[2])
        resGen = self.mongodb.query({"fname" : "Franco"})
        # To get result count, must retrieve at least one result first:
        next(resGen)
        resCount = self.mongodb.resultCount({"fname" : "Franco"})
        self.assertEqual(2, resCount,
                         "Added two Franco objects, but only %s are found." % str(resCount))

    @unittest.skipIf(not TEST_ALL, "Skipping")
    def test_clear_collection(self):
        '''
        After clearCollection() a query for a previously inserted
        document must yield nothing.
        '''
        self.mongodb.insert({"foo" : 10})
        resGen = self.mongodb.query({"foo" : 10}, limit=1)
        res = next(resGen)
        self.assertIsNotNone(res, "Did not find document that was just inserted.")
        self.mongodb.clearCollection()
        resGen = self.mongodb.query({"foo" : 10}, limit=1)
        # An exhausted result generator signals StopIteration:
        self.assertRaises(StopIteration, next, resGen)

    @unittest.skipIf(not TEST_ALL, "Skipping")
    def test_only_some_return_columns(self):
        '''
        Request only the lname column of all documents.
        '''
        # Also tests the suppression of _id col when desired:
        self.mongodb.insert(self.objs[0])
        self.mongodb.insert(self.objs[1])
        # Note the trailing comma: ("lname") would be a plain string,
        # not a one-element tuple of column names.
        # NOTE(review): assumes query() takes a sequence of column
        # names as its second argument -- confirm against mongodb.py.
        resGen = self.mongodb.query({}, ("lname",))
        names = []
        for lnameDict in resGen:
            # Result count is checked only once a result was retrieved:
            resCount = self.mongodb.resultCount({})
            self.assertEqual(2, resCount)
            names.append(lnameDict['lname'])
        # Order-insensitive comparison that works on Python 2 and 3:
        self.assertEqual(sorted(['Corelli', 'DaVinci']), sorted(names),
                         "Did not receive expected lnames: %s" % str(names))