def setUp(self): self.objs = [{"fname" : "Franco", "lname" : "Corelli"}, {"fname" : "Leonardo", "lname" : "DaVinci", "age" : 300}, {"fname" : "Franco", "lname" : "Gandolpho"}] self.mongodb = MongoDB(dbName='unittest', collection='unittest') self.mongodb.clearCollection(collection="unittest") self.mongodb.clearCollection(collection="new_coll") self.mongodb.setCollection("unittest")
def setUp(self): self.mongoDb = MongoDB(dbName="unittest", collection="tinyForum") # Fill the little MongoDB with test JSON lines self.resetMongoTestDb() self.mysqldb = MySQLDB(mySQLUser='******', db='unittest') # Start with an empty result MySQL table for each test: self.mysqldb.dropTable('contents') # Fill the fake UserGrade table with records of course participants: self.resetMySQLUserListDb() # Instantiate a Forum scrubber without the # name of a bson file that contains forum # records. That 'None' for the bson file will # make the class understand that it's being # instantiated for a unit test. self.forumScrubberAnonymized = EdxForumScrubber(None, mysqlDbObj=self.mysqldb, forumTableName='contents', allUsersTableName='unittest.UserGrade') self.forumScrubberRelatable = EdxForumScrubber(None, mysqlDbObj=self.mysqldb, forumTableName='contents', allUsersTableName='unittest.UserGrade', allowAnonScreenName=True) self.forumScrubberClear = EdxForumScrubber(None, mysqlDbObj=self.mysqldb, forumTableName='contents', allUsersTableName='unittest.UserGrade', anonymize=False)
def runConversion(self): ''' Do the actual work. We don't call this method from __init__() so that unittests can create an EdxForumScrubber instance without doing the actual work. Instead, unittests call individual methods. ''' self.populateUserCache(); self.mongo_database_name = 'TmpForum' self.collection_name = 'contents' # Load bson file into Mongodb: self.loadForumIntoMongoDb(self.bsonFileName) self.mongodb = MongoDB(dbName=self.mongo_database_name, collection=self.collection_name) # Anonymize each forum record, and transfer to MySQL db: self.forumMongoToRelational(self.mongodb, self.mydb,'contents' ) self.mydb.close() self.mongodb.close() self.logInfo('Entered %d records into %s' % (self.counter, self.forumDbName + self.forumTableName))
def setUp(self): self.mongoDb = MongoDB(dbName="unittest", collection="tinyForum") # Fill the little MongoDB with test JSON lines self.resetMongoTestDb() self.mysqldb = MySQLDB(user='******', db='unittest') # Start with an empty result MySQL table for each test: self.mysqldb.dropTable('contents') # Fill the fake UserGrade table with records of course participants: self.resetMySQLUserListDb() # Instantiate a Forum scrubber without the # name of a bson file that contains forum # records. That 'None' for the bson file will # make the class understand that it's being # instantiated for a unit test. self.forumScrubberAnonymized = EdxForumScrubber(None, mysqlDbObj=self.mysqldb, forumTableName='contents', allUsersTableName='unittest.UserGrade') self.forumScrubberRelatable = EdxForumScrubber(None, mysqlDbObj=self.mysqldb, forumTableName='contents', allUsersTableName='unittest.UserGrade', allowAnonScreenName=True) self.forumScrubberClear = EdxForumScrubber(None, mysqlDbObj=self.mysqldb, forumTableName='contents', allUsersTableName='unittest.UserGrade', anonymize=False)
def runConversion(self): ''' Do the actual work. We don't call this method from __init__() so that unittests can create an EdxForumScrubber instance without doing the actual work. Instead, unittests call individual methods. ''' self.populateUserCache(); self.mongo_database_name = 'TmpForum' self.collection_name = 'contents' # Load bson file into Mongodb: self.loadForumIntoMongoDb(self.bsonFileName) self.mongodb = MongoDB(dbName=self.mongo_database_name, collection=self.collection_name) # Anonymize each forum record, and transfer to MySQL db: self.forumMongoToRelational(self.mongodb, self.mydb,'contents' ) self.mydb.close() self.mongodb.close() self.logInfo('Entered %d records into %s' % (self.counter, self.forumDbName + '.' + self.forumTableName))
class EdxForumScrubber(object): ''' Given a .bson file of OpenEdX Forum posts, load the file into a MongoDB. Then pull a post at a time, anonymize, and insert a selection of fields into a MySQL db. The MongoDb entries look like this:: { "_id" : ObjectId("51b75a48f359c40a00000028"), "_type" : "Comment", "abuse_flaggers" : [ ], "anonymous" : false, "anonymous_to_peers" : false, "at_position_list" : [ ], "author_id" : "26344", "author_username" : "Minelly48", "body" : "I am Gwen.I am a nursing professor who took statistics many years ago and want to refresh my knowledge.", "comment_thread_id" : ObjectId("51b754e5f359c40a0000001d"), "course_id" : "Medicine/HRP258/Statistics_in_Medicine", "created_at" : ISODate("2013-06-11T17:11:36.831Z"), "endorsed" : false, "historical_abuse_flaggers" : [ ], "parent_ids" : [ ], "updated_at" : ISODate("2013-06-11T17:11:36.831Z"), "visible" : true, "votes" : { "count" : 2, "down" : [ ], "down_count" : 0, "point" : 2, "up" : [ "40325", "20323" ], "up_count" : 2 }, "sk" : "51b75a48f359c40a00000028" } Depending on parameter allowAnonScreenName in the __init__() method, forum entries in the relational database will be associated with the same hash that is used to anonymize other parts of the OpenEdX data. ''' LOG_DIR = '/home/dataman/Data/EdX/NonTransformLogs' # Pattern for email id - strings of alphabets/numbers/dots/hyphens followed # by an @ or at followed by combinations of dot/. followed by the edu/com # also, allow for spaces emailPattern='(.*)\s+([a-zA-Z0-9\(\.\-]+)[@]([a-zA-Z0-9\.]+)(.)(edu|com)\\s*(.*)' #emailPattern='(.*)\\s+([a-zA-Z0-9\\.]+)\\s*(\\(f.*b.*)?(@)\\s*([a-zA-Z0-9\\.\\s;]+)\\s*(\\.)\\s*(edu|com)\\s+(.*)' compiledEmailPattern = re.compile(emailPattern); # Pattern for replacing embedded double quotes in post bodies, # unless they are already escaped w/ a backslash. The # {0,1} means a match if zero or one repetition. It's # needed so that double quotes at the very start of a # string are matched: no preceding character at all: #doublQuoteReplPattern = re.compile(r'[^\\]{0,1}"') doublQuoteReplPattern = re.compile(r'[\\]{0,}"') # Schema of EdxForum.contents: an ordered dict that is # used twice: the table creation MySQL command is constructed # from this dict, and the dict is used to ensure that # all its keys (i.e. future column names) are present # in each MongoDB object. See also createForumTable(). # In createForumTable() either entry anon_screen_name, # or screen_name in the dict below will be deleted, based # on whether we are asked to anonymize or not: forumSchema = OrderedDict({}) forumSchema['forum_post_id'] = "varchar(40) NOT NULL DEFAULT 'unavailable'" forumSchema['anon_screen_name'] = "varchar(40) NOT NULL DEFAULT 'anon_screen_name_redacted'" # This or next deleted based on anonymize yes/no forumSchema['screen_name'] = "varchar(40) NOT NULL DEFAULT 'anon_screen_name_redacted'" # This or prev deleted based on anonymize yes/no forumSchema['type'] = "varchar(20) NOT NULL" forumSchema['anonymous'] = "varchar(10) NOT NULL" forumSchema['anonymous_to_peers'] = "varchar(10) NOT NULL" forumSchema['at_position_list'] = "varchar(200) NOT NULL" forumSchema['forum_uid'] = "varchar(40) NOT NULL" forumSchema['body'] = "TEXT NOT NULL" #"varchar(2500) NOT NULL" forumSchema['course_display_name'] = "varchar(100) NOT NULL" forumSchema['created_at'] = "datetime NOT NULL" forumSchema['votes'] = "TEXT NOT NULL" # "varchar(200) NOT NULL" forumSchema['count'] = "int(11) NOT NULL" forumSchema['down_count'] = "int(11) NOT NULL" forumSchema['up_count'] = "int(11) NOT NULL" forumSchema['up'] = "varchar(200) DEFAULT NULL" forumSchema['down'] = "varchar(200) DEFAULT NULL" forumSchema['comment_thread_id'] = "varchar(255) DEFAULT NULL" forumSchema['parent_id'] = "varchar(255) DEFAULT NULL" forumSchema['parent_ids'] = "varchar(255) DEFAULT NULL" forumSchema['sk'] = "varchar(255) DEFAULT NULL" forumSchema['confusion'] = "varchar(20) NOT NULL DEFAULT ''" forumSchema['happiness'] = "varchar(20) NOT NULL DEFAULT ''" def __init__(self, bsonFileName, mysqlDbObj=None, forumTableName='contents', allUsersTableName='EdxPrivate.UserGrade', anonymize=True, allowAnonScreenName=False): ''' Given a .bson file containing OpenEdX Forum entries, anonymize the entries (if desired), and place them into a MySQL table. :param bsonFileName: full path the .bson table. Set to None if instantiating for unit testing. :type bsonFileName: String :param mysqlDbObj: a pymysql_utils.MySQLDB object where anonymized entries are to be placed. If None, a new such object is created into MySQL db 'EdxForum' :type mysqlDbObj: MySQLDB :param forumTableName: name of table into which anonymized Forum entries are to be placed :type forumTableName: String :param allUsersTable: fully qualified name of table listing all in-the-clear mySQLUser names of users who post to the Forum. Used to redact their names from their own posts. :type allUsersTable: String :param anonymize: If true, Forum post entries in the MySQL table will be anonymized :type anonymize: bool :param allow_anon_screen_name: if True, then occurrences of poster's name in post bodies are replaced by <redacName_<anon_screen_name>>, where anon_screen_name is the hash used in other tables of the OpenEdX data. :type allow_anon_screen_name: Bool ''' self.bsonFileName = bsonFileName self.forumTableName = forumTableName self.forumDbName = 'EdxForum' self.allUsersTableName = allUsersTableName self.anonymize = anonymize self.allowAnonScreenName = allowAnonScreenName # If not unittest, but regular run, then mysqlDbObj is None if mysqlDbObj is None: self.mysql_passwd = self.getMySQLPasswd() self.mysql_dbhost ='localhost' self.mysql_user = getpass.getuser() # mySQLUser that started this process self.mydb = MySQLDB(user=self.mysql_user, passwd=self.mysql_passwd, db=self.forumDbName) else: self.mydb = mysqlDbObj self.counter=0 self.userCache = {} self.userSet = set() warnings.filterwarnings('ignore', category=MySQLdb.Warning) self.setupLogging() self.prepDatabase() #******mysqldb.commit(); #******logging.info('commit completed!') def runConversion(self): ''' Do the actual work. We don't call this method from __init__() so that unittests can create an EdxForumScrubber instance without doing the actual work. Instead, unittests call individual methods. ''' self.populateUserCache(); self.mongo_database_name = 'TmpForum' self.collection_name = 'contents' # Load bson file into Mongodb: self.loadForumIntoMongoDb(self.bsonFileName) self.mongodb = MongoDB(dbName=self.mongo_database_name, collection=self.collection_name) # Anonymize each forum record, and transfer to MySQL db: self.forumMongoToRelational(self.mongodb, self.mydb,'contents' ) self.mydb.close() self.mongodb.close() self.logInfo('Entered %d records into %s' % (self.counter, self.forumDbName + '.' + self.forumTableName)) def loadForumIntoMongoDb(self, bsonFilename): mongoclient = MongoClient(); db = mongoclient[self.mongo_database_name]; # Get collection object: collection = db[self.collection_name]; # Clear out any old forum entries: self.logInfo('Preparing to delete the collection ') collection.remove() self.logInfo('Deleting mongo collection completed. Will now attempt a mongo restore') self.logInfo('Spawning subprocess to execute mongo restore') with open(self.logFilePath,'w') as outfile: ret = subprocess.call( ['mongorestore', '--drop', '--db', self.mongo_database_name, '--collection', self.collection_name, bsonFilename], stdout=outfile, stderr=outfile) self.logDebug('Return value from mongorestore is %s' % (ret)) objCount = subprocess.check_output( ['mongo', '--quiet', '--eval', 'printjson(db.contents.count())', self.mongo_database_name, ], stderr=outfile) self.numMongoItems = objCount self.logInfo('Available Forum posts %s' % objCount) def forumMongoToRelational(self, mongodb, mysqlDbObj, mysqlTable): ''' Given a pymongo collection object in which Forum posts are stored, and a MySQL db object and table name, anonymize each mongo record, and insert it into the MySQL table. :param collection: collection object obtained via a mangoclient object :type collection: Collection :param mysqlDbObj: wrapper to MySQL db. See pymysql_utils.py :type mysqlDbObj: MYSQLDB :param mysqlTable: name of table where posts are to be deposited. Example: 'contents'. :type mysqlTable: String ''' #command = 'mongorestore %s -db %s -mongoForumRec %s'%(self.bson_filename,self.mongo_database_name,self.collection_name) #print command self.logInfo('Will start inserting from mongo collection to MySQL') for mongoForumRec in mongodb.query({}): mongoRecordObj = MongoRecord(mongoForumRec) try: # Check whether 'up' can be converted to a list list(mongoRecordObj['up']) except Exception as e: self.logInfo("Error in conversion of 'up' field to a list (setting cell to -1):" + `e`) mongoRecordObj['up'] ='-1' # Make sure the MongoDB object has all fields that will # be needed for the forum schema: self.ensureSchemaAdherence(mongoRecordObj) self.insert_content_record(mysqlDbObj, mysqlTable, mongoRecordObj); def prepDatabase(self): ''' Declare variables and execute statements preparing the database to configure options - e.g.: setting char set to utf, connection type to utf truncating the already existing table. ''' try: self.logDebug("Setting and assigning char set for mysqld. will truncate old values") self.mydb.execute('SET NAMES utf8;'); self.mydb.execute('SET CHARACTER SET utf8;'); self.mydb.execute('SET character_set_connection=utf8;'); # Compose fully qualified table name from the db name to # which self.mydb is connected, and the forum table name # that was established in __init__(): fullTblName = self.mydb.dbName() + '.' + self.forumTableName # Clear old forum data out of the table: try: self.mydb.dropTable(fullTblName) # Create MySQL table for the posts. If we are to # anonymize, the poster name column will be 'screen_name', # else it will be 'anon_screen_name': self.createForumTable(self.anonymize) self.logDebug("setting and assigning char set complete. Truncation succeeded") except ValueError as e: self.logDebug("Failed either to set character codes, or to create forum table %s: %s" % (fullTblName, `e`)) except MySQLdb.Error,e: self.logInfo("MySql Error exiting %d: %s" % (e.args[0],e.args[1])) # print e sys.exit(1)
class MongoTest(unittest.TestCase): ''' Test the mongodb.py module. Uses a library that fakes a MongoDB server. See https://pypi.python.org/pypi/mongomock/1.0.1 ''' def setUp(self): self.objs = [{ "fname": "Franco", "lname": "Corelli" }, { "fname": "Leonardo", "lname": "DaVinci", "age": 300 }, { "fname": "Franco", "lname": "Gandolpho" }] self.mongodb = MongoDB(dbName='unittest', collection='unittest') self.mongodb.clearCollection(collection="unittest") self.mongodb.clearCollection(collection="new_coll") self.mongodb.setCollection("unittest") def tearDown(self): self.mongodb.dropCollection(collection='unittest') self.mongodb.dropCollection(collection='new_coll') self.mongodb.close() @unittest.skipIf(not TEST_ALL, "Skipping") def test_update_and_find_one(self): self.mongodb.insert(self.objs[0]) # Get a generator for the results: resGen = self.mongodb.query({"fname": "Franco"}, limit=1, collection="unittest") res = resGen.next() self.assertEqual( 'Corelli', res['lname'], "Failed retrieval of single obj; expected '%s' but got '%s'" % ('Corelli', res['lname'])) @unittest.skipIf(not TEST_ALL, "Skipping") def test_set_coll_use_different_coll(self): # Insert into unittest: self.mongodb.insert(self.objs[0]) # Switch to new_coll: self.mongodb.setCollection('new_coll') self.mongodb.insert({"recommendation": "Hawaii"}) # We're in new_coll; the following should be empty result: self.mongodb.query({"fname": "Franco"}, limit=1) resCount = self.mongodb.resultCount({"fname": "Franco"}) self.assertIsNone( resCount, "Got non-null result that should be null: %s" % resCount) # But this search is within new_coll, and should succeed: resGen = self.mongodb.query({"recommendation": { '$regex': '.*' }}, limit=1) res = resGen.next() self.assertEqual( 'Hawaii', res['recommendation'], "Failed retrieval of single obj; expected '%s' but got '%s'" % ('Hawaii', res['recommendation'])) # Try inline collection switch: resGen = self.mongodb.query({"fname": "Franco"}, limit=1, collection="unittest") res = resGen.next() self.assertEqual( 'Corelli', res['lname'], "Failed retrieval of single obj; expected '%s' but got '%s'" % ('Corelli', res['lname'])) # But the default collection should still be new_coll, # so a search with unspecified coll should be in new_coll: resGen = self.mongodb.query({"recommendation": { '$regex': '.*' }}, limit=1) res = resGen.next() self.assertEqual( 'Hawaii', res['recommendation'], "Failed retrieval of single obj; expected '%s' but got '%s'" % ('Hawaii', res['recommendation'])) @unittest.skipIf(not TEST_ALL, "Skipping") def test_multi_result(self): # Insert two docs with fname == Franco: self.mongodb.insert(self.objs[0]) self.mongodb.insert(self.objs[2]) resGen = self.mongodb.query({"fname": "Franco"}) # To get result count, must retrieve at least one result first: resGen.next() resCount = self.mongodb.resultCount({"fname": "Franco"}) if resCount != 2: self.fail("Added two Franco objects, but only %s are found." % str(resCount)) @unittest.skipIf(not TEST_ALL, "Skipping") def test_clear_collection(self): self.mongodb.insert({"foo": 10}) resGen = self.mongodb.query({"foo": 10}, limit=1) res = resGen.next() self.assertIsNotNone(res, "Did not find document that was just inserted.") self.mongodb.clearCollection() resGen = self.mongodb.query({"foo": 10}, limit=1) self.assertRaises(StopIteration, resGen.next) @unittest.skipIf(not TEST_ALL, "Skipping") def test_only_some_return_columns(self): # Also tests the suppression of _id col when desired: self.mongodb.insert(self.objs[0]) self.mongodb.insert(self.objs[1]) resGen = self.mongodb.query({}, ("lname")) names = [] for lnameDict in resGen: resCount = self.mongodb.resultCount({}) self.assertEqual(2, resCount) names.append(lnameDict['lname']) self.assertItemsEqual(['Corelli', 'DaVinci'], names, "Did not receive expected lnames: %s" % str(names))
class EdxForumScrubber(object): ''' Given a .bson file of OpenEdX Forum posts, load the file into a MongoDB. Then pull a post at a time, anonymize, and insert a selection of fields into a MySQL db. The MongoDb entries look like this:: { "_id" : ObjectId("51b75a48f359c40a00000028"), "_type" : "Comment", "abuse_flaggers" : [ ], "anonymous" : false, "anonymous_to_peers" : false, "at_position_list" : [ ], "author_id" : "26344", "author_username" : "Minelly48", "body" : "I am Gwen.I am a nursing professor who took statistics many years ago and want to refresh my knowledge.", "comment_thread_id" : ObjectId("51b754e5f359c40a0000001d"), "course_id" : "Medicine/HRP258/Statistics_in_Medicine", "created_at" : ISODate("2013-06-11T17:11:36.831Z"), "endorsed" : false, "historical_abuse_flaggers" : [ ], "parent_ids" : [ ], "updated_at" : ISODate("2013-06-11T17:11:36.831Z"), "visible" : true, "votes" : { "count" : 2, "down" : [ ], "down_count" : 0, "point" : 2, "up" : [ "40325", "20323" ], "up_count" : 2 }, "sk" : "51b75a48f359c40a00000028" } Depending on parameter allowAnonScreenName in the __init__() method, forum entries in the relational database will be associated with the same hash that is used to anonymize other parts of the OpenEdX data. ''' LOG_DIR = '/home/dataman/Data/EdX/NonTransformLogs' # Pattern for email id - strings of alphabets/numbers/dots/hyphens followed # by an @ or at followed by combinations of dot/. followed by the edu/com # also, allow for spaces emailPattern='(.*)\s+([a-zA-Z0-9\(\.\-]+)[@]([a-zA-Z0-9\.]+)(.)(edu|com)\\s*(.*)' #emailPattern='(.*)\\s+([a-zA-Z0-9\\.]+)\\s*(\\(f.*b.*)?(@)\\s*([a-zA-Z0-9\\.\\s;]+)\\s*(\\.)\\s*(edu|com)\\s+(.*)' compiledEmailPattern = re.compile(emailPattern); # Pattern for replacing embedded double quotes in post bodies, # unless they are already escaped w/ a backslash. The # {0,1} means a match if zero or one repetition. It's # needed so that double quotes at the very start of a # string are matched: no preceding character at all: #doublQuoteReplPattern = re.compile(r'[^\\]{0,1}"') doublQuoteReplPattern = re.compile(r'[\\]{0,}"') def __init__(self, bsonFileName, mysqlDbObj=None, forumTableName='contents', allUsersTableName='EdxPrivate.UserGrade', anonymize=True, allowAnonScreenName=False): ''' Given a .bson file containing OpenEdX Forum entries, anonymize the entries (if desired), and place them into a MySQL table. :param bsonFileName: full path the .bson table. Set to None if instantiating for unit testing. :type bsonFileName: String :param mysqlDbObj: a pymysql_utils.MySQLDB object where anonymized entries are to be placed. If None, a new such object is created into MySQL db 'EdxForum' :type mysqlDbObj: MySQLDB :param forumTableName: name of table into which anonymized Forum entries are to be placed :type forumTableName: String :param allUsersTable: fully qualified name of table listing all in-the-clear mySQLUser names of users who post to the Forum. Used to redact their names from their own posts. :type allUsersTable: String :param anonymize: If true, Forum post entries in the MySQL table will be anonymized :type anonymize: bool :param allow_anon_screen_name: if True, then occurrences of poster's name in post bodies are replaced by <redacName_<anon_screen_name>>, where anon_screen_name is the hash used in other tables of the OpenEdX data. :type allow_anon_screen_name: Bool ''' self.bsonFileName = bsonFileName self.forumTableName = forumTableName self.forumDbName = 'EdxForum' self.allUsersTableName = allUsersTableName self.anonymize = anonymize self.allowAnonScreenName = allowAnonScreenName # If not unittest, but regular run, then mysqlDbObj is None if mysqlDbObj is None: self.mysql_passwd = self.getMySQLPasswd() self.mysql_dbhost ='localhost' self.mysql_user = getpass.getuser() # mySQLUser that started this process self.mydb = MySQLDB(mySQLUser=self.mysql_user, passwd=self.mysql_passwd, db=self.forumDbName) else: self.mydb = mysqlDbObj self.counter=0 self.userCache = {} self.userSet = set() warnings.filterwarnings('ignore', category=MySQLdb.Warning) self.setupLogging() self.prepDatabase() #******mysqldb.commit(); #******logging.info('commit completed!') def runConversion(self): ''' Do the actual work. We don't call this method from __init__() so that unittests can create an EdxForumScrubber instance without doing the actual work. Instead, unittests call individual methods. ''' self.populateUserCache(); self.mongo_database_name = 'TmpForum' self.collection_name = 'contents' # Load bson file into Mongodb: self.loadForumIntoMongoDb(self.bsonFileName) self.mongodb = MongoDB(dbName=self.mongo_database_name, collection=self.collection_name) # Anonymize each forum record, and transfer to MySQL db: self.forumMongoToRelational(self.mongodb, self.mydb,'contents' ) self.mydb.close() self.mongodb.close() self.logInfo('Entered %d records into %s' % (self.counter, self.forumDbName + self.forumTableName)) def loadForumIntoMongoDb(self, bsonFilename): mongoclient = MongoClient(); db = mongoclient[self.mongo_database_name]; # Get collection object: collection = db[self.collection_name]; # Clear out any old forum entries: self.logInfo('Preparing to delete the collection ') collection.remove() self.logInfo('Deleting mongo collection completed. Will now attempt a mongo restore') self.logInfo('Spawning subprocess to execute mongo restore') with open(self.logFilePath,'w') as outfile: ret = subprocess.call( ['mongorestore', '--drop', '--db', self.mongo_database_name, '--collection', self.collection_name, bsonFilename], stdout=outfile, stderr=outfile) self.logDebug('Return value from mongorestore is %s' % (ret)) objCount = subprocess.check_output( ['mongo', '--quiet', '--eval', 'printjson(db.contents.count())', self.mongo_database_name, ], stderr=outfile) self.numMongoItems = objCount self.logInfo('Available Forum posts %s' % objCount) def forumMongoToRelational(self, mongodb, mysqlDbObj, mysqlTable): ''' Given a pymongo collection object in which Forum posts are stored, and a MySQL db object and table name, anonymize each mongo record, and insert it into the MySQL table. :param collection: collection object obtained via a mangoclient object :type collection: Collection :param mysqlDbObj: wrapper to MySQL db. See pymysql_utils.py :type mysqlDbObj: MYSQLDB :param mysqlTable: name of table where posts are to be deposited. Example: 'contents'. :type mysqlTable: String ''' #command = 'mongorestore %s -db %s -mongoForumRec %s'%(self.bson_filename,self.mongo_database_name,self.collection_name) #print command self.logInfo('Will start inserting from mongo collection to MySQL') for mongoForumRec in mongodb.query({}): mongoRecordObj = MongoRecord(mongoForumRec) try: # Check whether 'up' can be converted to a list list(mongoRecordObj['up']) except Exception as e: self.logInfo('Error in conversion' + `e`) mongoRecordObj['up'] ='-1' self.insert_content_record(mysqlDbObj, mysqlTable, mongoRecordObj); def prepDatabase(self): ''' Declare variables and execute statements preparing the database to configure options - e.g.: setting char set to utf, connection type to utf truncating the already existing table. ''' try: self.logDebug("Setting and assigning char set for mysqld. will truncate old values") self.mydb.execute('SET NAMES utf8;'); self.mydb.execute('SET CHARACTER SET utf8;'); self.mydb.execute('SET character_set_connection=utf8;'); # Compose fully qualified table name from the db name to # which self.mydb is connected, and the forum table name # that was established in __init__(): fullTblName = self.mydb.dbName() + '.' + self.forumTableName # Clear old forum data out of the table: try: self.mydb.dropTable(fullTblName) # Create MySQL table for the posts. If we are to # anonymize, the poster name column will be 'screen_name', # else it will be 'anon_screen_name': self.createForumTable(self.anonymize) self.logDebug("setting and assigning char set complete. Truncation succeeded") except ValueError as e: self.logDebug("Failed either to set character codes, or to create forum table %s: %s" % (fullTblName, `e`)) except MySQLdb.Error,e: self.logInfo("MySql Error exiting %d: %s" % (e.args[0],e.args[1])) # print e sys.exit(1)
class TestForumEtl(unittest.TestCase): # Forum rows have the following columns: # type, anonymous, anonymous_to_peers, at_position_list, user_int_id, body, course_display_name, created_at, votes, count, down_count, up_count, up, down, comment_thread_id, parent_id, parent_ids, sk # Correct result for relationization of tinyForum.json # (in <projDir>/src/forum_etl/data). This result is anonymized and not relatable, # i.e. poster name UIDs use integers, while other tables use hashes: tinyForumGoldAnonymized = \ [ # poster Otto van Homberg: body is clean to start with: ('anon_screen_name_redacted','CommentThread', 'False', 'False', '[]', 5L, 'Harmless body', 'MITx/6.002x/2012_Fall', datetime.datetime(2013, 5, 16, 4, 32, 20), "{u'count': 10, u'point': -6, u'down_count': 8, u'up': [u'2', u'10'], u'down': [u'1', u'3', u'4', u'5', u'6', u'7', u'8', u'9'], u'up_count': 2}", 10L, 8L, 2L, "['2', '10']", "['1', '3', '4', '5', '6', '7', '8', '9']", None, None, None, None), # poster Andreas Fritz: body has someone's email: ('anon_screen_name_redacted','Comment', 'False', 'False', '[]', 7L, ' Body with <emailRedac> email.', 'MITx/6.002x/2012_Fall', datetime.datetime(2013, 5, 16, 4, 32, 21), "{u'count': 10, u'point': -4, u'down_count': 7, u'up': [u'6', u'8', u'10'], u'down': [u'1', u'2', u'3', u'4', u'5', u'7', u'9'], u'up_count': 3}", 10L, 7L, 3L, "['6', '8', '10']", "['1', '2', '3', '4', '5', '7', '9']", '519461545924670200000001', None, '[]', '519461555924670200000006'), # poster Otto van Homberg: body has 'Otto': ('anon_screen_name_redacted','Comment', 'False', 'False', '[]', 5L, 'Body with poster name <nameRedac_anon_screen_name_redacted> embedded.', 'MITx/6.002x/2012_Fall', datetime.datetime(2013, 5, 16, 4, 32, 21), "{u'count': 0, u'point': 0, u'down_count': 0, u'up': [], u'down': [], u'up_count': 0}", 0L, 0L, 0L, '[]', '[]', '519461545924670200000001', '519461555924670200000006', "[u'519461555924670200000006']", '519461555924670200000006-519461555924670200000007'), # poster Andreas Fritz: body has a phone number: ('anon_screen_name_redacted','Comment', 'False', 'False', '[]', 10L, 'Body with <phoneRedac> a phone number.', 'MITx/6.002x/2012_Fall', datetime.datetime(2013, 5, 16, 4, 32, 21), "{u'count': 0, u'point': 0, u'down_count': 0, u'up': [], u'down': [], u'up_count': 0}", 0L, 0L, 0L, '[]', '[]', '519461545924670200000001', '519461545924670200000005', "[u'519461545924670200000005']", '519461545924670200000005-519461555924670200000008'), # poster Otto van Homberg: body has his screen name (otto_king): ('anon_screen_name_redacted','Comment', 'False', 'False', '[]', 5L, 'Body with poster screen name <nameRedac_anon_screen_name_redacted> embedded.', 'MITx/6.002x/2012_Fall', datetime.datetime(2013, 5, 16, 4, 32, 21), "{u'count': 0, u'point': 0, u'down_count': 0, u'up': [], u'down': [], u'up_count': 0}", 0L, 0L, 0L, '[]', '[]', '519461545924670200000001', '519461555924670200000006', "[u'519461555924670200000006']", '519461555924670200000006-519461555924670200000007'), # poster Otto van Homberg: body has his full name (Otto van Homberg): ('anon_screen_name_redacted','Comment', 'False', 'False', '[]', 5L, 'Body with poster screen name <nameRedac_anon_screen_name_redacted> <nameRedac_anon_screen_name_redacted> <nameRedac_anon_screen_name_redacted> embedded.', 'MITx/6.002x/2012_Fall', datetime.datetime(2013, 5, 16, 4, 32, 21), "{u'count': 0, u'point': 0, u'down_count': 0, u'up': [], u'down': [], u'up_count': 0}", 0L, 0L, 0L, '[]', '[]', '519461545924670200000001', '519461555924670200000006', "[u'519461555924670200000006']", '519461555924670200000006-519461555924670200000007') ] # Gold result for anonymization that allows relating to other tables (i.e. hashes are constant) tinyForumGoldRelatable = \ [ # poster Otto van Homberg: body is clean to start with: ('abc','CommentThread', 'False', 'False', '[]', 5L, 'Harmless body', 'MITx/6.002x/2012_Fall', datetime.datetime(2013, 5, 16, 4, 32, 20), "{u'count': 10, u'point': -6, u'down_count': 8, u'up': [u'2', u'10'], u'down': [u'1', u'3', u'4', u'5', u'6', u'7', u'8', u'9'], u'up_count': 2}", 10L, 8L, 2L, "['2', '10']", "['1', '3', '4', '5', '6', '7', '8', '9']", None, None, None, None), # poster Andreas Fritz: body has someone's email: ('def','Comment', 'False', 'False', '[]', 7L, ' Body with <emailRedac> email.', 'MITx/6.002x/2012_Fall', datetime.datetime(2013, 5, 16, 4, 32, 21), "{u'count': 10, u'point': -4, u'down_count': 7, u'up': [u'6', u'8', u'10'], u'down': [u'1', u'2', u'3', u'4', u'5', u'7', u'9'], u'up_count': 3}", 10L, 7L, 3L, "['6', '8', '10']", "['1', '2', '3', '4', '5', '7', '9']", '519461545924670200000001', None, '[]', '519461555924670200000006'), # poster Otto van Homberg: body has 'Otto': ('abc','Comment', 'False', 'False', '[]', 5L, 'Body with poster name <nameRedac_abc> embedded.', 'MITx/6.002x/2012_Fall', datetime.datetime(2013, 5, 16, 4, 32, 21), "{u'count': 0, u'point': 0, u'down_count': 0, u'up': [], u'down': [], u'up_count': 0}", 0L, 0L, 0L, '[]', '[]', '519461545924670200000001', '519461555924670200000006', "[u'519461555924670200000006']", '519461555924670200000006-519461555924670200000007'), # poster Andreas Fritz: body has a phone number: ('ghi','Comment', 'False', 'False', '[]', 10L, 'Body with <phoneRedac> a phone number.', 'MITx/6.002x/2012_Fall', datetime.datetime(2013, 5, 16, 4, 32, 21), "{u'count': 0, u'point': 0, u'down_count': 0, u'up': [], u'down': [], u'up_count': 0}", 0L, 0L, 0L, '[]', '[]', '519461545924670200000001', '519461545924670200000005', "[u'519461545924670200000005']", '519461545924670200000005-519461555924670200000008'), # poster Otto van Homberg: body has his screen name (otto_king): ('abc','Comment', 'False', 'False', '[]', 5L, 'Body with poster screen name <nameRedac_abc> embedded.', 'MITx/6.002x/2012_Fall', datetime.datetime(2013, 5, 16, 4, 32, 21), "{u'count': 0, u'point': 0, u'down_count': 0, u'up': [], u'down': [], u'up_count': 0}", 0L, 0L, 0L, '[]', '[]', '519461545924670200000001', '519461555924670200000006', "[u'519461555924670200000006']", '519461555924670200000006-519461555924670200000007'), # poster Otto van Homberg: body has his full name (Otto van Homberg): ('abc','Comment', 'False', 'False', '[]', 5L, 'Body with poster screen name <nameRedac_abc> <nameRedac_abc> <nameRedac_abc> embedded.', 'MITx/6.002x/2012_Fall', datetime.datetime(2013, 5, 16, 4, 32, 21), "{u'count': 0, u'point': 0, u'down_count': 0, u'up': [], u'down': [], u'up_count': 0}", 0L, 0L, 0L, '[]', '[]', '519461545924670200000001', '519461555924670200000006', "[u'519461555924670200000006']", '519461555924670200000006-519461555924670200000007') ] # Gold result for non-anonymized forum: tinyForumGoldClear = \ [ # poster Otto van Homberg: body is clean to start with: ('otto_king','CommentThread', 'False', 'False', '[]', 5L, 'Harmless body', 'MITx/6.002x/2012_Fall', datetime.datetime(2013, 5, 16, 4, 32, 20), "{u'count': 10, u'point': -6, u'down_count': 8, u'up': [u'2', u'10'], u'down': [u'1', u'3', u'4', u'5', u'6', u'7', u'8', u'9'], u'up_count': 2}", 10L, 8L, 2L, "['2', '10']", "['1', '3', '4', '5', '6', '7', '8', '9']", None, None, None, None), # poster Andreas Fritz: body has someone's email: ('fritzL','Comment', 'False', 'False', '[]', 7L, ' Body with [email protected] email.', 'MITx/6.002x/2012_Fall', datetime.datetime(2013, 5, 16, 4, 32, 21), "{u'count': 10, u'point': -4, u'down_count': 7, u'up': [u'6', u'8', u'10'], u'down': [u'1', u'2', u'3', u'4', u'5', u'7', u'9'], u'up_count': 3}", 10L, 7L, 3L, "['6', '8', '10']", "['1', '2', '3', '4', '5', '7', '9']", '519461545924670200000001', None, '[]', '519461555924670200000006'), # poster Otto van Homberg: body has 'Otto': ('otto_king','Comment', 'False', 'False', '[]', 5L, 'Body with poster name Otto embedded.', 'MITx/6.002x/2012_Fall', datetime.datetime(2013, 5, 16, 4, 32, 21), "{u'count': 0, u'point': 0, u'down_count': 0, u'up': [], u'down': [], u'up_count': 0}", 0L, 0L, 0L, '[]', '[]', '519461545924670200000001', '519461555924670200000006', "[u'519461555924670200000006']", '519461555924670200000006-519461555924670200000007'), # poster Andreas Fritz: body has a phone number: ('bebeW','Comment', 'False', 'False', '[]', 10L, 'Body with 650-333-4567 a phone number.', 'MITx/6.002x/2012_Fall', datetime.datetime(2013, 5, 16, 4, 32, 21), "{u'count': 0, u'point': 0, u'down_count': 0, u'up': [], u'down': [], u'up_count': 0}", 0L, 0L, 0L, '[]', '[]', '519461545924670200000001', '519461545924670200000005', "[u'519461545924670200000005']", '519461545924670200000005-519461555924670200000008'), # poster Otto van Homberg: body has his screen name (otto_king): ('otto_king','Comment', 'False', 'False', '[]', 5L, 'Body with poster screen name otto_king embedded.', 'MITx/6.002x/2012_Fall', datetime.datetime(2013, 5, 16, 4, 32, 21), "{u'count': 0, u'point': 0, u'down_count': 0, u'up': [], u'down': [], u'up_count': 0}", 0L, 0L, 0L, '[]', '[]', '519461545924670200000001', '519461555924670200000006', "[u'519461555924670200000006']", '519461555924670200000006-519461555924670200000007'), # poster Otto van Homberg: body has his full name (Otto van Homberg): ('otto_king','Comment', 'False', 'False', '[]', 5L, 'Body with poster screen name Otto van Homberg embedded.', 'MITx/6.002x/2012_Fall', datetime.datetime(2013, 5, 16, 4, 32, 21), "{u'count': 0, u'point': 0, u'down_count': 0, u'up': [], u'down': [], u'up_count': 0}", 0L, 0L, 0L, '[]', '[]', '519461545924670200000001', '519461555924670200000006', "[u'519461555924670200000006']", '519461555924670200000006-519461555924670200000007') ] def setUp(self): self.mongoDb = MongoDB(dbName="unittest", collection="tinyForum") # Fill the little MongoDB with test JSON lines self.resetMongoTestDb() self.mysqldb = MySQLDB(user='******', db='unittest') # Start with an empty result MySQL table for each test: self.mysqldb.dropTable('contents') # Fill the fake UserGrade table with records of course participants: self.resetMySQLUserListDb() # Instantiate a Forum scrubber without the # name of a bson file that contains forum # records. That 'None' for the bson file will # make the class understand that it's being # instantiated for a unit test. self.forumScrubberAnonymized = EdxForumScrubber(None, mysqlDbObj=self.mysqldb, forumTableName='contents', allUsersTableName='unittest.UserGrade') self.forumScrubberRelatable = EdxForumScrubber(None, mysqlDbObj=self.mysqldb, forumTableName='contents', allUsersTableName='unittest.UserGrade', allowAnonScreenName=True) self.forumScrubberClear = EdxForumScrubber(None, mysqlDbObj=self.mysqldb, forumTableName='contents', allUsersTableName='unittest.UserGrade', anonymize=False) def tearDown(self): self.mysqldb.close() @unittest.skipIf(not RUN_ALL_TESTS, 'Uncomment this decoration if RUN_ALL_TESTS is False, and you want to run just this test.') def testAnonymized(self): self.forumScrubberAnonymized.populateUserCache() self.forumScrubberAnonymized.forumMongoToRelational(self.mongoDb, self.mysqldb, 'contents') for rowNum, forumPost in enumerate(self.mysqldb.query('SELECT * FROM unittest.contents')): # print(str(rowNum) + ':' + str(forumPost)) self.assertEqual(TestForumEtl.tinyForumGoldAnonymized[rowNum], forumPost) @unittest.skipIf(not RUN_ALL_TESTS, 'Uncomment this decoration if RUN_ALL_TESTS is False, and you want to run just this test.') def testNonAnonymizedRelatable(self): self.forumScrubberRelatable.populateUserCache() self.forumScrubberRelatable.forumMongoToRelational(self.mongoDb, self.mysqldb, 'contents') for rowNum, forumPost in enumerate(self.mysqldb.query('SELECT * FROM unittest.contents')): # print(str(rowNum) + ':' + str(forumPost)) self.assertEqual(TestForumEtl.tinyForumGoldRelatable[rowNum], forumPost) @unittest.skipIf(not RUN_ALL_TESTS, 'Uncomment this decoration if RUN_ALL_TESTS is False, and you want to run just this test.') def testNonAnonymized(self): self.forumScrubberClear.populateUserCache() self.forumScrubberClear.forumMongoToRelational(self.mongoDb, self.mysqldb, 'contents') for rowNum, forumPost in enumerate(self.mysqldb.query('SELECT * FROM unittest.contents')): # print(str(rowNum) + ':' + str(forumPost)) self.assertEqual(TestForumEtl.tinyForumGoldClear[rowNum], forumPost) def resetMongoTestDb(self): self.mongoDb.clearCollection() # Use small, known forum collection: currDir = os.path.dirname(__file__) with open(os.path.join(currDir, 'data/tinyForum.json'), 'r') as jsonFd: for line in jsonFd: forumPost = json.loads(line) self.mongoDb.insert(forumPost) def resetMySQLUserListDb(self): ''' Prepare a MySQL table that mimicks EdxPrivate.UserGrade. ''' userGradeColSpecs = OrderedDict( { 'name' : 'varchar(255)', 'screen_name' : 'varchar(255)', 'grade' : 'int', 'course_id' : 'varchar(255)', 'distinction' : 'tinyint', 'status' : 'varchar(50)', 'user_int_id' : 'int(11)', 'anon_screen_name' : 'varchar(40)' }) self.mysqldb.dropTable('UserGrade') self.mysqldb.createTable('UserGrade', userGradeColSpecs) self.mysqldb.bulkInsert('UserGrade', ('name','screen_name','grade','course_id','distinction','status','user_int_id','anon_screen_name'), [ ('Otto van Homberg','otto_king',5,'oldCourse',0,'notpassing',5,'abc'), ('Andreas Fritz','fritzL',2,'newCourse',0,'notpassing',7,'def'), ('Bebe Winter', 'bebeW',10,'History of Baking',1,'passing',10,'ghi') ])
class TestForumEtl(unittest.TestCase): # Forum rows have the following columns: # type, anonymous, anonymous_to_peers, at_position_list, user_int_id, body, course_display_name, created_at, votes, count, down_count, up_count, up, down, comment_thread_id, parent_id, parent_ids, sk # Correct result for relationization of tinyForum.json # (in <projDir>/src/forum_etl/data). This result is anonymized and not relatable, # i.e. poster name UIDs use integers, while other tables use hashes: tinyForumGoldAnonymized = \ [ # poster Otto van Homberg: body is clean to start with: ('anon_screen_name_redacted','CommentThread', 'False', 'False', '[]', 5L, 'Harmless body', 'MITx/6.002x/2012_Fall', datetime.datetime(2013, 5, 16, 4, 32, 20), "{u'count': 10, u'point': -6, u'down_count': 8, u'up': [u'2', u'10'], u'down': [u'1', u'3', u'4', u'5', u'6', u'7', u'8', u'9'], u'up_count': 2}", 10L, 8L, 2L, "['2', '10']", "['1', '3', '4', '5', '6', '7', '8', '9']", None, None, None, None), # poster Andreas Fritz: body has someone's email: ('anon_screen_name_redacted','Comment', 'False', 'False', '[]', 7L, ' Body with <emailRedac> email.', 'MITx/6.002x/2012_Fall', datetime.datetime(2013, 5, 16, 4, 32, 21), "{u'count': 10, u'point': -4, u'down_count': 7, u'up': [u'6', u'8', u'10'], u'down': [u'1', u'2', u'3', u'4', u'5', u'7', u'9'], u'up_count': 3}", 10L, 7L, 3L, "['6', '8', '10']", "['1', '2', '3', '4', '5', '7', '9']", '519461545924670200000001', None, '[]', '519461555924670200000006'), # poster Otto van Homberg: body has 'Otto': ('anon_screen_name_redacted','Comment', 'False', 'False', '[]', 5L, 'Body with poster name <nameRedac_anon_screen_name_redacted> embedded.', 'MITx/6.002x/2012_Fall', datetime.datetime(2013, 5, 16, 4, 32, 21), "{u'count': 0, u'point': 0, u'down_count': 0, u'up': [], u'down': [], u'up_count': 0}", 0L, 0L, 0L, '[]', '[]', '519461545924670200000001', '519461555924670200000006', "[u'519461555924670200000006']", '519461555924670200000006-519461555924670200000007'), # poster Andreas Fritz: body has a phone number: ('anon_screen_name_redacted','Comment', 'False', 'False', '[]', 10L, 'Body with <phoneRedac> a phone number.', 'MITx/6.002x/2012_Fall', datetime.datetime(2013, 5, 16, 4, 32, 21), "{u'count': 0, u'point': 0, u'down_count': 0, u'up': [], u'down': [], u'up_count': 0}", 0L, 0L, 0L, '[]', '[]', '519461545924670200000001', '519461545924670200000005', "[u'519461545924670200000005']", '519461545924670200000005-519461555924670200000008'), # poster Otto van Homberg: body has his screen name (otto_king): ('anon_screen_name_redacted','Comment', 'False', 'False', '[]', 5L, 'Body with poster screen name <nameRedac_anon_screen_name_redacted> embedded.', 'MITx/6.002x/2012_Fall', datetime.datetime(2013, 5, 16, 4, 32, 21), "{u'count': 0, u'point': 0, u'down_count': 0, u'up': [], u'down': [], u'up_count': 0}", 0L, 0L, 0L, '[]', '[]', '519461545924670200000001', '519461555924670200000006', "[u'519461555924670200000006']", '519461555924670200000006-519461555924670200000007'), # poster Otto van Homberg: body has his full name (Otto van Homberg): ('anon_screen_name_redacted','Comment', 'False', 'False', '[]', 5L, 'Body with poster screen name <nameRedac_anon_screen_name_redacted> <nameRedac_anon_screen_name_redacted> <nameRedac_anon_screen_name_redacted> embedded.', 'MITx/6.002x/2012_Fall', datetime.datetime(2013, 5, 16, 4, 32, 21), "{u'count': 0, u'point': 0, u'down_count': 0, u'up': [], u'down': [], u'up_count': 0}", 0L, 0L, 0L, '[]', '[]', '519461545924670200000001', '519461555924670200000006', "[u'519461555924670200000006']", '519461555924670200000006-519461555924670200000007') ] # Gold result for anonymization that allows relating to other tables (i.e. hashes are constant) tinyForumGoldRelatable = \ [ # poster Otto van Homberg: body is clean to start with: ('abc','CommentThread', 'False', 'False', '[]', 5L, 'Harmless body', 'MITx/6.002x/2012_Fall', datetime.datetime(2013, 5, 16, 4, 32, 20), "{u'count': 10, u'point': -6, u'down_count': 8, u'up': [u'2', u'10'], u'down': [u'1', u'3', u'4', u'5', u'6', u'7', u'8', u'9'], u'up_count': 2}", 10L, 8L, 2L, "['2', '10']", "['1', '3', '4', '5', '6', '7', '8', '9']", None, None, None, None), # poster Andreas Fritz: body has someone's email: ('def','Comment', 'False', 'False', '[]', 7L, ' Body with <emailRedac> email.', 'MITx/6.002x/2012_Fall', datetime.datetime(2013, 5, 16, 4, 32, 21), "{u'count': 10, u'point': -4, u'down_count': 7, u'up': [u'6', u'8', u'10'], u'down': [u'1', u'2', u'3', u'4', u'5', u'7', u'9'], u'up_count': 3}", 10L, 7L, 3L, "['6', '8', '10']", "['1', '2', '3', '4', '5', '7', '9']", '519461545924670200000001', None, '[]', '519461555924670200000006'), # poster Otto van Homberg: body has 'Otto': ('abc','Comment', 'False', 'False', '[]', 5L, 'Body with poster name <nameRedac_abc> embedded.', 'MITx/6.002x/2012_Fall', datetime.datetime(2013, 5, 16, 4, 32, 21), "{u'count': 0, u'point': 0, u'down_count': 0, u'up': [], u'down': [], u'up_count': 0}", 0L, 0L, 0L, '[]', '[]', '519461545924670200000001', '519461555924670200000006', "[u'519461555924670200000006']", '519461555924670200000006-519461555924670200000007'), # poster Andreas Fritz: body has a phone number: ('ghi','Comment', 'False', 'False', '[]', 10L, 'Body with <phoneRedac> a phone number.', 'MITx/6.002x/2012_Fall', datetime.datetime(2013, 5, 16, 4, 32, 21), "{u'count': 0, u'point': 0, u'down_count': 0, u'up': [], u'down': [], u'up_count': 0}", 0L, 0L, 0L, '[]', '[]', '519461545924670200000001', '519461545924670200000005', "[u'519461545924670200000005']", '519461545924670200000005-519461555924670200000008'), # poster Otto van Homberg: body has his screen name (otto_king): ('abc','Comment', 'False', 'False', '[]', 5L, 'Body with poster screen name <nameRedac_abc> embedded.', 'MITx/6.002x/2012_Fall', datetime.datetime(2013, 5, 16, 4, 32, 21), "{u'count': 0, u'point': 0, u'down_count': 0, u'up': [], u'down': [], u'up_count': 0}", 0L, 0L, 0L, '[]', '[]', '519461545924670200000001', '519461555924670200000006', "[u'519461555924670200000006']", '519461555924670200000006-519461555924670200000007'), # poster Otto van Homberg: body has his full name (Otto van Homberg): ('abc','Comment', 'False', 'False', '[]', 5L, 'Body with poster screen name <nameRedac_abc> <nameRedac_abc> <nameRedac_abc> embedded.', 'MITx/6.002x/2012_Fall', datetime.datetime(2013, 5, 16, 4, 32, 21), "{u'count': 0, u'point': 0, u'down_count': 0, u'up': [], u'down': [], u'up_count': 0}", 0L, 0L, 0L, '[]', '[]', '519461545924670200000001', '519461555924670200000006', "[u'519461555924670200000006']", '519461555924670200000006-519461555924670200000007') ] # Gold result for non-anonymized forum: tinyForumGoldClear = \ [ # poster Otto van Homberg: body is clean to start with: ('otto_king','CommentThread', 'False', 'False', '[]', 5L, 'Harmless body', 'MITx/6.002x/2012_Fall', datetime.datetime(2013, 5, 16, 4, 32, 20), "{u'count': 10, u'point': -6, u'down_count': 8, u'up': [u'2', u'10'], u'down': [u'1', u'3', u'4', u'5', u'6', u'7', u'8', u'9'], u'up_count': 2}", 10L, 8L, 2L, "['2', '10']", "['1', '3', '4', '5', '6', '7', '8', '9']", None, None, None, None), # poster Andreas Fritz: body has someone's email: ('fritzL','Comment', 'False', 'False', '[]', 7L, ' Body with [email protected] email.', 'MITx/6.002x/2012_Fall', datetime.datetime(2013, 5, 16, 4, 32, 21), "{u'count': 10, u'point': -4, u'down_count': 7, u'up': [u'6', u'8', u'10'], u'down': [u'1', u'2', u'3', u'4', u'5', u'7', u'9'], u'up_count': 3}", 10L, 7L, 3L, "['6', '8', '10']", "['1', '2', '3', '4', '5', '7', '9']", '519461545924670200000001', None, '[]', '519461555924670200000006'), # poster Otto van Homberg: body has 'Otto': ('otto_king','Comment', 'False', 'False', '[]', 5L, 'Body with poster name Otto embedded.', 'MITx/6.002x/2012_Fall', datetime.datetime(2013, 5, 16, 4, 32, 21), "{u'count': 0, u'point': 0, u'down_count': 0, u'up': [], u'down': [], u'up_count': 0}", 0L, 0L, 0L, '[]', '[]', '519461545924670200000001', '519461555924670200000006', "[u'519461555924670200000006']", '519461555924670200000006-519461555924670200000007'), # poster Andreas Fritz: body has a phone number: ('bebeW','Comment', 'False', 'False', '[]', 10L, 'Body with 650-333-4567 a phone number.', 'MITx/6.002x/2012_Fall', datetime.datetime(2013, 5, 16, 4, 32, 21), "{u'count': 0, u'point': 0, u'down_count': 0, u'up': [], u'down': [], u'up_count': 0}", 0L, 0L, 0L, '[]', '[]', '519461545924670200000001', '519461545924670200000005', "[u'519461545924670200000005']", '519461545924670200000005-519461555924670200000008'), # poster Otto van Homberg: body has his screen name (otto_king): ('otto_king','Comment', 'False', 'False', '[]', 5L, 'Body with poster screen name otto_king embedded.', 'MITx/6.002x/2012_Fall', datetime.datetime(2013, 5, 16, 4, 32, 21), "{u'count': 0, u'point': 0, u'down_count': 0, u'up': [], u'down': [], u'up_count': 0}", 0L, 0L, 0L, '[]', '[]', '519461545924670200000001', '519461555924670200000006', "[u'519461555924670200000006']", '519461555924670200000006-519461555924670200000007'), # poster Otto van Homberg: body has his full name (Otto van Homberg): ('otto_king','Comment', 'False', 'False', '[]', 5L, 'Body with poster screen name Otto van Homberg embedded.', 'MITx/6.002x/2012_Fall', datetime.datetime(2013, 5, 16, 4, 32, 21), "{u'count': 0, u'point': 0, u'down_count': 0, u'up': [], u'down': [], u'up_count': 0}", 0L, 0L, 0L, '[]', '[]', '519461545924670200000001', '519461555924670200000006', "[u'519461555924670200000006']", '519461555924670200000006-519461555924670200000007') ] def setUp(self): self.mongoDb = MongoDB(dbName="unittest", collection="tinyForum") # Fill the little MongoDB with test JSON lines self.resetMongoTestDb() self.mysqldb = MySQLDB(mySQLUser='******', db='unittest') # Start with an empty result MySQL table for each test: self.mysqldb.dropTable('contents') # Fill the fake UserGrade table with records of course participants: self.resetMySQLUserListDb() # Instantiate a Forum scrubber without the # name of a bson file that contains forum # records. That 'None' for the bson file will # make the class understand that it's being # instantiated for a unit test. self.forumScrubberAnonymized = EdxForumScrubber(None, mysqlDbObj=self.mysqldb, forumTableName='contents', allUsersTableName='unittest.UserGrade') self.forumScrubberRelatable = EdxForumScrubber(None, mysqlDbObj=self.mysqldb, forumTableName='contents', allUsersTableName='unittest.UserGrade', allowAnonScreenName=True) self.forumScrubberClear = EdxForumScrubber(None, mysqlDbObj=self.mysqldb, forumTableName='contents', allUsersTableName='unittest.UserGrade', anonymize=False) def tearDown(self): self.mysqldb.close() @unittest.skipIf(not RUN_ALL_TESTS, 'Uncomment this decoration if RUN_ALL_TESTS is False, and you want to run just this test.') def testAnonymized(self): self.forumScrubberAnonymized.populateUserCache() self.forumScrubberAnonymized.forumMongoToRelational(self.mongoDb, self.mysqldb, 'contents') for rowNum, forumPost in enumerate(self.mysqldb.query('SELECT * FROM unittest.contents')): # print(str(rowNum) + ':' + str(forumPost)) self.assertEqual(TestForumEtl.tinyForumGoldAnonymized[rowNum], forumPost) @unittest.skipIf(not RUN_ALL_TESTS, 'Uncomment this decoration if RUN_ALL_TESTS is False, and you want to run just this test.') def testNonAnonymizedRelatable(self): self.forumScrubberRelatable.populateUserCache() self.forumScrubberRelatable.forumMongoToRelational(self.mongoDb, self.mysqldb, 'contents') for rowNum, forumPost in enumerate(self.mysqldb.query('SELECT * FROM unittest.contents')): # print(str(rowNum) + ':' + str(forumPost)) self.assertEqual(TestForumEtl.tinyForumGoldRelatable[rowNum], forumPost) @unittest.skipIf(not RUN_ALL_TESTS, 'Uncomment this decoration if RUN_ALL_TESTS is False, and you want to run just this test.') def testNonAnonymized(self): self.forumScrubberClear.populateUserCache() self.forumScrubberClear.forumMongoToRelational(self.mongoDb, self.mysqldb, 'contents') for rowNum, forumPost in enumerate(self.mysqldb.query('SELECT * FROM unittest.contents')): # print(str(rowNum) + ':' + str(forumPost)) self.assertEqual(TestForumEtl.tinyForumGoldClear[rowNum], forumPost) def resetMongoTestDb(self): self.mongoDb.clearCollection() # Use small, known forum collection: currDir = os.path.dirname(__file__) with open(os.path.join(currDir, 'data/tinyForum.json'), 'r') as jsonFd: for line in jsonFd: forumPost = json.loads(line) self.mongoDb.insert(forumPost) def resetMySQLUserListDb(self): ''' Prepare a MySQL table that mimicks EdxPrivate.UserGrade. ''' userGradeColSpecs = OrderedDict( { 'name' : 'varchar(255)', 'screen_name' : 'varchar(255)', 'grade' : 'int', 'course_id' : 'varchar(255)', 'distinction' : 'tinyint', 'status' : 'varchar(50)', 'user_int_id' : 'int(11)', 'anon_screen_name' : 'varchar(40)' }) self.mysqldb.dropTable('UserGrade') self.mysqldb.createTable('UserGrade', userGradeColSpecs) self.mysqldb.bulkInsert('UserGrade', ('name','screen_name','grade','course_id','distinction','status','user_int_id','anon_screen_name'), [ ('Otto van Homberg','otto_king',5,'oldCourse',0,'notpassing',5,'abc'), ('Andreas Fritz','fritzL',2,'newCourse',0,'notpassing',7,'def'), ('Bebe Winter', 'bebeW',10,'History of Baking',1,'passing',10,'ghi') ])
class MongoTest(unittest.TestCase): ''' Test the mongodb.py module. Uses a library that fakes a MongoDB server. See https://pypi.python.org/pypi/mongomock/1.0.1 ''' def setUp(self): self.objs = [{"fname" : "Franco", "lname" : "Corelli"}, {"fname" : "Leonardo", "lname" : "DaVinci", "age" : 300}, {"fname" : "Franco", "lname" : "Gandolpho"}] self.mongodb = MongoDB(dbName='unittest', collection='unittest') self.mongodb.clearCollection(collection="unittest") self.mongodb.clearCollection(collection="new_coll") self.mongodb.setCollection("unittest") def tearDown(self): self.mongodb.dropCollection(collection='unittest') self.mongodb.dropCollection(collection='new_coll') self.mongodb.close() @unittest.skipIf(not TEST_ALL, "Skipping") def test_update_and_find_one(self): self.mongodb.insert(self.objs[0]) # Get a generator for the results: resGen = self.mongodb.query({"fname" : "Franco"}, limit=1, collection="unittest") res = resGen.next() self.assertEqual('Corelli', res['lname'], "Failed retrieval of single obj; expected '%s' but got '%s'" % ('Corelli', res['lname'])) @unittest.skipIf(not TEST_ALL, "Skipping") def test_set_coll_use_different_coll(self): # Insert into unittest: self.mongodb.insert(self.objs[0]) # Switch to new_coll: self.mongodb.setCollection('new_coll') self.mongodb.insert({"recommendation" : "Hawaii"}) # We're in new_coll; the following should be empty result: self.mongodb.query({"fname" : "Franco"}, limit=1) resCount = self.mongodb.resultCount({"fname" : "Franco"}) self.assertIsNone(resCount, "Got non-null result that should be null: %s" % resCount) # But this search is within new_coll, and should succeed: resGen = self.mongodb.query({"recommendation" : {'$regex' : '.*'}}, limit=1) res = resGen.next() self.assertEqual('Hawaii', res['recommendation'], "Failed retrieval of single obj; expected '%s' but got '%s'" % ('Hawaii', res['recommendation'])) # Try inline collection switch: resGen = self.mongodb.query({"fname" : "Franco"}, limit=1, collection="unittest") res = resGen.next() self.assertEqual('Corelli', res['lname'], "Failed retrieval of single obj; expected '%s' but got '%s'" % ('Corelli', res['lname'])) # But the default collection should still be new_coll, # so a search with unspecified coll should be in new_coll: resGen = self.mongodb.query({"recommendation" : {'$regex' : '.*'}}, limit=1) res = resGen.next() self.assertEqual('Hawaii', res['recommendation'], "Failed retrieval of single obj; expected '%s' but got '%s'" % ('Hawaii', res['recommendation'])) @unittest.skipIf(not TEST_ALL, "Skipping") def test_multi_result(self): # Insert two docs with fname == Franco: self.mongodb.insert(self.objs[0]) self.mongodb.insert(self.objs[2]) resGen = self.mongodb.query({"fname" : "Franco"}) # To get result count, must retrieve at least one result first: resGen.next() resCount = self.mongodb.resultCount({"fname" : "Franco"}) if resCount != 2: self.fail("Added two Franco objects, but only %s are found." % str(resCount)) @unittest.skipIf(not TEST_ALL, "Skipping") def test_clear_collection(self): self.mongodb.insert({"foo" : 10}) resGen = self.mongodb.query({"foo" : 10}, limit=1) res = resGen.next() self.assertIsNotNone(res, "Did not find document that was just inserted.") self.mongodb.clearCollection() resGen = self.mongodb.query({"foo" : 10}, limit=1) self.assertRaises(StopIteration, resGen.next) @unittest.skipIf(not TEST_ALL, "Skipping") def test_only_some_return_columns(self): # Also tests the suppression of _id col when desired: self.mongodb.insert(self.objs[0]) self.mongodb.insert(self.objs[1]) resGen = self.mongodb.query({}, ("lname")) names = [] for lnameDict in resGen: resCount = self.mongodb.resultCount({}) self.assertEqual(2, resCount) names.append(lnameDict['lname']) self.assertItemsEqual(['Corelli','DaVinci'], names, "Did not receive expected lnames: %s" % str(names))