Example #1
0
 def setUp(self):
     self.objs = [{"fname" : "Franco", "lname" : "Corelli"}, 
                  {"fname" : "Leonardo", "lname" : "DaVinci", "age" : 300},
                  {"fname" : "Franco", "lname" : "Gandolpho"}]
     
     self.mongodb = MongoDB(dbName='unittest', collection='unittest')
     self.mongodb.clearCollection(collection="unittest")
     self.mongodb.clearCollection(collection="new_coll")
     self.mongodb.setCollection("unittest")
Example #2
0
 def setUp(self):
     
     self.mongoDb = MongoDB(dbName="unittest", collection="tinyForum")
     # Fill the little MongoDB with test JSON lines
     self.resetMongoTestDb()
     
     self.mysqldb = MySQLDB(mySQLUser='******', db='unittest')
     # Start with an empty result MySQL table for each test:
     self.mysqldb.dropTable('contents')
     # Fill the fake UserGrade table with records of course participants:
     self.resetMySQLUserListDb()
     
     # Instantiate a Forum scrubber without the 
     # name of a bson file that contains forum
     # records. That 'None' for the bson file will
     # make the class understand that it's being
     # instantiated for a unit test. 
     self.forumScrubberAnonymized = EdxForumScrubber(None, mysqlDbObj=self.mysqldb, forumTableName='contents', allUsersTableName='unittest.UserGrade')
     self.forumScrubberRelatable  = EdxForumScrubber(None, mysqlDbObj=self.mysqldb, forumTableName='contents', allUsersTableName='unittest.UserGrade', allowAnonScreenName=True)
     self.forumScrubberClear      = EdxForumScrubber(None, mysqlDbObj=self.mysqldb, forumTableName='contents', allUsersTableName='unittest.UserGrade', anonymize=False)
Example #3
0
    def runConversion(self):
        '''
        Do the actual work. We don't call this method from __init__()
        so that unittests can create an EdxForumScrubber instance without
        doing the actual work. Instead, unittests call individual methods. 
        '''
        self.populateUserCache();

        self.mongo_database_name = 'TmpForum'
        self.collection_name = 'contents'

        # Load bson file into Mongodb:
        self.loadForumIntoMongoDb(self.bsonFileName)
        self.mongodb = MongoDB(dbName=self.mongo_database_name, collection=self.collection_name)
        
        # Anonymize each forum record, and transfer to MySQL db:
        self.forumMongoToRelational(self.mongodb, self.mydb,'contents' )
        
        self.mydb.close()
        self.mongodb.close()
        self.logInfo('Entered %d records into %s' % (self.counter, self.forumDbName + self.forumTableName))
Example #4
0
 def setUp(self):
     
     self.mongoDb = MongoDB(dbName="unittest", collection="tinyForum")
     # Fill the little MongoDB with test JSON lines
     self.resetMongoTestDb()
     
     self.mysqldb = MySQLDB(user='******', db='unittest')
     # Start with an empty result MySQL table for each test:
     self.mysqldb.dropTable('contents')
     # Fill the fake UserGrade table with records of course participants:
     self.resetMySQLUserListDb()
     
     # Instantiate a Forum scrubber without the 
     # name of a bson file that contains forum
     # records. That 'None' for the bson file will
     # make the class understand that it's being
     # instantiated for a unit test. 
     self.forumScrubberAnonymized = EdxForumScrubber(None, mysqlDbObj=self.mysqldb, forumTableName='contents', allUsersTableName='unittest.UserGrade')
     self.forumScrubberRelatable  = EdxForumScrubber(None, mysqlDbObj=self.mysqldb, forumTableName='contents', allUsersTableName='unittest.UserGrade', allowAnonScreenName=True)
     self.forumScrubberClear      = EdxForumScrubber(None, mysqlDbObj=self.mysqldb, forumTableName='contents', allUsersTableName='unittest.UserGrade', anonymize=False)
Example #5
0
    def runConversion(self):
        '''
        Do the actual work. We don't call this method from __init__()
        so that unittests can create an EdxForumScrubber instance without
        doing the actual work. Instead, unittests call individual methods.
        '''
        self.populateUserCache();

        self.mongo_database_name = 'TmpForum'
        self.collection_name = 'contents'

        # Load bson file into Mongodb:
        self.loadForumIntoMongoDb(self.bsonFileName)
        self.mongodb = MongoDB(dbName=self.mongo_database_name, collection=self.collection_name)

        # Anonymize each forum record, and transfer to MySQL db:
        self.forumMongoToRelational(self.mongodb, self.mydb,'contents' )

        self.mydb.close()
        self.mongodb.close()
        self.logInfo('Entered %d records into %s' % (self.counter, self.forumDbName + '.' + self.forumTableName))
Example #6
0
class EdxForumScrubber(object):
    '''

    Given a .bson file of OpenEdX Forum posts, load the file
    into a MongoDB. Then pull a post at a time, anonymize, and
    insert a selection of fields into a MySQL db. The MongoDb
    entries look like this::

    {
    	"_id" : ObjectId("51b75a48f359c40a00000028"),
    	"_type" : "Comment",
    	"abuse_flaggers" : [ ],
    	"anonymous" : false,
    	"anonymous_to_peers" : false,
    	"at_position_list" : [ ],
    	"author_id" : "26344",
    	"author_username" : "Minelly48",
    	"body" : "I am Gwen.I am a nursing professor who took statistics many years ago and want to refresh my knowledge.",
    	"comment_thread_id" : ObjectId("51b754e5f359c40a0000001d"),
    	"course_id" : "Medicine/HRP258/Statistics_in_Medicine",
    	"created_at" : ISODate("2013-06-11T17:11:36.831Z"),
    	"endorsed" : false,
    	"historical_abuse_flaggers" : [ ],
    	"parent_ids" : [ ],
    	"updated_at" : ISODate("2013-06-11T17:11:36.831Z"),
    	"visible" : true,
    	"votes" : {
    		"count" : 2,
    		"down" : [ ],
    		"down_count" : 0,
    		"point" : 2,
    		"up" : [
    			"40325",
    			"20323"
    		],
    		"up_count" : 2
    	},
    	"sk" : "51b75a48f359c40a00000028"
    }

    Depending on parameter allowAnonScreenName in the __init__() method,
    forum entries in the relational database will be associated with the
    same hash that is used to anonymize other parts of the OpenEdX data.

    '''

    LOG_DIR = '/home/dataman/Data/EdX/NonTransformLogs'

    # Pattern for email id - strings of alphabets/numbers/dots/hyphens followed
    # by an @ or at followed by combinations of dot/. followed by the edu/com
    # also, allow for spaces

    emailPattern='(.*)\s+([a-zA-Z0-9\(\.\-]+)[@]([a-zA-Z0-9\.]+)(.)(edu|com)\\s*(.*)'
    #emailPattern='(.*)\\s+([a-zA-Z0-9\\.]+)\\s*(\\(f.*b.*)?(@)\\s*([a-zA-Z0-9\\.\\s;]+)\\s*(\\.)\\s*(edu|com)\\s+(.*)'
    compiledEmailPattern = re.compile(emailPattern);

    # Pattern for replacing embedded double quotes in post bodies,
    # unless they are already escaped w/ a backslash. The
    # {0,1} means a match if zero or one repetition. It's
    # needed so that double quotes at the very start of a
    # string are matched: no preceding character at all:
    #doublQuoteReplPattern = re.compile(r'[^\\]{0,1}"')
    doublQuoteReplPattern = re.compile(r'[\\]{0,}"')

    # Schema of EdxForum.contents: an ordered dict that is
    # used twice: the table creation MySQL command is constructed
    # from this dict, and the dict is used to ensure that
    # all its keys (i.e. future column names) are present
    # in each MongoDB object. See also createForumTable().
    # In createForumTable() either entry anon_screen_name,
    # or screen_name in the dict below will be deleted, based
    # on whether we are asked to anonymize or not:

    forumSchema = OrderedDict({})

    forumSchema['forum_post_id'] =  "varchar(40) NOT NULL DEFAULT 'unavailable'"
    forumSchema['anon_screen_name'] =  "varchar(40) NOT NULL DEFAULT 'anon_screen_name_redacted'"  # This or next deleted based on anonymize yes/no
    forumSchema['screen_name'] =  "varchar(40) NOT NULL DEFAULT 'anon_screen_name_redacted'"       # This or prev deleted based on anonymize yes/no
    forumSchema['type'] =  "varchar(20) NOT NULL"
    forumSchema['anonymous'] =  "varchar(10) NOT NULL"
    forumSchema['anonymous_to_peers'] =  "varchar(10) NOT NULL"
    forumSchema['at_position_list'] =  "varchar(200) NOT NULL"
    forumSchema['forum_uid'] =  "varchar(40)  NOT NULL"
    forumSchema['body'] = "TEXT NOT NULL" #"varchar(2500) NOT NULL"
    forumSchema['course_display_name'] =  "varchar(100) NOT NULL"
    forumSchema['created_at'] =  "datetime NOT NULL"
    forumSchema['votes'] = "TEXT NOT NULL" # "varchar(200) NOT NULL"
    forumSchema['count'] =  "int(11) NOT NULL"
    forumSchema['down_count'] =  "int(11) NOT NULL"
    forumSchema['up_count'] =  "int(11) NOT NULL"
    forumSchema['up'] =  "varchar(200) DEFAULT NULL"
    forumSchema['down'] =  "varchar(200) DEFAULT NULL"
    forumSchema['comment_thread_id'] =  "varchar(255) DEFAULT NULL"
    forumSchema['parent_id'] =  "varchar(255) DEFAULT NULL"
    forumSchema['parent_ids'] =  "varchar(255) DEFAULT NULL"
    forumSchema['sk'] =  "varchar(255) DEFAULT NULL"
    forumSchema['confusion'] =  "varchar(20) NOT NULL DEFAULT ''"
    forumSchema['happiness'] =  "varchar(20) NOT NULL DEFAULT ''"


    def __init__(self,
                 bsonFileName,
                 mysqlDbObj=None,
                 forumTableName='contents',
                 allUsersTableName='EdxPrivate.UserGrade',
                 anonymize=True,
                 allowAnonScreenName=False):
        '''
        Given a .bson file containing OpenEdX Forum entries, anonymize the entries (if desired),
        and place them into a MySQL table.

        :param bsonFileName: full path the .bson table. Set to None if instantiating
            for unit testing.
        :type bsonFileName: String
        :param mysqlDbObj: a pymysql_utils.MySQLDB object where anonymized entries are
            to be placed. If None, a new such object is created into MySQL db 'EdxForum'
        :type mysqlDbObj: MySQLDB
        :param forumTableName: name of table into which anonymized Forum entries are to be placed
        :type forumTableName: String
        :param allUsersTable: fully qualified name of table listing all in-the-clear mySQLUser names
            of users who post to the Forum. Used to redact their names from their own posts.
        :type allUsersTable: String
        :param anonymize: If true, Forum post entries in the MySQL table will be anonymized
        :type anonymize: bool
        :param allow_anon_screen_name: if True, then occurrences of poster's name in
            post bodies are replaced by <redacName_<anon_screen_name>>, where anon_screen_name
            is the hash used in other tables of the OpenEdX data.
        :type allow_anon_screen_name: Bool
        '''

        self.bsonFileName = bsonFileName
        self.forumTableName = forumTableName
        self.forumDbName = 'EdxForum'
        self.allUsersTableName = allUsersTableName
        self.anonymize = anonymize
        self.allowAnonScreenName = allowAnonScreenName

        # If not unittest, but regular run, then mysqlDbObj is None
        if mysqlDbObj is None:
            self.mysql_passwd = self.getMySQLPasswd()
            self.mysql_dbhost ='localhost'
            self.mysql_user = getpass.getuser() # mySQLUser that started this process
            self.mydb = MySQLDB(user=self.mysql_user, passwd=self.mysql_passwd, db=self.forumDbName)
        else:
            self.mydb = mysqlDbObj

        self.counter=0

        self.userCache = {}
        self.userSet   = set()

        warnings.filterwarnings('ignore', category=MySQLdb.Warning)
        self.setupLogging()
        self.prepDatabase()

        #******mysqldb.commit();
        #******logging.info('commit completed!')

    def runConversion(self):
        '''
        Do the actual work. We don't call this method from __init__()
        so that unittests can create an EdxForumScrubber instance without
        doing the actual work. Instead, unittests call individual methods.
        '''
        self.populateUserCache();

        self.mongo_database_name = 'TmpForum'
        self.collection_name = 'contents'

        # Load bson file into Mongodb:
        self.loadForumIntoMongoDb(self.bsonFileName)
        self.mongodb = MongoDB(dbName=self.mongo_database_name, collection=self.collection_name)

        # Anonymize each forum record, and transfer to MySQL db:
        self.forumMongoToRelational(self.mongodb, self.mydb,'contents' )

        self.mydb.close()
        self.mongodb.close()
        self.logInfo('Entered %d records into %s' % (self.counter, self.forumDbName + '.' + self.forumTableName))

    def loadForumIntoMongoDb(self, bsonFilename):

        mongoclient = MongoClient();
        db = mongoclient[self.mongo_database_name];

        # Get collection object:
        collection = db[self.collection_name];

        # Clear out any old forum entries:
        self.logInfo('Preparing to delete the collection ')
        collection.remove()
        self.logInfo('Deleting mongo collection completed. Will now attempt a mongo restore')

        self.logInfo('Spawning subprocess to execute mongo restore')
        with open(self.logFilePath,'w') as outfile:
            ret = subprocess.call(
                   ['mongorestore',
                    '--drop',
                    '--db', self.mongo_database_name,
                    '--collection', self.collection_name,
                    bsonFilename],
                stdout=outfile, stderr=outfile)

            self.logDebug('Return value from mongorestore is %s' % (ret))

            objCount = subprocess.check_output(
                       ['mongo',
                        '--quiet',
                        '--eval',
                        'printjson(db.contents.count())',
                        self.mongo_database_name,
                        ],
                        stderr=outfile)
            self.numMongoItems = objCount

            self.logInfo('Available Forum posts %s' % objCount)

    def forumMongoToRelational(self, mongodb, mysqlDbObj, mysqlTable):
        '''
        Given a pymongo collection object in which Forum posts are stored,
        and a MySQL db object and table name, anonymize each mongo record,
        and insert it into the MySQL table.

        :param collection: collection object obtained via a mangoclient object
        :type collection: Collection
        :param mysqlDbObj: wrapper to MySQL db. See pymysql_utils.py
        :type mysqlDbObj: MYSQLDB
        :param mysqlTable: name of table where posts are to be deposited.
            Example: 'contents'.
        :type mysqlTable: String
        '''

        #command = 'mongorestore %s -db %s -mongoForumRec %s'%(self.bson_filename,self.mongo_database_name,self.collection_name)
        #print command

        self.logInfo('Will start inserting from mongo collection to MySQL')

        for mongoForumRec in mongodb.query({}):
            mongoRecordObj = MongoRecord(mongoForumRec)

            try:
                # Check whether 'up' can be converted to a list
                list(mongoRecordObj['up'])
            except Exception as e:
                self.logInfo("Error in conversion of 'up' field to a list (setting cell to -1):" + `e`)
                mongoRecordObj['up'] ='-1'

            # Make sure the MongoDB object has all fields that will
            # be needed for the forum schema:
            self.ensureSchemaAdherence(mongoRecordObj)

            self.insert_content_record(mysqlDbObj, mysqlTable, mongoRecordObj);

    def prepDatabase(self):
        '''
        Declare variables and execute statements preparing the database to
        configure options - e.g.: setting char set to utf, connection type to utf
        truncating the already existing table.
        '''
        try:
            self.logDebug("Setting and assigning char set for mysqld. will truncate old values")
            self.mydb.execute('SET NAMES utf8;');
            self.mydb.execute('SET CHARACTER SET utf8;');
            self.mydb.execute('SET character_set_connection=utf8;');

            # Compose fully qualified table name from the db name to
            # which self.mydb is connected, and the forum table name
            # that was established in __init__():
            fullTblName = self.mydb.dbName() + '.' + self.forumTableName
            # Clear old forum data out of the table:
            try:
                self.mydb.dropTable(fullTblName)
                # Create MySQL table for the posts. If we are to
                # anonymize, the poster name column will be 'screen_name',
                # else it will be 'anon_screen_name':
                self.createForumTable(self.anonymize)
                self.logDebug("setting and assigning char set complete. Truncation succeeded")
            except ValueError as e:
                self.logDebug("Failed either to set character codes, or to create forum table %s: %s" % (fullTblName, `e`))

        except MySQLdb.Error,e:
            self.logInfo("MySql Error exiting %d: %s" % (e.args[0],e.args[1]))
            # print e
            sys.exit(1)
Example #7
0
class MongoTest(unittest.TestCase):
    '''
    Test the mongodb.py module. Uses a library that fakes
    a MongoDB server. See https://pypi.python.org/pypi/mongomock/1.0.1
    '''
    def setUp(self):
        self.objs = [{
            "fname": "Franco",
            "lname": "Corelli"
        }, {
            "fname": "Leonardo",
            "lname": "DaVinci",
            "age": 300
        }, {
            "fname": "Franco",
            "lname": "Gandolpho"
        }]

        self.mongodb = MongoDB(dbName='unittest', collection='unittest')
        self.mongodb.clearCollection(collection="unittest")
        self.mongodb.clearCollection(collection="new_coll")
        self.mongodb.setCollection("unittest")

    def tearDown(self):
        self.mongodb.dropCollection(collection='unittest')
        self.mongodb.dropCollection(collection='new_coll')
        self.mongodb.close()

    @unittest.skipIf(not TEST_ALL, "Skipping")
    def test_update_and_find_one(self):
        self.mongodb.insert(self.objs[0])
        # Get a generator for the results:
        resGen = self.mongodb.query({"fname": "Franco"},
                                    limit=1,
                                    collection="unittest")
        res = resGen.next()
        self.assertEqual(
            'Corelli', res['lname'],
            "Failed retrieval of single obj; expected '%s' but got '%s'" %
            ('Corelli', res['lname']))

    @unittest.skipIf(not TEST_ALL, "Skipping")
    def test_set_coll_use_different_coll(self):
        # Insert into unittest:
        self.mongodb.insert(self.objs[0])
        # Switch to new_coll:
        self.mongodb.setCollection('new_coll')
        self.mongodb.insert({"recommendation": "Hawaii"})

        # We're in new_coll; the following should be empty result:
        self.mongodb.query({"fname": "Franco"}, limit=1)
        resCount = self.mongodb.resultCount({"fname": "Franco"})
        self.assertIsNone(
            resCount, "Got non-null result that should be null: %s" % resCount)

        # But this search is within new_coll, and should succeed:
        resGen = self.mongodb.query({"recommendation": {
            '$regex': '.*'
        }},
                                    limit=1)
        res = resGen.next()
        self.assertEqual(
            'Hawaii', res['recommendation'],
            "Failed retrieval of single obj; expected '%s' but got '%s'" %
            ('Hawaii', res['recommendation']))

        # Try inline collection switch:
        resGen = self.mongodb.query({"fname": "Franco"},
                                    limit=1,
                                    collection="unittest")
        res = resGen.next()
        self.assertEqual(
            'Corelli', res['lname'],
            "Failed retrieval of single obj; expected '%s' but got '%s'" %
            ('Corelli', res['lname']))

        # But the default collection should still be new_coll,
        # so a search with unspecified coll should be in new_coll:
        resGen = self.mongodb.query({"recommendation": {
            '$regex': '.*'
        }},
                                    limit=1)
        res = resGen.next()
        self.assertEqual(
            'Hawaii', res['recommendation'],
            "Failed retrieval of single obj; expected '%s' but got '%s'" %
            ('Hawaii', res['recommendation']))

    @unittest.skipIf(not TEST_ALL, "Skipping")
    def test_multi_result(self):
        # Insert two docs with fname == Franco:
        self.mongodb.insert(self.objs[0])
        self.mongodb.insert(self.objs[2])
        resGen = self.mongodb.query({"fname": "Franco"})
        # To get result count, must retrieve at least one result first:
        resGen.next()
        resCount = self.mongodb.resultCount({"fname": "Franco"})
        if resCount != 2:
            self.fail("Added two Franco objects, but only %s are found." %
                      str(resCount))

    @unittest.skipIf(not TEST_ALL, "Skipping")
    def test_clear_collection(self):
        self.mongodb.insert({"foo": 10})
        resGen = self.mongodb.query({"foo": 10}, limit=1)
        res = resGen.next()
        self.assertIsNotNone(res,
                             "Did not find document that was just inserted.")
        self.mongodb.clearCollection()
        resGen = self.mongodb.query({"foo": 10}, limit=1)
        self.assertRaises(StopIteration, resGen.next)

    @unittest.skipIf(not TEST_ALL, "Skipping")
    def test_only_some_return_columns(self):
        # Also tests the suppression of _id col when desired:
        self.mongodb.insert(self.objs[0])
        self.mongodb.insert(self.objs[1])
        resGen = self.mongodb.query({}, ("lname"))
        names = []
        for lnameDict in resGen:
            resCount = self.mongodb.resultCount({})
            self.assertEqual(2, resCount)
            names.append(lnameDict['lname'])
        self.assertItemsEqual(['Corelli', 'DaVinci'], names,
                              "Did not receive expected lnames: %s" %
                              str(names))
Example #8
0
class EdxForumScrubber(object):
    '''
    
    Given a .bson file of OpenEdX Forum posts, load the file
    into a MongoDB. Then pull a post at a time, anonymize, and
    insert a selection of fields into a MySQL db. The MongoDb
    entries look like this::
    
    {   
    	"_id" : ObjectId("51b75a48f359c40a00000028"),
    	"_type" : "Comment",
    	"abuse_flaggers" : [ ],
    	"anonymous" : false,
    	"anonymous_to_peers" : false,
    	"at_position_list" : [ ],
    	"author_id" : "26344",
    	"author_username" : "Minelly48",
    	"body" : "I am Gwen.I am a nursing professor who took statistics many years ago and want to refresh my knowledge.",
    	"comment_thread_id" : ObjectId("51b754e5f359c40a0000001d"),
    	"course_id" : "Medicine/HRP258/Statistics_in_Medicine",
    	"created_at" : ISODate("2013-06-11T17:11:36.831Z"),
    	"endorsed" : false,
    	"historical_abuse_flaggers" : [ ],
    	"parent_ids" : [ ],
    	"updated_at" : ISODate("2013-06-11T17:11:36.831Z"),
    	"visible" : true,
    	"votes" : {
    		"count" : 2,
    		"down" : [ ],
    		"down_count" : 0,
    		"point" : 2,
    		"up" : [
    			"40325",
    			"20323"
    		],
    		"up_count" : 2
    	},
    	"sk" : "51b75a48f359c40a00000028"
    }    
    
    Depending on parameter allowAnonScreenName in the __init__() method,
    forum entries in the relational database will be associated with the
    same hash that is used to anonymize other parts of the OpenEdX data.
    
    '''
    
    LOG_DIR = '/home/dataman/Data/EdX/NonTransformLogs'

    # Pattern for email id - strings of alphabets/numbers/dots/hyphens followed
    # by an @ or at followed by combinations of dot/. followed by the edu/com
    # also, allow for spaces
    
    emailPattern='(.*)\s+([a-zA-Z0-9\(\.\-]+)[@]([a-zA-Z0-9\.]+)(.)(edu|com)\\s*(.*)'
    #emailPattern='(.*)\\s+([a-zA-Z0-9\\.]+)\\s*(\\(f.*b.*)?(@)\\s*([a-zA-Z0-9\\.\\s;]+)\\s*(\\.)\\s*(edu|com)\\s+(.*)'
    compiledEmailPattern = re.compile(emailPattern);

    # Pattern for replacing embedded double quotes in post bodies,
    # unless they are already escaped w/ a backslash. The
    # {0,1} means a match if zero or one repetition. It's
    # needed so that double quotes at the very start of a 
    # string are matched: no preceding character at all: 
    #doublQuoteReplPattern = re.compile(r'[^\\]{0,1}"')
    doublQuoteReplPattern = re.compile(r'[\\]{0,}"')
    
    def __init__(self, 
                 bsonFileName, 
                 mysqlDbObj=None, 
                 forumTableName='contents', 
                 allUsersTableName='EdxPrivate.UserGrade',
                 anonymize=True,
                 allowAnonScreenName=False):
        '''
        Given a .bson file containing OpenEdX Forum entries, anonymize the entries (if desired),
        and place them into a MySQL table.  
        
        :param bsonFileName: full path the .bson table. Set to None if instantiating
            for unit testing.
        :type bsonFileName: String
        :param mysqlDbObj: a pymysql_utils.MySQLDB object where anonymized entries are
            to be placed. If None, a new such object is created into MySQL db 'EdxForum'
        :type mysqlDbObj: MySQLDB
        :param forumTableName: name of table into which anonymized Forum entries are to be placed
        :type forumTableName: String
        :param allUsersTable: fully qualified name of table listing all in-the-clear mySQLUser names
            of users who post to the Forum. Used to redact their names from their own posts.
        :type allUsersTable: String
        :param anonymize: If true, Forum post entries in the MySQL table will be anonymized
        :type anonymize: bool
        :param allow_anon_screen_name: if True, then occurrences of poster's name in
            post bodies are replaced by <redacName_<anon_screen_name>>, where anon_screen_name
            is the hash used in other tables of the OpenEdX data.
        :type allow_anon_screen_name: Bool 
        '''
        
        self.bsonFileName = bsonFileName
        self.forumTableName = forumTableName
        self.forumDbName = 'EdxForum'
        self.allUsersTableName = allUsersTableName
        self.anonymize = anonymize
        self.allowAnonScreenName = allowAnonScreenName
        
        # If not unittest, but regular run, then mysqlDbObj is None
        if mysqlDbObj is None:
            self.mysql_passwd = self.getMySQLPasswd()
            self.mysql_dbhost ='localhost'
            self.mysql_user = getpass.getuser() # mySQLUser that started this process
            self.mydb = MySQLDB(mySQLUser=self.mysql_user, passwd=self.mysql_passwd, db=self.forumDbName)
        else:
            self.mydb = mysqlDbObj

        self.counter=0
        
        self.userCache = {}
        self.userSet   = set()

        warnings.filterwarnings('ignore', category=MySQLdb.Warning)        
        self.setupLogging()
        self.prepDatabase()

        #******mysqldb.commit();    
        #******logging.info('commit completed!')

    def runConversion(self):
        '''
        Do the actual work. We don't call this method from __init__()
        so that unittests can create an EdxForumScrubber instance without
        doing the actual work. Instead, unittests call individual methods. 
        '''
        self.populateUserCache();

        self.mongo_database_name = 'TmpForum'
        self.collection_name = 'contents'

        # Load bson file into Mongodb:
        self.loadForumIntoMongoDb(self.bsonFileName)
        self.mongodb = MongoDB(dbName=self.mongo_database_name, collection=self.collection_name)
        
        # Anonymize each forum record, and transfer to MySQL db:
        self.forumMongoToRelational(self.mongodb, self.mydb,'contents' )
        
        self.mydb.close()
        self.mongodb.close()
        self.logInfo('Entered %d records into %s' % (self.counter, self.forumDbName + self.forumTableName))

    def loadForumIntoMongoDb(self, bsonFilename):

        mongoclient = MongoClient();
        db = mongoclient[self.mongo_database_name];

        # Get collection object:
        collection = db[self.collection_name];

        # Clear out any old forum entries:
        self.logInfo('Preparing to delete the collection ')
        collection.remove()
        self.logInfo('Deleting mongo collection completed. Will now attempt a mongo restore')
        
        self.logInfo('Spawning subprocess to execute mongo restore')
        with open(self.logFilePath,'w') as outfile:
            ret = subprocess.call(
                   ['mongorestore',
                    '--drop',
                    '--db', self.mongo_database_name, 
                    '--collection', self.collection_name,
                    bsonFilename], 
                stdout=outfile, stderr=outfile)

            self.logDebug('Return value from mongorestore is %s' % (ret))

            objCount = subprocess.check_output(
                       ['mongo',
                        '--quiet',
                        '--eval',
                        'printjson(db.contents.count())',
                        self.mongo_database_name, 
                        ], 
                        stderr=outfile)
            self.numMongoItems = objCount
            
            self.logInfo('Available Forum posts %s' % objCount)

    def forumMongoToRelational(self, mongodb, mysqlDbObj, mysqlTable):
        '''
        Given a pymongo collection object in which Forum posts are stored,
        and a MySQL db object and table name, anonymize each mongo record,
        and insert it into the MySQL table.
        
        :param collection: collection object obtained via a mangoclient object
        :type collection: Collection
        :param mysqlDbObj: wrapper to MySQL db. See pymysql_utils.py
        :type mysqlDbObj: MYSQLDB
        :param mysqlTable: name of table where posts are to be deposited.
            Example: 'contents'.
        :type mysqlTable: String
        '''

        #command = 'mongorestore %s -db %s -mongoForumRec %s'%(self.bson_filename,self.mongo_database_name,self.collection_name)
        #print command
    
        self.logInfo('Will start inserting from mongo collection to MySQL')

        for mongoForumRec in mongodb.query({}):
            mongoRecordObj = MongoRecord(mongoForumRec)

            try:
                # Check whether 'up' can be converted to a list
                list(mongoRecordObj['up'])
            except Exception as e:
                self.logInfo('Error in conversion' + `e`)
                mongoRecordObj['up'] ='-1'
            
            self.insert_content_record(mysqlDbObj, mysqlTable, mongoRecordObj);
        
    def prepDatabase(self):
        '''
        Declare variables and execute statements preparing the database to 
        configure options - e.g.: setting char set to utf, connection type to utf
        truncating the already existing table.
        '''
        try:
            self.logDebug("Setting and assigning char set for mysqld. will truncate old values")
            self.mydb.execute('SET NAMES utf8;');
            self.mydb.execute('SET CHARACTER SET utf8;');
            self.mydb.execute('SET character_set_connection=utf8;');
            
            # Compose fully qualified table name from the db name to 
            # which self.mydb is connected, and the forum table name
            # that was established in __init__():
            fullTblName = self.mydb.dbName() + '.' + self.forumTableName
            # Clear old forum data out of the table:
            try:
                self.mydb.dropTable(fullTblName)
                # Create MySQL table for the posts. If we are to
                # anonymize, the poster name column will be 'screen_name',
                # else it will be 'anon_screen_name':
                self.createForumTable(self.anonymize)
                self.logDebug("setting and assigning char set complete. Truncation succeeded")                
            except ValueError as e:
                self.logDebug("Failed either to set character codes, or to create forum table %s: %s" % (fullTblName, `e`))
        
        except MySQLdb.Error,e:
            self.logInfo("MySql Error exiting %d: %s" % (e.args[0],e.args[1]))
            # print e
            sys.exit(1)
Example #9
0
class TestForumEtl(unittest.TestCase):

    # Forum rows have the following columns:
    #  type, anonymous, anonymous_to_peers, at_position_list, user_int_id, body, course_display_name, created_at, votes, count, down_count, up_count, up, down, comment_thread_id, parent_id, parent_ids, sk   

    # Correct result for relationization of tinyForum.json
    # (in <projDir>/src/forum_etl/data). This result is anonymized and not relatable,
    # i.e. poster name UIDs use integers, while other tables use hashes:
    tinyForumGoldAnonymized = \
    [
    # poster Otto van Homberg: body is clean to start with:
    ('anon_screen_name_redacted','CommentThread', 'False', 'False', '[]', 5L, 'Harmless body', 'MITx/6.002x/2012_Fall', datetime.datetime(2013, 5, 16, 4, 32, 20), "{u'count': 10, u'point': -6, u'down_count': 8, u'up': [u'2', u'10'], u'down': [u'1', u'3', u'4', u'5', u'6', u'7', u'8', u'9'], u'up_count': 2}", 10L, 8L, 2L, "['2', '10']", "['1', '3', '4', '5', '6', '7', '8', '9']", None, None, None, None),
    # poster Andreas Fritz: body has someone's email:
    ('anon_screen_name_redacted','Comment', 'False', 'False', '[]', 7L, ' Body with <emailRedac> email.', 'MITx/6.002x/2012_Fall', datetime.datetime(2013, 5, 16, 4, 32, 21), "{u'count': 10, u'point': -4, u'down_count': 7, u'up': [u'6', u'8', u'10'], u'down': [u'1', u'2', u'3', u'4', u'5', u'7', u'9'], u'up_count': 3}", 10L, 7L, 3L, "['6', '8', '10']", "['1', '2', '3', '4', '5', '7', '9']", '519461545924670200000001', None, '[]', '519461555924670200000006'),
    # poster Otto van Homberg: body has 'Otto':
    ('anon_screen_name_redacted','Comment', 'False', 'False', '[]', 5L, 'Body with poster name <nameRedac_anon_screen_name_redacted> embedded.', 'MITx/6.002x/2012_Fall', datetime.datetime(2013, 5, 16, 4, 32, 21), "{u'count': 0, u'point': 0, u'down_count': 0, u'up': [], u'down': [], u'up_count': 0}", 0L, 0L, 0L, '[]', '[]', '519461545924670200000001', '519461555924670200000006', "[u'519461555924670200000006']", '519461555924670200000006-519461555924670200000007'),
    # poster Andreas Fritz: body has a phone number:
    ('anon_screen_name_redacted','Comment', 'False', 'False', '[]', 10L, 'Body with <phoneRedac> a phone number.', 'MITx/6.002x/2012_Fall', datetime.datetime(2013, 5, 16, 4, 32, 21), "{u'count': 0, u'point': 0, u'down_count': 0, u'up': [], u'down': [], u'up_count': 0}", 0L, 0L, 0L, '[]', '[]', '519461545924670200000001', '519461545924670200000005', "[u'519461545924670200000005']", '519461545924670200000005-519461555924670200000008'),
    # poster Otto van Homberg: body has his screen name (otto_king):
    ('anon_screen_name_redacted','Comment', 'False', 'False', '[]', 5L, 'Body with poster screen name <nameRedac_anon_screen_name_redacted> embedded.', 'MITx/6.002x/2012_Fall', datetime.datetime(2013, 5, 16, 4, 32, 21), "{u'count': 0, u'point': 0, u'down_count': 0, u'up': [], u'down': [], u'up_count': 0}", 0L, 0L, 0L, '[]', '[]', '519461545924670200000001', '519461555924670200000006', "[u'519461555924670200000006']", '519461555924670200000006-519461555924670200000007'),    
    # poster Otto van Homberg: body has his full name (Otto van Homberg):
    ('anon_screen_name_redacted','Comment', 'False', 'False', '[]', 5L, 'Body with poster screen name <nameRedac_anon_screen_name_redacted> <nameRedac_anon_screen_name_redacted> <nameRedac_anon_screen_name_redacted> embedded.', 'MITx/6.002x/2012_Fall', datetime.datetime(2013, 5, 16, 4, 32, 21), "{u'count': 0, u'point': 0, u'down_count': 0, u'up': [], u'down': [], u'up_count': 0}", 0L, 0L, 0L, '[]', '[]', '519461545924670200000001', '519461555924670200000006', "[u'519461555924670200000006']", '519461555924670200000006-519461555924670200000007')    
    ]
    
    # Gold result for anonymization that allows relating to other tables (i.e. hashes are constant)
    tinyForumGoldRelatable = \
    [
    # poster Otto van Homberg: body is clean to start with:
    ('abc','CommentThread', 'False', 'False', '[]', 5L, 'Harmless body', 'MITx/6.002x/2012_Fall', datetime.datetime(2013, 5, 16, 4, 32, 20), "{u'count': 10, u'point': -6, u'down_count': 8, u'up': [u'2', u'10'], u'down': [u'1', u'3', u'4', u'5', u'6', u'7', u'8', u'9'], u'up_count': 2}", 10L, 8L, 2L, "['2', '10']", "['1', '3', '4', '5', '6', '7', '8', '9']", None, None, None, None),
    # poster Andreas Fritz: body has someone's email:
    ('def','Comment', 'False', 'False', '[]', 7L, ' Body with <emailRedac> email.', 'MITx/6.002x/2012_Fall', datetime.datetime(2013, 5, 16, 4, 32, 21), "{u'count': 10, u'point': -4, u'down_count': 7, u'up': [u'6', u'8', u'10'], u'down': [u'1', u'2', u'3', u'4', u'5', u'7', u'9'], u'up_count': 3}", 10L, 7L, 3L, "['6', '8', '10']", "['1', '2', '3', '4', '5', '7', '9']", '519461545924670200000001', None, '[]', '519461555924670200000006'),
    # poster Otto van Homberg: body has 'Otto':
    ('abc','Comment', 'False', 'False', '[]', 5L, 'Body with poster name <nameRedac_abc> embedded.', 'MITx/6.002x/2012_Fall', datetime.datetime(2013, 5, 16, 4, 32, 21), "{u'count': 0, u'point': 0, u'down_count': 0, u'up': [], u'down': [], u'up_count': 0}", 0L, 0L, 0L, '[]', '[]', '519461545924670200000001', '519461555924670200000006', "[u'519461555924670200000006']", '519461555924670200000006-519461555924670200000007'),
    # poster Andreas Fritz: body has a phone number:
    ('ghi','Comment', 'False', 'False', '[]', 10L, 'Body with <phoneRedac> a phone number.', 'MITx/6.002x/2012_Fall', datetime.datetime(2013, 5, 16, 4, 32, 21), "{u'count': 0, u'point': 0, u'down_count': 0, u'up': [], u'down': [], u'up_count': 0}", 0L, 0L, 0L, '[]', '[]', '519461545924670200000001', '519461545924670200000005', "[u'519461545924670200000005']", '519461545924670200000005-519461555924670200000008'),
    # poster Otto van Homberg: body has his screen name (otto_king):
    ('abc','Comment', 'False', 'False', '[]', 5L, 'Body with poster screen name <nameRedac_abc> embedded.', 'MITx/6.002x/2012_Fall', datetime.datetime(2013, 5, 16, 4, 32, 21), "{u'count': 0, u'point': 0, u'down_count': 0, u'up': [], u'down': [], u'up_count': 0}", 0L, 0L, 0L, '[]', '[]', '519461545924670200000001', '519461555924670200000006', "[u'519461555924670200000006']", '519461555924670200000006-519461555924670200000007'),    
    # poster Otto van Homberg: body has his full name (Otto van Homberg):
    ('abc','Comment', 'False', 'False', '[]', 5L, 'Body with poster screen name <nameRedac_abc> <nameRedac_abc> <nameRedac_abc> embedded.', 'MITx/6.002x/2012_Fall', datetime.datetime(2013, 5, 16, 4, 32, 21), "{u'count': 0, u'point': 0, u'down_count': 0, u'up': [], u'down': [], u'up_count': 0}", 0L, 0L, 0L, '[]', '[]', '519461545924670200000001', '519461555924670200000006', "[u'519461555924670200000006']", '519461555924670200000006-519461555924670200000007')    
    ]
    
    # Gold result for non-anonymized forum:
    tinyForumGoldClear = \
    [
    # poster Otto van Homberg: body is clean to start with:
    ('otto_king','CommentThread', 'False', 'False', '[]', 5L, 'Harmless body', 'MITx/6.002x/2012_Fall', datetime.datetime(2013, 5, 16, 4, 32, 20), "{u'count': 10, u'point': -6, u'down_count': 8, u'up': [u'2', u'10'], u'down': [u'1', u'3', u'4', u'5', u'6', u'7', u'8', u'9'], u'up_count': 2}", 10L, 8L, 2L, "['2', '10']", "['1', '3', '4', '5', '6', '7', '8', '9']", None, None, None, None),
    # poster Andreas Fritz: body has someone's email:
    ('fritzL','Comment', 'False', 'False', '[]', 7L, ' Body with [email protected] email.', 'MITx/6.002x/2012_Fall', datetime.datetime(2013, 5, 16, 4, 32, 21), "{u'count': 10, u'point': -4, u'down_count': 7, u'up': [u'6', u'8', u'10'], u'down': [u'1', u'2', u'3', u'4', u'5', u'7', u'9'], u'up_count': 3}", 10L, 7L, 3L, "['6', '8', '10']", "['1', '2', '3', '4', '5', '7', '9']", '519461545924670200000001', None, '[]', '519461555924670200000006'),
    # poster Otto van Homberg: body has 'Otto':
    ('otto_king','Comment', 'False', 'False', '[]', 5L, 'Body with poster name Otto embedded.', 'MITx/6.002x/2012_Fall', datetime.datetime(2013, 5, 16, 4, 32, 21), "{u'count': 0, u'point': 0, u'down_count': 0, u'up': [], u'down': [], u'up_count': 0}", 0L, 0L, 0L, '[]', '[]', '519461545924670200000001', '519461555924670200000006', "[u'519461555924670200000006']", '519461555924670200000006-519461555924670200000007'),
    # poster Andreas Fritz: body has a phone number:
    ('bebeW','Comment', 'False', 'False', '[]', 10L, 'Body with 650-333-4567 a phone number.', 'MITx/6.002x/2012_Fall', datetime.datetime(2013, 5, 16, 4, 32, 21), "{u'count': 0, u'point': 0, u'down_count': 0, u'up': [], u'down': [], u'up_count': 0}", 0L, 0L, 0L, '[]', '[]', '519461545924670200000001', '519461545924670200000005', "[u'519461545924670200000005']", '519461545924670200000005-519461555924670200000008'),
    # poster Otto van Homberg: body has his screen name (otto_king):
    ('otto_king','Comment', 'False', 'False', '[]', 5L, 'Body with poster screen name otto_king embedded.', 'MITx/6.002x/2012_Fall', datetime.datetime(2013, 5, 16, 4, 32, 21), "{u'count': 0, u'point': 0, u'down_count': 0, u'up': [], u'down': [], u'up_count': 0}", 0L, 0L, 0L, '[]', '[]', '519461545924670200000001', '519461555924670200000006', "[u'519461555924670200000006']", '519461555924670200000006-519461555924670200000007'),    
    # poster Otto van Homberg: body has his full name (Otto van Homberg):
    ('otto_king','Comment', 'False', 'False', '[]', 5L, 'Body with poster screen name Otto van Homberg embedded.', 'MITx/6.002x/2012_Fall', datetime.datetime(2013, 5, 16, 4, 32, 21), "{u'count': 0, u'point': 0, u'down_count': 0, u'up': [], u'down': [], u'up_count': 0}", 0L, 0L, 0L, '[]', '[]', '519461545924670200000001', '519461555924670200000006', "[u'519461555924670200000006']", '519461555924670200000006-519461555924670200000007')    
    ]    

    def setUp(self):
        
        self.mongoDb = MongoDB(dbName="unittest", collection="tinyForum")
        # Fill the little MongoDB with test JSON lines
        self.resetMongoTestDb()
        
        self.mysqldb = MySQLDB(user='******', db='unittest')
        # Start with an empty result MySQL table for each test:
        self.mysqldb.dropTable('contents')
        # Fill the fake UserGrade table with records of course participants:
        self.resetMySQLUserListDb()
        
        # Instantiate a Forum scrubber without the 
        # name of a bson file that contains forum
        # records. That 'None' for the bson file will
        # make the class understand that it's being
        # instantiated for a unit test. 
        self.forumScrubberAnonymized = EdxForumScrubber(None, mysqlDbObj=self.mysqldb, forumTableName='contents', allUsersTableName='unittest.UserGrade')
        self.forumScrubberRelatable  = EdxForumScrubber(None, mysqlDbObj=self.mysqldb, forumTableName='contents', allUsersTableName='unittest.UserGrade', allowAnonScreenName=True)
        self.forumScrubberClear      = EdxForumScrubber(None, mysqlDbObj=self.mysqldb, forumTableName='contents', allUsersTableName='unittest.UserGrade', anonymize=False)

    def tearDown(self):
        self.mysqldb.close()

    @unittest.skipIf(not RUN_ALL_TESTS, 
                     'Uncomment this decoration if RUN_ALL_TESTS is False, and you want to run just this test.')    
    def testAnonymized(self):
        self.forumScrubberAnonymized.populateUserCache()
        self.forumScrubberAnonymized.forumMongoToRelational(self.mongoDb, self.mysqldb, 'contents')  
        for rowNum, forumPost in enumerate(self.mysqldb.query('SELECT * FROM unittest.contents')):
            # print(str(rowNum) + ':' + str(forumPost))
            self.assertEqual(TestForumEtl.tinyForumGoldAnonymized[rowNum], forumPost)
            
    @unittest.skipIf(not RUN_ALL_TESTS, 
                     'Uncomment this decoration if RUN_ALL_TESTS is False, and you want to run just this test.')    
    def testNonAnonymizedRelatable(self):
        self.forumScrubberRelatable.populateUserCache()
        self.forumScrubberRelatable.forumMongoToRelational(self.mongoDb, self.mysqldb, 'contents')  
        for rowNum, forumPost in enumerate(self.mysqldb.query('SELECT * FROM unittest.contents')):
            # print(str(rowNum) + ':' + str(forumPost))
            self.assertEqual(TestForumEtl.tinyForumGoldRelatable[rowNum], forumPost)

    @unittest.skipIf(not RUN_ALL_TESTS, 
                     'Uncomment this decoration if RUN_ALL_TESTS is False, and you want to run just this test.')    
    def testNonAnonymized(self):
        self.forumScrubberClear.populateUserCache()
        self.forumScrubberClear.forumMongoToRelational(self.mongoDb, self.mysqldb, 'contents')  
        for rowNum, forumPost in enumerate(self.mysqldb.query('SELECT * FROM unittest.contents')):
            # print(str(rowNum) + ':' + str(forumPost))
            self.assertEqual(TestForumEtl.tinyForumGoldClear[rowNum], forumPost)


    
    def resetMongoTestDb(self):
        self.mongoDb.clearCollection()
        # Use small, known forum collection:
        currDir = os.path.dirname(__file__)     
        with open(os.path.join(currDir, 'data/tinyForum.json'), 'r') as jsonFd:
            for line in jsonFd:
                forumPost = json.loads(line)
                self.mongoDb.insert(forumPost)

    def resetMySQLUserListDb(self):
        '''
        Prepare a MySQL table that mimicks EdxPrivate.UserGrade.
        '''
        
        userGradeColSpecs = OrderedDict(
                                        {
                                         'name' : 'varchar(255)',
                                         'screen_name' : 'varchar(255)',
                                         'grade' : 'int',
                                         'course_id' : 'varchar(255)',
                                         'distinction' : 'tinyint',
                                         'status' : 'varchar(50)',
                                         'user_int_id' : 'int(11)',
                                         'anon_screen_name' : 'varchar(40)'
                                         })
        self.mysqldb.dropTable('UserGrade')
        self.mysqldb.createTable('UserGrade', userGradeColSpecs)
        self.mysqldb.bulkInsert('UserGrade', 
                                ('name','screen_name','grade','course_id','distinction','status','user_int_id','anon_screen_name'),
                                [
                                 ('Otto van Homberg','otto_king',5,'oldCourse',0,'notpassing',5,'abc'),
                                 ('Andreas Fritz','fritzL',2,'newCourse',0,'notpassing',7,'def'),
                                 ('Bebe Winter', 'bebeW',10,'History of Baking',1,'passing',10,'ghi')
                                 ])
Example #10
0
class TestForumEtl(unittest.TestCase):

    # Forum rows have the following columns:
    #  type, anonymous, anonymous_to_peers, at_position_list, user_int_id, body, course_display_name, created_at, votes, count, down_count, up_count, up, down, comment_thread_id, parent_id, parent_ids, sk   

    # Correct result for relationization of tinyForum.json
    # (in <projDir>/src/forum_etl/data). This result is anonymized and not relatable,
    # i.e. poster name UIDs use integers, while other tables use hashes:
    tinyForumGoldAnonymized = \
    [
    # poster Otto van Homberg: body is clean to start with:
    ('anon_screen_name_redacted','CommentThread', 'False', 'False', '[]', 5L, 'Harmless body', 'MITx/6.002x/2012_Fall', datetime.datetime(2013, 5, 16, 4, 32, 20), "{u'count': 10, u'point': -6, u'down_count': 8, u'up': [u'2', u'10'], u'down': [u'1', u'3', u'4', u'5', u'6', u'7', u'8', u'9'], u'up_count': 2}", 10L, 8L, 2L, "['2', '10']", "['1', '3', '4', '5', '6', '7', '8', '9']", None, None, None, None),
    # poster Andreas Fritz: body has someone's email:
    ('anon_screen_name_redacted','Comment', 'False', 'False', '[]', 7L, ' Body with <emailRedac> email.', 'MITx/6.002x/2012_Fall', datetime.datetime(2013, 5, 16, 4, 32, 21), "{u'count': 10, u'point': -4, u'down_count': 7, u'up': [u'6', u'8', u'10'], u'down': [u'1', u'2', u'3', u'4', u'5', u'7', u'9'], u'up_count': 3}", 10L, 7L, 3L, "['6', '8', '10']", "['1', '2', '3', '4', '5', '7', '9']", '519461545924670200000001', None, '[]', '519461555924670200000006'),
    # poster Otto van Homberg: body has 'Otto':
    ('anon_screen_name_redacted','Comment', 'False', 'False', '[]', 5L, 'Body with poster name <nameRedac_anon_screen_name_redacted> embedded.', 'MITx/6.002x/2012_Fall', datetime.datetime(2013, 5, 16, 4, 32, 21), "{u'count': 0, u'point': 0, u'down_count': 0, u'up': [], u'down': [], u'up_count': 0}", 0L, 0L, 0L, '[]', '[]', '519461545924670200000001', '519461555924670200000006', "[u'519461555924670200000006']", '519461555924670200000006-519461555924670200000007'),
    # poster Andreas Fritz: body has a phone number:
    ('anon_screen_name_redacted','Comment', 'False', 'False', '[]', 10L, 'Body with <phoneRedac> a phone number.', 'MITx/6.002x/2012_Fall', datetime.datetime(2013, 5, 16, 4, 32, 21), "{u'count': 0, u'point': 0, u'down_count': 0, u'up': [], u'down': [], u'up_count': 0}", 0L, 0L, 0L, '[]', '[]', '519461545924670200000001', '519461545924670200000005', "[u'519461545924670200000005']", '519461545924670200000005-519461555924670200000008'),
    # poster Otto van Homberg: body has his screen name (otto_king):
    ('anon_screen_name_redacted','Comment', 'False', 'False', '[]', 5L, 'Body with poster screen name <nameRedac_anon_screen_name_redacted> embedded.', 'MITx/6.002x/2012_Fall', datetime.datetime(2013, 5, 16, 4, 32, 21), "{u'count': 0, u'point': 0, u'down_count': 0, u'up': [], u'down': [], u'up_count': 0}", 0L, 0L, 0L, '[]', '[]', '519461545924670200000001', '519461555924670200000006', "[u'519461555924670200000006']", '519461555924670200000006-519461555924670200000007'),    
    # poster Otto van Homberg: body has his full name (Otto van Homberg):
    ('anon_screen_name_redacted','Comment', 'False', 'False', '[]', 5L, 'Body with poster screen name <nameRedac_anon_screen_name_redacted> <nameRedac_anon_screen_name_redacted> <nameRedac_anon_screen_name_redacted> embedded.', 'MITx/6.002x/2012_Fall', datetime.datetime(2013, 5, 16, 4, 32, 21), "{u'count': 0, u'point': 0, u'down_count': 0, u'up': [], u'down': [], u'up_count': 0}", 0L, 0L, 0L, '[]', '[]', '519461545924670200000001', '519461555924670200000006', "[u'519461555924670200000006']", '519461555924670200000006-519461555924670200000007')    
    ]
    
    # Gold result for anonymization that allows relating to other tables (i.e. hashes are constant)
    tinyForumGoldRelatable = \
    [
    # poster Otto van Homberg: body is clean to start with:
    ('abc','CommentThread', 'False', 'False', '[]', 5L, 'Harmless body', 'MITx/6.002x/2012_Fall', datetime.datetime(2013, 5, 16, 4, 32, 20), "{u'count': 10, u'point': -6, u'down_count': 8, u'up': [u'2', u'10'], u'down': [u'1', u'3', u'4', u'5', u'6', u'7', u'8', u'9'], u'up_count': 2}", 10L, 8L, 2L, "['2', '10']", "['1', '3', '4', '5', '6', '7', '8', '9']", None, None, None, None),
    # poster Andreas Fritz: body has someone's email:
    ('def','Comment', 'False', 'False', '[]', 7L, ' Body with <emailRedac> email.', 'MITx/6.002x/2012_Fall', datetime.datetime(2013, 5, 16, 4, 32, 21), "{u'count': 10, u'point': -4, u'down_count': 7, u'up': [u'6', u'8', u'10'], u'down': [u'1', u'2', u'3', u'4', u'5', u'7', u'9'], u'up_count': 3}", 10L, 7L, 3L, "['6', '8', '10']", "['1', '2', '3', '4', '5', '7', '9']", '519461545924670200000001', None, '[]', '519461555924670200000006'),
    # poster Otto van Homberg: body has 'Otto':
    ('abc','Comment', 'False', 'False', '[]', 5L, 'Body with poster name <nameRedac_abc> embedded.', 'MITx/6.002x/2012_Fall', datetime.datetime(2013, 5, 16, 4, 32, 21), "{u'count': 0, u'point': 0, u'down_count': 0, u'up': [], u'down': [], u'up_count': 0}", 0L, 0L, 0L, '[]', '[]', '519461545924670200000001', '519461555924670200000006', "[u'519461555924670200000006']", '519461555924670200000006-519461555924670200000007'),
    # poster Andreas Fritz: body has a phone number:
    ('ghi','Comment', 'False', 'False', '[]', 10L, 'Body with <phoneRedac> a phone number.', 'MITx/6.002x/2012_Fall', datetime.datetime(2013, 5, 16, 4, 32, 21), "{u'count': 0, u'point': 0, u'down_count': 0, u'up': [], u'down': [], u'up_count': 0}", 0L, 0L, 0L, '[]', '[]', '519461545924670200000001', '519461545924670200000005', "[u'519461545924670200000005']", '519461545924670200000005-519461555924670200000008'),
    # poster Otto van Homberg: body has his screen name (otto_king):
    ('abc','Comment', 'False', 'False', '[]', 5L, 'Body with poster screen name <nameRedac_abc> embedded.', 'MITx/6.002x/2012_Fall', datetime.datetime(2013, 5, 16, 4, 32, 21), "{u'count': 0, u'point': 0, u'down_count': 0, u'up': [], u'down': [], u'up_count': 0}", 0L, 0L, 0L, '[]', '[]', '519461545924670200000001', '519461555924670200000006', "[u'519461555924670200000006']", '519461555924670200000006-519461555924670200000007'),    
    # poster Otto van Homberg: body has his full name (Otto van Homberg):
    ('abc','Comment', 'False', 'False', '[]', 5L, 'Body with poster screen name <nameRedac_abc> <nameRedac_abc> <nameRedac_abc> embedded.', 'MITx/6.002x/2012_Fall', datetime.datetime(2013, 5, 16, 4, 32, 21), "{u'count': 0, u'point': 0, u'down_count': 0, u'up': [], u'down': [], u'up_count': 0}", 0L, 0L, 0L, '[]', '[]', '519461545924670200000001', '519461555924670200000006', "[u'519461555924670200000006']", '519461555924670200000006-519461555924670200000007')    
    ]
    
    # Gold result for non-anonymized forum:
    tinyForumGoldClear = \
    [
    # poster Otto van Homberg: body is clean to start with:
    ('otto_king','CommentThread', 'False', 'False', '[]', 5L, 'Harmless body', 'MITx/6.002x/2012_Fall', datetime.datetime(2013, 5, 16, 4, 32, 20), "{u'count': 10, u'point': -6, u'down_count': 8, u'up': [u'2', u'10'], u'down': [u'1', u'3', u'4', u'5', u'6', u'7', u'8', u'9'], u'up_count': 2}", 10L, 8L, 2L, "['2', '10']", "['1', '3', '4', '5', '6', '7', '8', '9']", None, None, None, None),
    # poster Andreas Fritz: body has someone's email:
    ('fritzL','Comment', 'False', 'False', '[]', 7L, ' Body with [email protected] email.', 'MITx/6.002x/2012_Fall', datetime.datetime(2013, 5, 16, 4, 32, 21), "{u'count': 10, u'point': -4, u'down_count': 7, u'up': [u'6', u'8', u'10'], u'down': [u'1', u'2', u'3', u'4', u'5', u'7', u'9'], u'up_count': 3}", 10L, 7L, 3L, "['6', '8', '10']", "['1', '2', '3', '4', '5', '7', '9']", '519461545924670200000001', None, '[]', '519461555924670200000006'),
    # poster Otto van Homberg: body has 'Otto':
    ('otto_king','Comment', 'False', 'False', '[]', 5L, 'Body with poster name Otto embedded.', 'MITx/6.002x/2012_Fall', datetime.datetime(2013, 5, 16, 4, 32, 21), "{u'count': 0, u'point': 0, u'down_count': 0, u'up': [], u'down': [], u'up_count': 0}", 0L, 0L, 0L, '[]', '[]', '519461545924670200000001', '519461555924670200000006', "[u'519461555924670200000006']", '519461555924670200000006-519461555924670200000007'),
    # poster Andreas Fritz: body has a phone number:
    ('bebeW','Comment', 'False', 'False', '[]', 10L, 'Body with 650-333-4567 a phone number.', 'MITx/6.002x/2012_Fall', datetime.datetime(2013, 5, 16, 4, 32, 21), "{u'count': 0, u'point': 0, u'down_count': 0, u'up': [], u'down': [], u'up_count': 0}", 0L, 0L, 0L, '[]', '[]', '519461545924670200000001', '519461545924670200000005', "[u'519461545924670200000005']", '519461545924670200000005-519461555924670200000008'),
    # poster Otto van Homberg: body has his screen name (otto_king):
    ('otto_king','Comment', 'False', 'False', '[]', 5L, 'Body with poster screen name otto_king embedded.', 'MITx/6.002x/2012_Fall', datetime.datetime(2013, 5, 16, 4, 32, 21), "{u'count': 0, u'point': 0, u'down_count': 0, u'up': [], u'down': [], u'up_count': 0}", 0L, 0L, 0L, '[]', '[]', '519461545924670200000001', '519461555924670200000006', "[u'519461555924670200000006']", '519461555924670200000006-519461555924670200000007'),    
    # poster Otto van Homberg: body has his full name (Otto van Homberg):
    ('otto_king','Comment', 'False', 'False', '[]', 5L, 'Body with poster screen name Otto van Homberg embedded.', 'MITx/6.002x/2012_Fall', datetime.datetime(2013, 5, 16, 4, 32, 21), "{u'count': 0, u'point': 0, u'down_count': 0, u'up': [], u'down': [], u'up_count': 0}", 0L, 0L, 0L, '[]', '[]', '519461545924670200000001', '519461555924670200000006', "[u'519461555924670200000006']", '519461555924670200000006-519461555924670200000007')    
    ]    

    def setUp(self):
        
        self.mongoDb = MongoDB(dbName="unittest", collection="tinyForum")
        # Fill the little MongoDB with test JSON lines
        self.resetMongoTestDb()
        
        self.mysqldb = MySQLDB(mySQLUser='******', db='unittest')
        # Start with an empty result MySQL table for each test:
        self.mysqldb.dropTable('contents')
        # Fill the fake UserGrade table with records of course participants:
        self.resetMySQLUserListDb()
        
        # Instantiate a Forum scrubber without the 
        # name of a bson file that contains forum
        # records. That 'None' for the bson file will
        # make the class understand that it's being
        # instantiated for a unit test. 
        self.forumScrubberAnonymized = EdxForumScrubber(None, mysqlDbObj=self.mysqldb, forumTableName='contents', allUsersTableName='unittest.UserGrade')
        self.forumScrubberRelatable  = EdxForumScrubber(None, mysqlDbObj=self.mysqldb, forumTableName='contents', allUsersTableName='unittest.UserGrade', allowAnonScreenName=True)
        self.forumScrubberClear      = EdxForumScrubber(None, mysqlDbObj=self.mysqldb, forumTableName='contents', allUsersTableName='unittest.UserGrade', anonymize=False)

    def tearDown(self):
        self.mysqldb.close()

    @unittest.skipIf(not RUN_ALL_TESTS, 
                     'Uncomment this decoration if RUN_ALL_TESTS is False, and you want to run just this test.')    
    def testAnonymized(self):
        self.forumScrubberAnonymized.populateUserCache()
        self.forumScrubberAnonymized.forumMongoToRelational(self.mongoDb, self.mysqldb, 'contents')  
        for rowNum, forumPost in enumerate(self.mysqldb.query('SELECT * FROM unittest.contents')):
            # print(str(rowNum) + ':' + str(forumPost))
            self.assertEqual(TestForumEtl.tinyForumGoldAnonymized[rowNum], forumPost)
            
    @unittest.skipIf(not RUN_ALL_TESTS, 
                     'Uncomment this decoration if RUN_ALL_TESTS is False, and you want to run just this test.')    
    def testNonAnonymizedRelatable(self):
        self.forumScrubberRelatable.populateUserCache()
        self.forumScrubberRelatable.forumMongoToRelational(self.mongoDb, self.mysqldb, 'contents')  
        for rowNum, forumPost in enumerate(self.mysqldb.query('SELECT * FROM unittest.contents')):
            # print(str(rowNum) + ':' + str(forumPost))
            self.assertEqual(TestForumEtl.tinyForumGoldRelatable[rowNum], forumPost)

    @unittest.skipIf(not RUN_ALL_TESTS, 
                     'Uncomment this decoration if RUN_ALL_TESTS is False, and you want to run just this test.')    
    def testNonAnonymized(self):
        self.forumScrubberClear.populateUserCache()
        self.forumScrubberClear.forumMongoToRelational(self.mongoDb, self.mysqldb, 'contents')  
        for rowNum, forumPost in enumerate(self.mysqldb.query('SELECT * FROM unittest.contents')):
            # print(str(rowNum) + ':' + str(forumPost))
            self.assertEqual(TestForumEtl.tinyForumGoldClear[rowNum], forumPost)


    
    def resetMongoTestDb(self):
        self.mongoDb.clearCollection()
        # Use small, known forum collection:
        currDir = os.path.dirname(__file__)     
        with open(os.path.join(currDir, 'data/tinyForum.json'), 'r') as jsonFd:
            for line in jsonFd:
                forumPost = json.loads(line)
                self.mongoDb.insert(forumPost)

    def resetMySQLUserListDb(self):
        '''
        Prepare a MySQL table that mimicks EdxPrivate.UserGrade.
        '''
        
        userGradeColSpecs = OrderedDict(
                                        {
                                         'name' : 'varchar(255)',
                                         'screen_name' : 'varchar(255)',
                                         'grade' : 'int',
                                         'course_id' : 'varchar(255)',
                                         'distinction' : 'tinyint',
                                         'status' : 'varchar(50)',
                                         'user_int_id' : 'int(11)',
                                         'anon_screen_name' : 'varchar(40)'
                                         })
        self.mysqldb.dropTable('UserGrade')
        self.mysqldb.createTable('UserGrade', userGradeColSpecs)
        self.mysqldb.bulkInsert('UserGrade', 
                                ('name','screen_name','grade','course_id','distinction','status','user_int_id','anon_screen_name'),
                                [
                                 ('Otto van Homberg','otto_king',5,'oldCourse',0,'notpassing',5,'abc'),
                                 ('Andreas Fritz','fritzL',2,'newCourse',0,'notpassing',7,'def'),
                                 ('Bebe Winter', 'bebeW',10,'History of Baking',1,'passing',10,'ghi')
                                 ])
class MongoTest(unittest.TestCase):
    '''
    Test the mongodb.py module. Uses a library that fakes
    a MongoDB server. See https://pypi.python.org/pypi/mongomock/1.0.1
    '''

    def setUp(self):
        self.objs = [{"fname" : "Franco", "lname" : "Corelli"}, 
                     {"fname" : "Leonardo", "lname" : "DaVinci", "age" : 300},
                     {"fname" : "Franco", "lname" : "Gandolpho"}]
        
        self.mongodb = MongoDB(dbName='unittest', collection='unittest')
        self.mongodb.clearCollection(collection="unittest")
        self.mongodb.clearCollection(collection="new_coll")
        self.mongodb.setCollection("unittest")

    def tearDown(self):
        self.mongodb.dropCollection(collection='unittest')
        self.mongodb.dropCollection(collection='new_coll')
        self.mongodb.close()

    @unittest.skipIf(not TEST_ALL, "Skipping")
    def test_update_and_find_one(self):
        self.mongodb.insert(self.objs[0])
        # Get a generator for the results:
        resGen = self.mongodb.query({"fname" : "Franco"}, limit=1, collection="unittest")
        res = resGen.next()
        self.assertEqual('Corelli', res['lname'], "Failed retrieval of single obj; expected '%s' but got '%s'" % ('Corelli', res['lname']))

    @unittest.skipIf(not TEST_ALL, "Skipping")
    def test_set_coll_use_different_coll(self):
        # Insert into unittest:
        self.mongodb.insert(self.objs[0])
        # Switch to new_coll:
        self.mongodb.setCollection('new_coll')
        self.mongodb.insert({"recommendation" : "Hawaii"})
        
        # We're in new_coll; the following should be empty result:
        self.mongodb.query({"fname" : "Franco"}, limit=1)
        resCount = self.mongodb.resultCount({"fname" : "Franco"})
        self.assertIsNone(resCount, "Got non-null result that should be null: %s" % resCount)
        
        # But this search is within new_coll, and should succeed:
        resGen = self.mongodb.query({"recommendation" : {'$regex' : '.*'}}, limit=1)
        res = resGen.next()
        self.assertEqual('Hawaii', res['recommendation'], "Failed retrieval of single obj; expected '%s' but got '%s'" % ('Hawaii', res['recommendation']))
        
        # Try inline collection switch:
        resGen = self.mongodb.query({"fname" : "Franco"}, limit=1, collection="unittest")
        res = resGen.next()
        self.assertEqual('Corelli', res['lname'], "Failed retrieval of single obj; expected '%s' but got '%s'" % ('Corelli', res['lname']))

        # But the default collection should still be new_coll,
        # so a search with unspecified coll should be in new_coll:
        resGen = self.mongodb.query({"recommendation" : {'$regex' : '.*'}}, limit=1)
        res = resGen.next()
        self.assertEqual('Hawaii', res['recommendation'], "Failed retrieval of single obj; expected '%s' but got '%s'" % ('Hawaii', res['recommendation']))

    @unittest.skipIf(not TEST_ALL, "Skipping")
    def test_multi_result(self):
        # Insert two docs with fname == Franco:
        self.mongodb.insert(self.objs[0])
        self.mongodb.insert(self.objs[2])
        resGen = self.mongodb.query({"fname" : "Franco"})
        # To get result count, must retrieve at least one result first:
        resGen.next()
        resCount = self.mongodb.resultCount({"fname" : "Franco"})
        if resCount != 2:
            self.fail("Added two Franco objects, but only %s are found." % str(resCount))
            
    @unittest.skipIf(not TEST_ALL, "Skipping")
    def test_clear_collection(self):
        self.mongodb.insert({"foo" : 10})
        resGen = self.mongodb.query({"foo" : 10}, limit=1)
        res = resGen.next()
        self.assertIsNotNone(res, "Did not find document that was just inserted.")
        self.mongodb.clearCollection()
        resGen = self.mongodb.query({"foo" : 10}, limit=1)
        self.assertRaises(StopIteration, resGen.next)
        
    @unittest.skipIf(not TEST_ALL, "Skipping")
    def test_only_some_return_columns(self):
        # Also tests the suppression of _id col when desired:
        self.mongodb.insert(self.objs[0])
        self.mongodb.insert(self.objs[1])
        resGen = self.mongodb.query({}, ("lname"))
        names = []
        for lnameDict in resGen:
            resCount = self.mongodb.resultCount({})
            self.assertEqual(2, resCount)
            names.append(lnameDict['lname'])
        self.assertItemsEqual(['Corelli','DaVinci'], names, "Did not receive expected lnames: %s" % str(names))