class EdxForumScrubber(object):
    '''
    Given a .bson file of OpenEdX Forum posts, load the file
    into a MongoDB. Then pull a post at a time, anonymize, and
    insert a selection of fields into a MySQL db. The MongoDb
    entries look like this::

        {
            "_id" : ObjectId("51b75a48f359c40a00000028"),
            "_type" : "Comment",
            "abuse_flaggers" : [ ],
            "anonymous" : false,
            "anonymous_to_peers" : false,
            "at_position_list" : [ ],
            "author_id" : "26344",
            "author_username" : "Minelly48",
            "body" : "I am Gwen.I am a nursing professor who took statistics many years ago and want to refresh my knowledge.",
            "comment_thread_id" : ObjectId("51b754e5f359c40a0000001d"),
            "course_id" : "Medicine/HRP258/Statistics_in_Medicine",
            "created_at" : ISODate("2013-06-11T17:11:36.831Z"),
            "endorsed" : false,
            "historical_abuse_flaggers" : [ ],
            "parent_ids" : [ ],
            "updated_at" : ISODate("2013-06-11T17:11:36.831Z"),
            "visible" : true,
            "votes" : {
                "count" : 2,
                "down" : [ ],
                "down_count" : 0,
                "point" : 2,
                "up" : [
                    "40325",
                    "20323"
                ],
                "up_count" : 2
            },
            "sk" : "51b75a48f359c40a00000028"
        }

    Depending on parameter allowAnonScreenName in the __init__() method,
    forum entries in the relational database will be associated with the
    same hash that is used to anonymize other parts of the OpenEdX data.
    '''

    LOG_DIR = '/home/dataman/Data/EdX/NonTransformLogs'

    # Pattern for email id - strings of alphabets/numbers/dots/hyphens followed
    # by an @ followed by combinations of dot/. followed by the edu/com;
    # also, allow for spaces. Written as a raw string so that the regex
    # escapes (\s, \(, \., \-) are not interpreted as (invalid) string escapes:
    emailPattern = r'(.*)\s+([a-zA-Z0-9\(\.\-]+)[@]([a-zA-Z0-9\.]+)(.)(edu|com)\s*(.*)'
    # Earlier variant, kept for reference:
    #emailPattern='(.*)\\s+([a-zA-Z0-9\\.]+)\\s*(\\(f.*b.*)?(@)\\s*([a-zA-Z0-9\\.\\s;]+)\\s*(\\.)\\s*(edu|com)\\s+(.*)'
    compiledEmailPattern = re.compile(emailPattern)

    # Pattern for replacing embedded double quotes in post bodies.
    # Matches a double quote together with any run (zero or more) of
    # backslashes immediately preceding it, so that quotes at the very
    # start of a string (no preceding character at all) are matched, too.
    # Earlier variant, kept for reference:
    #doublQuoteReplPattern = re.compile(r'[^\\]{0,1}"')
    doublQuoteReplPattern = re.compile(r'[\\]{0,}"')

    # Schema of EdxForum.contents: an ordered dict that is
    # used twice: the table creation MySQL command is constructed
    # from this dict, and the dict is used to ensure that
    # all its keys (i.e. future column names) are present
    # in each MongoDB object. See also createForumTable().
    # In createForumTable() either entry anon_screen_name,
    # or screen_name in the dict below will be deleted, based
    # on whether we are asked to anonymize or not:
    forumSchema = OrderedDict()
    forumSchema['forum_post_id'] = "varchar(40) NOT NULL DEFAULT 'unavailable'"
    forumSchema['anon_screen_name'] = "varchar(40) NOT NULL DEFAULT 'anon_screen_name_redacted'"  # This or next deleted based on anonymize yes/no
    forumSchema['screen_name'] = "varchar(40) NOT NULL DEFAULT 'anon_screen_name_redacted'"       # This or prev deleted based on anonymize yes/no
    forumSchema['type'] = "varchar(20) NOT NULL"
    forumSchema['anonymous'] = "varchar(10) NOT NULL"
    forumSchema['anonymous_to_peers'] = "varchar(10) NOT NULL"
    forumSchema['at_position_list'] = "varchar(200) NOT NULL"
    forumSchema['forum_uid'] = "varchar(40) NOT NULL"
    forumSchema['body'] = "TEXT NOT NULL"  # "varchar(2500) NOT NULL"
    forumSchema['course_display_name'] = "varchar(100) NOT NULL"
    forumSchema['created_at'] = "datetime NOT NULL"
    forumSchema['votes'] = "TEXT NOT NULL"  # "varchar(200) NOT NULL"
    forumSchema['count'] = "int(11) NOT NULL"
    forumSchema['down_count'] = "int(11) NOT NULL"
    forumSchema['up_count'] = "int(11) NOT NULL"
    forumSchema['up'] = "varchar(200) DEFAULT NULL"
    forumSchema['down'] = "varchar(200) DEFAULT NULL"
    forumSchema['comment_thread_id'] = "varchar(255) DEFAULT NULL"
    forumSchema['parent_id'] = "varchar(255) DEFAULT NULL"
    forumSchema['parent_ids'] = "varchar(255) DEFAULT NULL"
    forumSchema['sk'] = "varchar(255) DEFAULT NULL"
    forumSchema['confusion'] = "varchar(20) NOT NULL DEFAULT ''"
    forumSchema['happiness'] = "varchar(20) NOT NULL DEFAULT ''"

    def __init__(self,
                 bsonFileName,
                 mysqlDbObj=None,
                 forumTableName='contents',
                 allUsersTableName='EdxPrivate.UserGrade',
                 anonymize=True,
                 allowAnonScreenName=False):
        '''
        Given a .bson file containing OpenEdX Forum entries,
        anonymize the entries (if desired), and place them into a
        MySQL table. Construction already connects to MySQL (unless
        a db object is passed in), sets up logging, and (re)creates
        the forum table; the conversion itself is started via
        runConversion().

        :param bsonFileName: full path the .bson table. Set to None if instantiating
            for unit testing.
        :type bsonFileName: String
        :param mysqlDbObj: a pymysql_utils.MySQLDB object where anonymized entries are
            to be placed. If None, a new such object is created into MySQL db 'EdxForum'
        :type mysqlDbObj: MySQLDB
        :param forumTableName: name of table into which anonymized Forum entries are to be placed
        :type forumTableName: String
        :param allUsersTableName: fully qualified name of table listing all in-the-clear mySQLUser names
            of users who post to the Forum. Used to redact their names from their own posts.
        :type allUsersTableName: String
        :param anonymize: If true, Forum post entries in the MySQL table will be anonymized
        :type anonymize: bool
        :param allowAnonScreenName: if True, then occurrences of poster's name in
            post bodies are replaced by <redacName_<anon_screen_name>>, where anon_screen_name
            is the hash used in other tables of the OpenEdX data.
        :type allowAnonScreenName: Bool
        '''
        self.bsonFileName = bsonFileName
        self.forumTableName = forumTableName
        self.forumDbName = 'EdxForum'
        self.allUsersTableName = allUsersTableName
        self.anonymize = anonymize
        self.allowAnonScreenName = allowAnonScreenName

        # If not unittest, but regular run, then mysqlDbObj is None
        if mysqlDbObj is None:
            self.mysql_passwd = self.getMySQLPasswd()
            self.mysql_dbhost = 'localhost'
            self.mysql_user = getpass.getuser()  # mySQLUser that started this process
            self.mydb = MySQLDB(user=self.mysql_user, passwd=self.mysql_passwd, db=self.forumDbName)
        else:
            self.mydb = mysqlDbObj

        self.counter = 0

        self.userCache = {}
        self.userSet = set()

        warnings.filterwarnings('ignore', category=MySQLdb.Warning)
        self.setupLogging()
        self.prepDatabase()

    def runConversion(self):
        '''
        Do the actual work. We don't call this method from __init__()
        so that unittests can create an EdxForumScrubber instance without
        doing the actual work. Instead, unittests call individual methods.

        Steps: populate the user cache, restore the .bson file into
        MongoDB db 'TmpForum', collection 'contents', transfer every
        record to MySQL, then close both database connections.
        '''
        self.populateUserCache()

        self.mongo_database_name = 'TmpForum'
        self.collection_name = 'contents'

        # Load bson file into Mongodb:
        self.loadForumIntoMongoDb(self.bsonFileName)
        self.mongodb = MongoDB(dbName=self.mongo_database_name, collection=self.collection_name)

        # Anonymize each forum record, and transfer to MySQL db.
        # The destination is the table name given to __init__()
        # (default 'contents'); that is the table prepDatabase()
        # drops, and the one named in the log message below:
        self.forumMongoToRelational(self.mongodb, self.mydb, self.forumTableName)

        self.mydb.close()
        self.mongodb.close()
        self.logInfo('Entered %d records into %s' % (self.counter, self.forumDbName + '.' + self.forumTableName))

    def loadForumIntoMongoDb(self, bsonFilename):
        '''
        Empty the MongoDB collection self.collection_name in database
        self.mongo_database_name, then fill it from the given .bson
        file by running the external 'mongorestore' program. Afterwards
        ask the 'mongo' shell for the number of loaded items, and
        store its output in self.numMongoItems.

        Output of the subprocesses is written to self.logFilePath,
        which is opened for writing (i.e. truncated).

        :param bsonFilename: path to the .bson file to restore
        :type bsonFilename: String
        :raise subprocess.CalledProcessError: if the 'mongo' count command
            exits with non-zero status. The mongorestore exit status is
            only logged, not checked.
        '''
        mongoclient = MongoClient()
        db = mongoclient[self.mongo_database_name]

        # Get collection object:
        collection = db[self.collection_name]

        # Clear out any old forum entries:
        self.logInfo('Preparing to delete the collection ')
        collection.remove()
        self.logInfo('Deleting mongo collection completed. Will now attempt a mongo restore')

        self.logInfo('Spawning subprocess to execute mongo restore')
        with open(self.logFilePath, 'w') as outfile:
            ret = subprocess.call(['mongorestore',
                                   '--drop',
                                   '--db', self.mongo_database_name,
                                   '--collection', self.collection_name,
                                   bsonFilename],
                                  stdout=outfile,
                                  stderr=outfile)

            self.logDebug('Return value from mongorestore is %s' % (ret))

            objCount = subprocess.check_output(['mongo',
                                                '--quiet',
                                                '--eval',
                                                'printjson(db.contents.count())',
                                                self.mongo_database_name,
                                                ],
                                               stderr=outfile)
            self.numMongoItems = objCount

            self.logInfo('Available Forum posts %s' % objCount)

    def forumMongoToRelational(self, mongodb, mysqlDbObj, mysqlTable):
        '''
        Given a MongoDB wrapper through which Forum posts can be
        queried, and a MySQL db object and table name, pass each
        mongo record on to insert_content_record() for insertion
        into the MySQL table.

        :param mongodb: object whose query({}) method yields all Forum records
        :type mongodb: MongoDB
        :param mysqlDbObj: wrapper to MySQL db. See pymysql_utils.py
        :type mysqlDbObj: MYSQLDB
        :param mysqlTable: name of table where posts are to be deposited.
            Example: 'contents'.
        :type mysqlTable: String
        '''
        self.logInfo('Will start inserting from mongo collection to MySQL')

        for mongoForumRec in mongodb.query({}):
            mongoRecordObj = MongoRecord(mongoForumRec)

            try:
                # Check whether 'up' can be converted to a list
                list(mongoRecordObj['up'])
            except Exception as e:
                self.logInfo("Error in conversion of 'up' field to a list (setting cell to -1):" + repr(e))
                mongoRecordObj['up'] = '-1'

            # Make sure the MongoDB object has all fields that will
            # be needed for the forum schema:
            self.ensureSchemaAdherence(mongoRecordObj)

            self.insert_content_record(mysqlDbObj, mysqlTable, mongoRecordObj)

    def prepDatabase(self):
        '''
        Prepare the MySQL connection and destination table:
        set the connection character sets to utf8, drop any
        existing forum table, and create a fresh one via
        createForumTable().

        A ValueError during drop/create is only logged. A
        MySQLdb.Error anywhere in this method is logged, and
        terminates the process with exit status 1.
        '''
        try:
            self.logDebug("Setting and assigning char set for mysqld. will truncate old values")
            self.mydb.execute('SET NAMES utf8;')
            self.mydb.execute('SET CHARACTER SET utf8;')
            self.mydb.execute('SET character_set_connection=utf8;')

            # Compose fully qualified table name from the db name to
            # which self.mydb is connected, and the forum table name
            # that was established in __init__():
            fullTblName = self.mydb.dbName() + '.' + self.forumTableName
            # Clear old forum data out of the table:
            try:
                self.mydb.dropTable(fullTblName)
                # Create MySQL table for the posts. Depending on
                # self.anonymize, createForumTable() keeps either the
                # 'anon_screen_name' or the 'screen_name' column of
                # forumSchema (see comment at forumSchema):
                self.createForumTable(self.anonymize)
                self.logDebug("setting and assigning char set complete. Truncation succeeded")
            except ValueError as e:
                self.logDebug("Failed either to set character codes, or to create forum table %s: %s" % (fullTblName, repr(e)))

        except MySQLdb.Error as e:
            self.logInfo("MySql Error exiting %d: %s" % (e.args[0], e.args[1]))
            sys.exit(1)
class TestPymysqlUtils(unittest.TestCase):
    '''
    Tests pymysql_utils.

    Requires a MySQL server on localhost with a password-less
    user 'unittest' and a database 'unittest'; setUpClass()
    verifies these prerequisites, and records the problem in
    TestPymysqlUtils.err_msg if they are not met.
    '''

    @classmethod
    def setUpClass(cls):
        '''
        Check that user 'unittest' exists with the needed grants,
        and determine the MySQL version. Results are stored in the
        class variables env_ok, err_msg, major, and minor.

        :raise OSError: if the MySQL version cannot be determined.
        '''
        # Ensure that a user unittest with the proper
        # permissions exists in the db:
        TestPymysqlUtils.env_ok = True
        TestPymysqlUtils.err_msg = ''
        needed_grants = ['SELECT', 'INSERT', 'UPDATE',
                         'DELETE', 'CREATE', 'CREATE TEMPORARY TABLES',
                         'DROP', 'ALTER']
        grant_stmt = 'GRANT %s ON unittest.* TO unittest@localhost;' % ','.join(needed_grants)
        mysqldb = None
        try:
            mysqldb = MySQLDB(host='localhost', port=3306, user='unittest', db='unittest')
            grant_query = 'SHOW GRANTS FOR unittest@localhost'
            query_it = mysqldb.query(grant_query)
            # First row of the SHOW GRANTS response should be
            # one of:
            first_grants = ["GRANT USAGE ON *.* TO 'unittest'@'localhost'",
                            "GRANT USAGE ON *.* TO `unittest`@`localhost`"
                            ]
            # Second row depends on the order in which the
            # grants were provided. The row will look something
            # like:
            #   GRANT SELECT, INSERT, UPDATE, DELETE, ..., CREATE, DROP, ALTER ON `unittest`.* TO 'unittest'@'localhost'
            # Verify:
            usage_grant = query_it.next()
            if usage_grant not in first_grants:
                TestPymysqlUtils.err_msg = '''
                User 'unittest' is missing USAGE grant needed to run the tests.
                Also need this in your MySQL:

                      %s
                ''' % grant_stmt
                TestPymysqlUtils.env_ok = False
                return
            grants_str = query_it.next()
            for needed_grant in needed_grants:
                if grants_str.find(needed_grant) == -1:
                    TestPymysqlUtils.err_msg = '''
                    User 'unittest' does not have the '%s' permission needed to run the tests.
                    Need this in your MySQL:

                        %s
                    ''' % (needed_grant, grant_stmt)
                    TestPymysqlUtils.env_ok = False
                    return
        except (ValueError, RuntimeError):
            TestPymysqlUtils.err_msg = '''
               For unit testing, localhost MySQL server must have
               user 'unittest' without password, and a database
               called 'unittest'. To create these prerequisites
               in MySQL:

                    CREATE USER unittest@localhost;
                    CREATE DATABASE unittest;
               This user needs permissions:
                    %s
               ''' % grant_stmt
            TestPymysqlUtils.env_ok = False
        finally:
            # The connection is only needed for the grants check;
            # each test opens its own connection in setUp():
            if mysqldb is not None:
                mysqldb.close()

        # Check MySQL version:
        try:
            (major, minor) = TestPymysqlUtils.get_mysql_version()
        except Exception as e:
            raise OSError('Could not get mysql version number: %s' % str(e))

        known_versions = [(5, 6), (5, 7), (8, 0)]
        if major is None:
            print('Warning: MySQL version number not found; testing as if V5.7')
            (major, minor) = (5, 7)
        elif (major, minor) not in known_versions:
            print('Warning: MySQL version is %s.%s; but testing as if V5.7' % (major, minor))
            (major, minor) = (5, 7)
        TestPymysqlUtils.major = major
        TestPymysqlUtils.minor = minor

    def setUp(self):
        '''
        Open a fresh connection for each test. Raises RuntimeError
        with the message prepared by setUpClass() if the test
        environment was found to be unusable.
        '''
        if not TestPymysqlUtils.env_ok:
            raise RuntimeError(TestPymysqlUtils.err_msg)
        try:
            self.mysqldb = MySQLDB(host='localhost', port=3306, user='unittest', db='unittest')
        except ValueError as e:
            self.fail(str(e) + " (For unit testing, localhost MySQL server must have user 'unittest' without password, and a database called 'unittest')")

        # Make MySQL version more convenient to check:
        if (TestPymysqlUtils.major == 5 and TestPymysqlUtils.minor >= 7) or \
            TestPymysqlUtils.major >= 8:
            self.mysql_ge_5_7 = True
        else:
            self.mysql_ge_5_7 = False

    def tearDown(self):
        '''
        Drop the test table, clear any password a test may have
        set for user unittest, and close the connection.
        '''
        if self.mysqldb.isOpen():
            self.mysqldb.dropTable('unittest')
            # Make sure the test didn't set a password
            # for user unittest in the db:
            self.mysqldb.execute("SET PASSWORD FOR unittest@localhost = '';")
            self.mysqldb.close()

    # ----------------------- Table Manipulation -------------------------

    #-------------------------
    # Creating and Dropping Tables
    #--------------

    @unittest.skipIf(not TEST_ALL, "Temporarily disabled")
    def testCreateAndDropTable(self):
        mySchema = {
            'col1' : 'INT',
            'col2' : 'varchar(255)',
            'col3' : 'FLOAT',
            'col4' : 'TEXT',
            #'col5' : 'JSON'  # Only works MySQL 5.7 and up.
        }
        self.mysqldb.createTable('myTbl', mySchema, temporary=False)
        # Get (('col4', 'text'), ('col2', 'varchar(255)'), ('col3', 'float'), ('col1', 'int(11)'))
        # in some order:
        cols = self.mysqldb.query('''SELECT COLUMN_NAME,COLUMN_TYPE
                                      FROM information_schema.columns
                                    WHERE TABLE_SCHEMA = 'unittest'
                                      AND TABLE_NAME = 'myTbl';
                                      '''
                                  )

        self.assertEqual(sorted(cols),
                         [('col1', 'int(11)'),
                          ('col2', 'varchar(255)'),
                          ('col3', 'float'),
                          ('col4', 'text')]
                         )

        # Query mysql information schema to check for table
        # present. Use raw cursor to test independently from
        # the pymysql_utils query() method:

        self.mysqldb.dropTable('myTbl')
        cursor = self.mysqldb.connection.cursor()
        tbl_exists_query = '''
                  SELECT table_name
                    FROM information_schema.tables
                   WHERE table_schema = 'unittest'
                     AND table_name = 'myTbl';
                     '''
        cursor.execute(tbl_exists_query)
        self.assertEqual(cursor.rowcount, 0)
        cursor.close()

    #-------------------------
    # Creating Temporary Tables
    #--------------

    @unittest.skipIf(not TEST_ALL, "Temporarily disabled")
    def testCreateTempTable(self):
        mySchema = {
            'col1' : 'INT',
            'col2' : 'varchar(255)',
            'col3' : 'FLOAT',
            'col4' : 'TEXT',
            #'col5' : 'JSON'  # Only works MySQL 5.7 and up.
        }
        self.mysqldb.createTable('myTbl', mySchema, temporary=True)

        # Check that tbl exists.
        # NOTE: can't use query to mysql.informationschema,
        # b/c temp tables aren't listed there.

        try:
            # Will return some tuple; we don't
            # care what exactly, as long as the
            # cmd doesn't fail:
            self.mysqldb.query('DESC myTbl').next()
        except Exception:
            self.fail('Temporary table not found after creation.')

        # Start new session, which should remove the table:
        self.mysqldb.close()

        try:
            self.mysqldb = MySQLDB(host='localhost', port=3306, user='unittest', db='unittest')
        except ValueError as e:
            self.fail(str(e) + "Could not re-establish MySQL connection.")

        # NOTE: can't use query to mysql.informationschema,
        # b/c temp tables aren't listed there.

        try:
            self.mysqldb.query('DESC myTbl').next()
            self.fail("Temporary table did not disappear with session exit.")
        except ValueError:
            pass

    #-------------------------
    # Table Truncation
    #--------------

    @unittest.skipIf(not TEST_ALL, "Temporarily disabled")
    def testTruncate(self):

        # Initial test db with known num of rows:
        rows_in_test_db = self.buildSmallDb()
        cursor = self.mysqldb.connection.cursor()
        cursor.execute('SELECT * FROM unittest;')
        self.assertEqual(cursor.rowcount, rows_in_test_db)

        self.mysqldb.truncateTable('unittest')

        cursor.execute('SELECT * FROM unittest;')
        self.assertEqual(cursor.rowcount, 0)
        cursor.close()

    # ----------------------- Insertion and Update -------------------------

    #-------------------------
    # Insert One Row
    #--------------

    @unittest.skipIf(not TEST_ALL, "Temporarily disabled")
    def testInsert(self):
        schema = OrderedDict([('col1', 'INT'), ('col2', 'TEXT')])
        self.mysqldb.createTable('unittest', schema)
        colnameValueDict = OrderedDict([('col1', 10)])
        self.mysqldb.insert('unittest', colnameValueDict)
        self.assertEqual((10, None), self.mysqldb.query("SELECT * FROM unittest").next())

        # Insert row with an explicit None:
        colnameValueDict = OrderedDict([('col1', None)])
        self.mysqldb.insert('unittest', colnameValueDict)

        cursor = self.mysqldb.connection.cursor()
        cursor.execute('SELECT col1 FROM unittest')
        # Swallow the first row: 10, Null:
        cursor.fetchone()
        # Get col1 of the row we added (the 2nd row):
        val = cursor.fetchone()
        self.assertEqual(val, (None,))
        cursor.close()

    #-------------------------
    # Insert One Row With Error
    #--------------

    @unittest.skipIf(not TEST_ALL, "Temporarily disabled")
    def testInsertWithError(self):
        schema = OrderedDict([('col1', 'INT'), ('col2', 'TEXT')])
        self.mysqldb.createTable('unittest', schema)
        colnameValueDict = OrderedDict([('col1', 10)])
        (errors, warnings) = self.mysqldb.insert('unittest', colnameValueDict)
        self.assertIsNone(errors)
        self.assertIsNone(warnings)
        self.assertEqual((10, None), self.mysqldb.query("SELECT * FROM unittest").next())

    #-------------------------
    # Insert Several Columns
    #--------------

    @unittest.skipIf(not TEST_ALL, "Temporarily disabled")
    def testInsertSeveralColumns(self):
        schema = OrderedDict([('col1', 'INT'), ('col2', 'TEXT')])
        self.mysqldb.createTable('unittest', schema)
        colnameValueDict = OrderedDict([('col1', 10), ('col2', 'My Poem')])
        self.mysqldb.insert('unittest', colnameValueDict)
        res = self.mysqldb.query("SELECT * FROM unittest").next()
        self.assertEqual((10, 'My Poem'), res)

    #-------------------------
    # Bulk Insertion
    #--------------

    @unittest.skipIf(not TEST_ALL, "Temporarily disabled")
    def testBulkInsert(self):
        # Called twice: once by the unittest engine,
        # and again by testWithMySQLPassword() to
        # exercise the pwd-bound branch in bulkInsert().

        # Build test db (this already tests basic bulkinsert):
        #                  col1   col2
        #                   10,  'col1'
        #                   20,  'col2'
        #                   30,  'col3'
        self.buildSmallDb()
        self.mysqldb.execute('ALTER TABLE unittest ADD PRIMARY KEY(col1)')

        # Provoke a MySQL error: duplicate primary key (i.e. 10):
        # Add another row:  10,  'newCol1':
        colNames = ['col1', 'col2']
        colValues = [(10, 'newCol1')]

        (errors, warnings) = self.mysqldb.bulkInsert('unittest', colNames, colValues)  #@UnusedVariable

        # For MySQL 5.7, expect something like:
        #    ((u'Warning', 1062L, u"Duplicate entry '10' for key 'PRIMARY'"),)
        # MySQL 5.6 just skips:
        if self.mysql_ge_5_7:
            self.assertEqual(len(warnings), 1)
        else:
            self.assertIsNone(warnings)

        # First tuple should still be (10, 'col1'):
        self.assertEqual('col1', self.mysqldb.query('SELECT col2 FROM unittest WHERE col1 = 10').next())

        # Try update again, but with replacement:
        (errors, warnings) = self.mysqldb.bulkInsert('unittest', colNames, colValues, onDupKey=DupKeyAction.REPLACE)  #@UnusedVariable
        self.assertIsNone(warnings)
        # Now row should have changed:
        self.assertEqual('newCol1', self.mysqldb.query('SELECT col2 FROM unittest WHERE col1 = 10').next())

        # Insert a row with duplicate key, specifying IGNORE:
        colNames = ['col1', 'col2']
        colValues = [(10, 'newCol2')]
        (errors, warnings) = self.mysqldb.bulkInsert('unittest', colNames, colValues, onDupKey=DupKeyAction.IGNORE)  #@UnusedVariable
        # Even when ignoring dup keys, MySQL 5.7/8.x issue a warning
        # for each dup key:

        if self.mysql_ge_5_7:
            self.assertEqual(len(warnings), 1)
        else:
            self.assertIsNone(warnings)

        self.assertEqual('newCol1', self.mysqldb.query('SELECT col2 FROM unittest WHERE col1 = 10').next())

        # Insertions that include NULL values:
        colValues = [(40, None), (50, None)]
        (errors, warnings) = self.mysqldb.bulkInsert('unittest', colNames, colValues)  #@UnusedVariable
        self.assertEqual(None, self.mysqldb.query('SELECT col2 FROM unittest WHERE col1 = 40').next())
        self.assertEqual(None, self.mysqldb.query('SELECT col2 FROM unittest WHERE col1 = 50').next())

        # Provoke an error:
        colNames = ['col1', 'col2', 'col3']
        colValues = [(10, 'newCol2')]
        (errors, warnings) = self.mysqldb.bulkInsert('unittest', colNames, colValues, onDupKey=DupKeyAction.IGNORE)  #@UnusedVariable
        self.assertEqual(len(errors), 1)

    #-------------------------
    # Updates
    #--------------

    @unittest.skipIf(not TEST_ALL, "Temporarily disabled")
    def testUpdate(self):

        num_rows = self.buildSmallDb()
        cursor = self.mysqldb.connection.cursor()

        # Initially, col2 of row0 must be 'col1':
        cursor.execute('SELECT col2 FROM unittest WHERE col1 = 10')
        col2_row_zero = cursor.fetchone()
        self.assertTupleEqual(col2_row_zero, ('col1',))

        self.mysqldb.update('unittest', 'col1', 40, fromCondition='col1 = 10')

        # Now no col1 with value 10 should exist:
        cursor.execute('SELECT col2 FROM unittest WHERE col1 = 10')
        self.assertEqual(cursor.rowcount, 0)
        # But a row with col1 == 40 should have col2 == 'col1':
        cursor.execute('SELECT col2 FROM unittest WHERE col1 = 40')
        col2_res = cursor.fetchone()
        self.assertTupleEqual(col2_res, ('col1',))

        # Update *all* rows in one column:
        self.mysqldb.update('unittest', 'col1', 0)
        cursor.execute('SELECT count(*) FROM unittest WHERE col1 = 0')
        res_count = cursor.fetchone()
        self.assertTupleEqual(res_count, (num_rows,))

        # Update with a MySQL NULL value by using Python None
        # for input and output:
        self.mysqldb.update('unittest', 'col1', None)
        cursor.execute('SELECT count(*) FROM unittest WHERE col1 is %s', (None,))
        res_count = cursor.fetchone()
        self.assertTupleEqual(res_count, (num_rows,))

        # Update with a MySQL NULL value by using Python None
        # with WHERE clause: only set col1 to NULL where col2 = 'col2',
        # i.e. in the 2nd row:

        num_rows = self.buildSmallDb()
        self.mysqldb.update('unittest', 'col1', None, "col2 = 'col2'")
        cursor.execute('SELECT count(*) FROM unittest WHERE col1 is %s', (None,))
        res_count = cursor.fetchone()
        self.assertTupleEqual(res_count, (1,))

        # Provoke an error:
        (errors, warnings) = self.mysqldb.update('unittest', 'col6', 40, fromCondition='col1 = 10')  #@UnusedVariable
        self.assertEqual(len(errors), 1)

        cursor.close()

    # ----------------------- Queries -------------------------

    #-------------------------
    # Query With Result Iteration
    #--------------

    @unittest.skipIf(not TEST_ALL, "Temporarily disabled")
    def testQueryIterator(self):
        self.buildSmallDb()

        for rowNum, result in enumerate(self.mysqldb.query('SELECT col1,col2 FROM unittest')):
            if rowNum == 0:
                self.assertEqual((10, 'col1'), result)
            elif rowNum == 1:
                self.assertEqual((20, 'col2'), result)
            elif rowNum == 2:
                self.assertEqual((30, 'col3'), result)

        # Test the dict cursor
        self.mysqldb.close()
        self.mysqldb = MySQLDB(host='localhost',
                               user='unittest',
                               db='unittest',
                               cursor_class=Cursors.DICT)

        for result in self.mysqldb.query('SELECT col1,col2 FROM unittest'):
            self.assertIsInstance(result, dict)
            if result['col1'] == 10:
                self.assertEqual(result['col2'], 'col1')
            elif result['col1'] == 20:
                self.assertEqual(result['col2'], 'col2')
            elif result['col1'] == 30:
                self.assertEqual(result['col2'], 'col3')

    #-------------------------
    # Query Unparameterized
    #--------------

    @unittest.skipIf(not TEST_ALL, "Temporarily disabled")
    def testExecuteArbitraryQuery(self):
        self.buildSmallDb()
        self.mysqldb.execute("UPDATE unittest SET col1=120")
        for result in self.mysqldb.query('SELECT col1 FROM unittest'):
            self.assertEqual(120, result)

    #-------------------------
    # Query Parameterized
    #--------------

    @unittest.skipIf(not TEST_ALL, "Temporarily disabled")
    def testExecuteArbitraryQueryParameterized(self):
        self.buildSmallDb()
        myVal = 130
        self.mysqldb.executeParameterized("UPDATE unittest SET col1=%s", (myVal,))
        for result in self.mysqldb.query('SELECT col1 FROM unittest'):
            self.assertEqual(130, result)

        # Provoke an error:
        (errors, warnings) = self.mysqldb.executeParameterized("UPDATE unittest SET col10=%s", (myVal,))  #@UnusedVariable
        self.assertEqual(len(errors), 1)

    #-------------------------
    # Reading System Variables
    #--------------

    @unittest.skipIf(not TEST_ALL, "Temporarily disabled")
    def testReadSysVariable(self):
        this_host = socket.gethostname()
        mysql_hostname = self.mysqldb.query('SELECT @@hostname').next()
        self.assertIn(mysql_hostname, [this_host, 'localhost'])

    #-------------------------
    # User-Level Variables
    #--------------

    @unittest.skipIf(not TEST_ALL, "Temporarily disabled")
    def testUserVariables(self):
        pre_foo = self.mysqldb.query("SELECT @foo").next()
        self.assertEqual(pre_foo, None)

        self.mysqldb.execute("SET @foo = 'new value';")

        post_foo = self.mysqldb.query("SELECT @foo").next()
        self.assertEqual(post_foo, 'new value')

        self.mysqldb.execute("SET @foo = 'NULL';")

    #-------------------------
    # testDbName
    #--------------

    @unittest.skipIf(not TEST_ALL, "Temporarily disabled")
    def testDbName(self):
        self.assertEqual(self.mysqldb.dbName(), 'unittest')

    @unittest.skipIf(not TEST_ALL, "Temporarily disabled")
    def testWithMySQLPassword(self):

        try:
            # Set a password for the unittest user:
            if self.mysql_ge_5_7:
                self.mysqldb.execute("SET PASSWORD FOR unittest@localhost = 'foobar'")
            else:
                self.mysqldb.execute("SET PASSWORD FOR unittest@localhost = PASSWORD('foobar')")

            self.mysqldb.close()

            # We should be unable to log in without a pwd:
            with self.assertRaises(ValueError):
                self.mysqldb = MySQLDB(host='localhost', user='unittest', db='unittest')

            # Open new pymysql_db.MySQLDb instance, supplying pwd:
            self.mysqldb = MySQLDB(host='localhost', user='unittest', passwd='foobar', db='unittest')
            # Do a test query:
            self.buildSmallDb()
            res = self.mysqldb.query("SELECT col2 FROM unittest WHERE col1 = 10;").next()
            self.assertEqual(res, 'col1')

            # Bulk insert is also different for pwd vs. none:
            self.testBulkInsert()
        finally:
            # Make sure to remove the pwd from user unittest,
            # so that other tests will run successfully:
            if self.mysql_ge_5_7:
                self.mysqldb.execute("SET PASSWORD FOR unittest@localhost = ''")
            else:
                self.mysqldb.execute("SET PASSWORD FOR unittest@localhost = PASSWORD('')")

    #-------------------------
    # testResultCount
    #--------------

    @unittest.skipIf(not TEST_ALL, "Temporarily disabled")
    def testResultCount(self):
        self.buildSmallDb()
        query_str = 'SELECT * FROM unittest'
        self.mysqldb.query(query_str)
        self.assertEqual(self.mysqldb.result_count(query_str), 3)

    #-------------------------
    # testInterleavedQueries
    #--------------

    @unittest.skipIf(not TEST_ALL, "Temporarily disabled")
    def testInterleavedQueries(self):

        self.buildSmallDb()
        query_str1 = 'SELECT col2 FROM unittest ORDER BY col1'
        query_str2 = 'SELECT col2 FROM unittest WHERE col1 = 20 or col1 = 30 ORDER BY col1'
        res_it1 = self.mysqldb.query(query_str1)
        res_it2 = self.mysqldb.query(query_str2)

        self.assertEqual(res_it1.result_count(), 3)
        self.assertEqual(res_it2.result_count(), 2)
        self.assertEqual(self.mysqldb.result_count(query_str1), 3)
        self.assertEqual(self.mysqldb.result_count(query_str2), 2)

        self.assertEqual(res_it1.next(), 'col1')
        self.assertEqual(res_it2.next(), 'col2')

        self.assertEqual(res_it1.result_count(), 3)
        self.assertEqual(res_it2.result_count(), 2)
        self.assertEqual(self.mysqldb.result_count(query_str1), 3)
        self.assertEqual(self.mysqldb.result_count(query_str2), 2)

        self.assertEqual(res_it1.next(), 'col2')
        self.assertEqual(res_it2.next(), 'col3')

        self.assertEqual(res_it1.next(), 'col3')
        with self.assertRaises(StopIteration):
            res_it2.next()

        with self.assertRaises(ValueError):
            res_it2.result_count()

        with self.assertRaises(ValueError):
            self.mysqldb.result_count(query_str2)

    #-------------------------
    # testBadParameters
    #--------------

    def _assertConnectFails(self, expected_msg, **conn_args):
        '''
        Assert that MySQLDB(**conn_args) raises an exception whose
        string form contains expected_msg. Failure messages show
        both the expected and the actual text.
        '''
        with self.assertRaises(Exception) as context:
            MySQLDB(**conn_args)
        self.assertIn(expected_msg, str(context.exception))

    @unittest.skipIf(not TEST_ALL, "Temporarily disabled")
    def testBadParameters(self):
        self.mysqldb.close()

        # Test setting parameters illegally to None:
        self._assertConnectFails("None value(s) for ['host']; none of host,port,user,passwd or db must be None",
                                 host=None, port=3306, user='unittest', db='unittest')
        self._assertConnectFails("None value(s) for ['port']; none of host,port,user,passwd or db must be None",
                                 host='localhost', port=None, user='unittest', db='unittest')
        self._assertConnectFails("None value(s) for ['user']; none of host,port,user,passwd or db must be None",
                                 host='localhost', port=3306, user=None, db='unittest')
        self._assertConnectFails("None value(s) for ['db']; none of host,port,user,passwd or db must be None",
                                 host='localhost', port=3306, user='unittest', db=None)
        self._assertConnectFails("None value(s) for ['passwd']; none of host,port,user,passwd or db must be None",
                                 host='localhost', port=3306, user='unittest', passwd=None, db='unittest')
        self._assertConnectFails("None value(s) for ['host', 'db', 'user']; none of host,port,user,passwd or db must be None",
                                 host=None, port=3306, user=None, db=None)

        # Check data types of parameters:

        # One illegal type: integer instead of string for host:
        self._assertConnectFails("Value(s) ['host'] have bad type;host,user,passwd, and db must be strings; port must be int.",
                                 host=10, port=3306, user='unittest', db='myDb')
        # Two illegal types: host and user:
        self._assertConnectFails("Value(s) ['host', 'user'] have bad type;host,user,passwd, and db must be strings; port must be int.",
                                 host=10, port=3306, user=30, db='myDb')
        # Port being string instead of required int:
        self._assertConnectFails("Port must be an integer; was",
                                 host='myHost', port='3306', user='unittest', db='myDb')

    #-------------------------
    # testIsOpen
    #--------------

    @unittest.skipIf(not TEST_ALL, "Temporarily disabled")
    def testIsOpen(self):

        self.assertTrue(self.mysqldb.isOpen())
        self.mysqldb.close()
        self.assertFalse(self.mysqldb.isOpen())

    # ----------------------- UTILITIES -------------------------

    #-------------------------
    # buildSmallDb
    #--------------

    def buildSmallDb(self):
        '''
        Creates a two-col, three-row table in database
        unittest. The table is called 'unittest'.
        Returns number of rows created.

        ====      ======
        col1       col2
        ====      ======
         10       'col1'
         20       'col2'
         30       'col3'
        ====      ======

        '''
        cur = self.mysqldb.connection.cursor()
        with no_warn_no_table():
            cur.execute('DROP TABLE IF EXISTS unittest')
        cur.execute('CREATE TABLE unittest (col1 INT, col2 TEXT)')
        cur.execute("INSERT INTO unittest VALUES (10, 'col1')")
        cur.execute("INSERT INTO unittest VALUES (20, 'col2')")
        cur.execute("INSERT INTO unittest VALUES (30, 'col3')")
        self.mysqldb.connection.commit()
        cur.close()
        return 3

    #-------------------------
    # get_mysql_version
    #--------------

    @classmethod
    def get_mysql_version(cls):
        '''
        Return a tuple: (major, minor).
        Example, for MySQL 5.7.15, return (5,7).
        Return (None,None) if version number not found.
        '''

        # Where is mysql client program?
        mysql_path = MySQLDB.find_mysql_path()

        # Get version string, which looks like this:
        #   'Distrib 5.7.15, for osx10.11 (x86_64) using  EditLine wrapper\n'
        version_str = subprocess.check_output([mysql_path, '--version']).decode('utf-8')

        # Isolate the major and minor version numbers (e.g. '5', and '7').
        # Require at least one digit in each group, so that int()
        # below is never handed an empty string:
        pat = re.compile(r'([0-9]+)[.]([0-9]+)[.]')
        match_obj = pat.search(version_str)
        if match_obj is None:
            return (None, None)
        (major, minor) = match_obj.groups()
        return (int(major), int(minor))

    #-------------------------
    # convert_to_string
    #--------------

    def convert_to_string(self, strLike):
        '''
        The str/byte/unicode type mess between
        Python 2.7 and 3.x. We want as 'normal'
        a string as possible. Surely there is a
        more elegant way.

        @param strLike: a Python 3 str (i.e. unicode string), a Python 3 binary str.
            a Python 2.7 unicode string, or a Python 2.7 str.
        @type strLike: {str|unicode|byte}
        @return: the input; UTF-8 encoded if it was of type 'unicode'
            (which only exists in Python 2), then UTF-8 decoded if
            it is of type 'bytes'.
        '''

        # The type names are looked up via eval() of constant
        # strings, so that a name that does not exist in the running
        # Python version raises NameError at runtime:
        try:
            if type(strLike) == eval('unicode'):
                # Python 2.7 unicode --> str:
                strLike = strLike.encode('UTF-8')
        except NameError:
            pass

        try:
            if type(strLike) == eval('bytes'):
                # Python 3 byte string:
                strLike = strLike.decode('UTF-8')
        except NameError:
            pass

        return strLike

    #-------------------------
    # read_config_file_content
    #--------------

    @classmethod
    def read_config_file_content(cls):
        '''
        Read and return content of pymysql_utils.cnf.py,
        which is expected in the directory of this file.
        '''
        curr_dir = os.path.dirname(__file__)
        config_file_name = os.path.join(curr_dir, 'pymysql_utils.cnf.py')
        with open(config_file_name, 'r') as fd:
            return fd.read()

    #-------------------------
    # write_config_file_content
    #--------------

    @classmethod
    def write_config_file_content(cls, content):
        '''
        Overwrite pymysql_utils.cnf.py in the directory of this
        file with the given content. Returns the result of the
        file object's write() call.
        '''
        curr_dir = os.path.dirname(__file__)
        config_file_name = os.path.join(curr_dir, 'pymysql_utils.cnf.py')
        with open(config_file_name, 'w') as fd:
            return fd.write(content)
class EdxForumScrubber(object):
    '''
    Given a .bson file of OpenEdX Forum posts, load the file
    into a MongoDB. Then pull a post at a time, anonymize, and
    insert a selection of fields into a MySQL db. The MongoDb
    entries look like this::

        {
            "_id" : ObjectId("51b75a48f359c40a00000028"),
            "_type" : "Comment",
            "abuse_flaggers" : [ ],
            "anonymous" : false,
            "anonymous_to_peers" : false,
            "at_position_list" : [ ],
            "author_id" : "26344",
            "author_username" : "Minelly48",
            "body" : "I am Gwen.I am a nursing professor who took statistics many years ago and want to refresh my knowledge.",
            "comment_thread_id" : ObjectId("51b754e5f359c40a0000001d"),
            "course_id" : "Medicine/HRP258/Statistics_in_Medicine",
            "created_at" : ISODate("2013-06-11T17:11:36.831Z"),
            "endorsed" : false,
            "historical_abuse_flaggers" : [ ],
            "parent_ids" : [ ],
            "updated_at" : ISODate("2013-06-11T17:11:36.831Z"),
            "visible" : true,
            "votes" : {
                "count" : 2,
                "down" : [ ],
                "down_count" : 0,
                "point" : 2,
                "up" : [
                    "40325",
                    "20323"
                ],
                "up_count" : 2
            },
            "sk" : "51b75a48f359c40a00000028"
        }

    Depending on parameter allowAnonScreenName in the __init__() method,
    forum entries in the relational database will be associated with the
    same hash that is used to anonymize other parts of the OpenEdX data.
    '''

    LOG_DIR = '/home/dataman/Data/EdX/NonTransformLogs'

    # Pattern for email id - strings of alphabets/numbers/dots/hyphens followed
    # by an @ or at followed by combinations of dot/. followed by the edu/com
    # also, allow for spaces
    # (Raw string: same pattern text as before, without relying on
    # Python passing unknown backslash escapes through unchanged.)
    emailPattern = r'(.*)\s+([a-zA-Z0-9\(\.\-]+)[@]([a-zA-Z0-9\.]+)(.)(edu|com)\s*(.*)'
    #emailPattern='(.*)\\s+([a-zA-Z0-9\\.]+)\\s*(\\(f.*b.*)?(@)\\s*([a-zA-Z0-9\\.\\s;]+)\\s*(\\.)\\s*(edu|com)\\s+(.*)'
    compiledEmailPattern = re.compile(emailPattern)

    # Pattern for replacing embedded double quotes in post bodies.
    # The active pattern matches a double quote together with any
    # run of backslashes directly in front of it; because zero
    # backslashes are allowed, a double quote at the very start
    # of a string is matched as well. An earlier variant is kept
    # for reference:
    #doublQuoteReplPattern = re.compile(r'[^\\]{0,1}"')
    doublQuoteReplPattern = re.compile(r'[\\]{0,}"')

    def __init__(self,
                 bsonFileName,
                 mysqlDbObj=None,
                 forumTableName='contents',
                 allUsersTableName='EdxPrivate.UserGrade',
                 anonymize=True,
                 allowAnonScreenName=False):
        '''
        Given a .bson file containing OpenEdX Forum entries, anonymize the entries (if desired),
        and place them into a MySQL table.

        :param bsonFileName: full path the .bson table. Set to None if instantiating
            for unit testing.
        :type bsonFileName: String
        :param mysqlDbObj: a pymysql_utils.MySQLDB object where anonymized entries are
            to be placed. If None, a new such object is created into MySQL db 'EdxForum'
        :type mysqlDbObj: MySQLDB
        :param forumTableName: name of table into which anonymized Forum entries are to be placed
        :type forumTableName: String
        :param allUsersTableName: fully qualified name of table listing all in-the-clear
            user names of users who post to the Forum. Used to redact their names from their
            own posts.
        :type allUsersTableName: String
        :param anonymize: If true, Forum post entries in the MySQL table will be anonymized
        :type anonymize: bool
        :param allowAnonScreenName: if True, then occurrences of poster's name in
            post bodies are replaced by <redacName_<anon_screen_name>>, where anon_screen_name
            is the hash used in other tables of the OpenEdX data.
        :type allowAnonScreenName: Bool
        '''

        self.bsonFileName = bsonFileName
        self.forumTableName = forumTableName
        self.forumDbName = 'EdxForum'
        self.allUsersTableName = allUsersTableName
        self.anonymize = anonymize
        self.allowAnonScreenName = allowAnonScreenName

        # If not unittest, but regular run, then mysqlDbObj is None
        if mysqlDbObj is None:
            self.mysql_passwd = self.getMySQLPasswd()
            self.mysql_dbhost = 'localhost'
            self.mysql_user = getpass.getuser()  # user that started this process
            # MySQLDB names its login parameter 'user':
            self.mydb = MySQLDB(user=self.mysql_user, passwd=self.mysql_passwd, db=self.forumDbName)
        else:
            self.mydb = mysqlDbObj

        self.counter = 0

        self.userCache = {}
        self.userSet = set()

        warnings.filterwarnings('ignore', category=MySQLdb.Warning)
        self.setupLogging()
        self.prepDatabase()

        #******mysqldb.commit();
        #******logging.info('commit completed!')

    def runConversion(self):
        '''
        Do the actual work. We don't call this method from __init__()
        so that unittests can create an EdxForumScrubber instance without
        doing the actual work. Instead, unittests call individual methods.

        Loads the .bson file into MongoDB database 'TmpForum', collection
        'contents', transfers every record into the MySQL forum table,
        then closes both database connections.
        '''
        self.populateUserCache()

        self.mongo_database_name = 'TmpForum'
        self.collection_name = 'contents'

        # Load bson file into Mongodb:
        self.loadForumIntoMongoDb(self.bsonFileName)
        self.mongodb = MongoDB(dbName=self.mongo_database_name, collection=self.collection_name)

        # Anonymize each forum record, and transfer to MySQL db.
        # Use the table name given to __init__(), which is also the
        # table that prepDatabase() dropped and re-created:
        self.forumMongoToRelational(self.mongodb, self.mydb, self.forumTableName)

        self.mydb.close()
        self.mongodb.close()
        self.logInfo('Entered %d records into %s' % (self.counter, self.forumDbName + '.' + self.forumTableName))

    def loadForumIntoMongoDb(self, bsonFilename):
        '''
        Empty the MongoDB collection self.collection_name in database
        self.mongo_database_name, then fill it from the given .bson
        file via the external 'mongorestore' program. Output of the
        subprocesses goes to the file at self.logFilePath. The record
        count, as printed by the 'mongo' shell, is stored in
        self.numMongoItems.

        :param bsonFilename: path to the .bson file to load
        :type bsonFilename: String
        :raise subprocess.CalledProcessError: if the 'mongo' count
            command exits with a non-zero status
        '''

        mongoclient = MongoClient()
        db = mongoclient[self.mongo_database_name]

        # Get collection object:
        collection = db[self.collection_name]

        # Clear out any old forum entries:
        self.logInfo('Preparing to delete the collection ')
        collection.remove()
        self.logInfo('Deleting mongo collection completed. Will now attempt a mongo restore')

        self.logInfo('Spawning subprocess to execute mongo restore')
        # NOTE(review): mode 'w' truncates the file at self.logFilePath;
        # confirm that discarding its earlier content is intended.
        with open(self.logFilePath, 'w') as outfile:
            ret = subprocess.call(
                ['mongorestore',
                 '--drop',
                 '--db', self.mongo_database_name,
                 '--collection', self.collection_name,
                 bsonFilename],
                stdout=outfile, stderr=outfile)

            self.logDebug('Return value from mongorestore is %s' % (ret))

            # The count is printed to stdout, which check_output() captures:
            objCount = subprocess.check_output(
                ['mongo',
                 '--quiet',
                 '--eval',
                 'printjson(db.contents.count())',
                 self.mongo_database_name,
                 ],
                stderr=outfile)
            self.numMongoItems = objCount

            self.logInfo('Available Forum posts %s' % objCount)

    def forumMongoToRelational(self, mongodb, mysqlDbObj, mysqlTable):
        '''
        Given a MongoDB wrapper object through which Forum posts
        can be queried, and a MySQL db object and table name, anonymize
        each mongo record, and insert it into the MySQL table.

        :param mongodb: object whose query({}) method delivers all
            Forum posts
        :type mongodb: MongoDB
        :param mysqlDbObj: wrapper to MySQL db. See pymysql_utils.py
        :type mysqlDbObj: MYSQLDB
        :param mysqlTable: name of table where posts are to be deposited.
            Example: 'contents'.
        :type mysqlTable: String
        '''

        #command = 'mongorestore %s -db %s -mongoForumRec %s'%(self.bson_filename,self.mongo_database_name,self.collection_name)
        #print command

        self.logInfo('Will start inserting from mongo collection to MySQL')

        for mongoForumRec in mongodb.query({}):
            mongoRecordObj = MongoRecord(mongoForumRec)

            try:
                # Check whether 'up' can be converted to a list
                list(mongoRecordObj['up'])
            except Exception as e:
                self.logInfo('Error in conversion' + repr(e))
                # Mark the unusable 'up' entry with a placeholder value:
                mongoRecordObj['up'] = '-1'

            self.insert_content_record(mysqlDbObj, mysqlTable, mongoRecordObj)

    def prepDatabase(self):
        '''
        Declare variables and execute statements preparing the database to
        configure options - e.g.: setting char set to utf, connection type to utf
        truncating the already existing table.

        Exits the process with status 1 if MySQL reports an error.
        '''
        try:
            self.logDebug("Setting and assigning char set for mysqld. will truncate old values")
            self.mydb.execute('SET NAMES utf8;')
            self.mydb.execute('SET CHARACTER SET utf8;')
            self.mydb.execute('SET character_set_connection=utf8;')

            # Compose fully qualified table name from the db name to
            # which self.mydb is connected, and the forum table name
            # that was established in __init__():
            fullTblName = self.mydb.dbName() + '.' + self.forumTableName
            # Clear old forum data out of the table:
            try:
                self.mydb.dropTable(fullTblName)
                # Create MySQL table for the posts. Whether the poster
                # name column is 'screen_name' or 'anon_screen_name'
                # is decided in createForumTable() from self.anonymize:
                self.createForumTable(self.anonymize)
                self.logDebug("setting and assigning char set complete. Truncation succeeded")
            except ValueError as e:
                self.logDebug("Failed either to set character codes, or to create forum table %s: %s" % (fullTblName, repr(e)))

        except MySQLdb.Error as e:
            self.logInfo("MySql Error exiting %d: %s" % (e.args[0], e.args[1]))
            # print e
            sys.exit(1)