Ejemplo n.º 1
0
    def __init__(self,
                 input_file,
                 ip_position,
                 output_file=None,
                 ignore_lines=0):
        '''
        Runs processing from beginning to end: reads the CSV input,
        resolves the IP column to countries, and writes the result.
        
        @param input_file: path to CSV file containing the IP address column 
        @type input_file: str
        @param ip_position: zero-origin index of column that contains the IP address.
        @type ip_position: int
        @param output_file: optional output file for the result CSV; None writes to stdout
        @type output_file: {str | None}
        @param ignore_lines: number of lines to copy to destination unprocessed. Used for headers.
        @type ignore_lines: {int}
        '''

        # Fail fast if the input file is absent or unreadable:
        if not os.access(input_file, os.R_OK):
            print("Input file %s not found or not readable." % input_file)
            sys.exit(1)

        # IP-address-to-country lookup table:
        self.country_dict = IpCountryDict()

        # Get lines to pass through, and array of row-arrays:
        (headers, rows) = self.read_csvlines(input_file, ignore_lines)
        self.write_country_state_city(headers, rows, ip_position, output_file)
Ejemplo n.º 2
0
 def __init__(self, user, pwd):
     '''
     Connect to the Edx database and (re)create the destination table.

     @param user: MySQL user name
     @type user: str
     @param pwd: MySQL password
     @type pwd: str
     '''
     self.ipCountryXlater = IpCountryDict()
     self.user = user
     self.pwd = pwd
     self.db = MySQLDB(user=self.user, passwd=self.pwd, db='Edx')
     self.db.dropTable(UserCountryTableCreator.DEST_TABLE)
     # Column specs passed as a list of pairs: building an OrderedDict
     # from a dict literal loses the intended column order on
     # interpreters without ordered dicts (Python < 3.7):
     self.db.createTable(
         UserCountryTableCreator.DEST_TABLE,
         OrderedDict([
             ('anon_screen_name', 'varchar(40) NOT NULL DEFAULT ""'),
             ('two_letter_country', 'varchar(2) NOT NULL DEFAULT ""'),
             ('three_letter_country', 'varchar(3) NOT NULL DEFAULT ""'),
             ('country', 'varchar(255) NOT NULL DEFAULT ""')
         ]))
class UserCountryTableCreator(object):
    '''
    Builds and fills the UserCountry table, mapping each
    anon_screen_name in EventXtract to two/three-letter country
    codes and the full country name.
    '''

    DEST_TABLE = 'UserCountry'

    def __init__(self, user, pwd):
        '''
        Connect to MySQL and (re)create the destination table.

        @param user: MySQL user name
        @type user: str
        @param pwd: MySQL password
        @type pwd: str
        '''
        self.ipCountryXlater = IpCountryDict()
        self.user = user
        self.pwd  = pwd
        self.db = MySQLDB(user=self.user, passwd=self.pwd, db='Edx')
        self.db.dropTable(UserCountryTableCreator.DEST_TABLE)
        # Column specs passed as a list of pairs: building an OrderedDict
        # from a dict literal loses the intended column order on
        # interpreters without ordered dicts (Python < 3.7):
        self.db.createTable(
            UserCountryTableCreator.DEST_TABLE,
            OrderedDict([
                ('anon_screen_name', 'varchar(40) NOT NULL DEFAULT ""'),
                ('two_letter_country', 'varchar(2) NOT NULL DEFAULT ""'),
                ('three_letter_country', 'varchar(3) NOT NULL DEFAULT ""'),
                ('country', 'varchar(255) NOT NULL DEFAULT ""')
            ]))

    def fillTable(self):
        '''
        Translate each distinct (anon_screen_name, ip_country) pair from
        EventXtract to country codes/names, then bulk-insert the results.
        Pairs whose country cannot be resolved are reported on stderr
        and skipped.
        '''
        values = []
        for (user, ip3LetterCountry) in self.db.query("SELECT DISTINCT anon_screen_name, ip_country FROM EventXtract"):
            try:
                (twoLetterCode, threeLetterCode, country) = self.ipCountryXlater.getBy3LetterCode(ip3LetterCountry)
            except (ValueError, TypeError, KeyError) as e:
                # repr(e) replaces the Python-2-only backtick syntax:
                sys.stderr.write("Could not look up one country from (%s/%s): %s\n" % (user, ip3LetterCountry, repr(e)))
                continue
            values.append(('%s' % user,
                           '%s' % twoLetterCode,
                           '%s' % threeLetterCode,
                           '%s' % country))

        colNameTuple = ('anon_screen_name', 'two_letter_country', 'three_letter_country', 'country')
        self.db.bulkInsert(UserCountryTableCreator.DEST_TABLE, colNameTuple, values)

    def makeIndex(self):
        '''Create (if missing) the indexes downstream queries rely on.'''
        self.db.execute("CALL createIndexIfNotExists('UserCountryAnonIdx', 'UserCountry', 'anon_screen_name', 40);")
        self.db.execute("CALL createIndexIfNotExists('UserCountryThreeLetIdx', 'UserCountry', 'three_letter_country', 3);")

    def close(self):
        '''Release the underlying database connection.'''
        self.db.close()
 def __init__(self, user, pwd):
     '''
     Connect to the Edx database and ensure the destination
     table exists (it is created, not dropped, here).

     @param user: MySQL user name
     @type user: str
     @param pwd: MySQL password
     @type pwd: str
     '''
     self.ipCountryXlater = IpCountryDict()
     self.user = user
     self.pwd = pwd
     self.db = MySQLDB(user=self.user, passwd=self.pwd, db='Edx')
     # Make sure table exists. It should, and it should be filled
     # with all anon_screen_name and countries up the previous
     # load:
     # Column specs passed as a list of pairs: building an OrderedDict
     # from a dict literal loses the intended column order on
     # interpreters without ordered dicts (Python < 3.7):
     self.db.createTable(
         UserCountryTableCreator.DEST_TABLE,
         OrderedDict([
             ('anon_screen_name', 'varchar(40) NOT NULL DEFAULT ""'),
             ('two_letter_country', 'varchar(2) NOT NULL DEFAULT ""'),
             ('three_letter_country', 'varchar(3) NOT NULL DEFAULT ""'),
             ('country', 'varchar(255) NOT NULL DEFAULT ""')
         ]))
 def __init__(self, user, pwd):
     '''
     Open a connection to the Edx database and rebuild the
     UserCountry table from scratch.

     @param user: MySQL user name
     @type user: str
     @param pwd: MySQL password
     @type pwd: str
     '''
     # Translator from 3-letter country codes to full country info:
     self.ipCountryXlater = IpCountryDict()
     self.user = user
     self.pwd = pwd
     self.db = MySQLDB(user=self.user, passwd=self.pwd, db='Edx')
     # Make sure table exists. It should, and it should be filled
     # with all anon_screen_name and countries up the previous
     # load:
     ddl = '''CREATE TABLE UserCountry (
                      anon_screen_name varchar(40) NOT NULL DEFAULT "",
                      two_letter_country varchar(2) NOT NULL DEFAULT "",
                      three_letter_country varchar(3) NOT NULL DEFAULT "",
                      country varchar(255) NOT NULL DEFAULT ""
                      ) ENGINE=MyISAM;
                      '''
     # Drop any stale copy, then recreate:
     self.db.dropTable('UserCountry')
     print("Creating table UserCountry...")
     self.db.execute(ddl)
     print("Done creating table UserCountry.")
class UserCountryTableCreator(object):
    '''
    Builds and fills the UserCountry table, mapping each
    anon_screen_name in EventXtract to two/three-letter country
    codes and the full country name.
    '''

    DEST_TABLE = 'UserCountry'

    def __init__(self, user, pwd):
        '''
        Connect to MySQL and (re)create the destination table.

        @param user: MySQL user name
        @type user: str
        @param pwd: MySQL password
        @type pwd: str
        '''
        self.ipCountryXlater = IpCountryDict()
        self.user = user
        self.pwd = pwd
        self.db = MySQLDB(user=self.user, passwd=self.pwd, db='Edx')
        self.db.dropTable(UserCountryTableCreator.DEST_TABLE)
        # Column specs passed as a list of pairs: building an OrderedDict
        # from a dict literal loses the intended column order on
        # interpreters without ordered dicts (Python < 3.7):
        self.db.createTable(
            UserCountryTableCreator.DEST_TABLE,
            OrderedDict([
                ('anon_screen_name', 'varchar(40) NOT NULL DEFAULT ""'),
                ('two_letter_country', 'varchar(2) NOT NULL DEFAULT ""'),
                ('three_letter_country', 'varchar(3) NOT NULL DEFAULT ""'),
                ('country', 'varchar(255) NOT NULL DEFAULT ""')
            ]))

    def fillTable(self):
        '''
        Translate each distinct (anon_screen_name, ip_country) pair from
        EventXtract to country codes/names, then bulk-insert the results.
        Pairs whose country cannot be resolved are reported on stderr
        and skipped.
        '''
        values = []
        for (user, ip3LetterCountry) in self.db.query(
                "SELECT DISTINCT anon_screen_name, ip_country FROM EventXtract"
        ):
            try:
                (twoLetterCode, threeLetterCode, country
                 ) = self.ipCountryXlater.getBy3LetterCode(ip3LetterCountry)
            except (ValueError, TypeError, KeyError) as e:
                # repr(e) replaces the Python-2-only backtick syntax:
                sys.stderr.write(
                    "Could not look up one country from (%s/%s): %s\n" %
                    (user, ip3LetterCountry, repr(e)))
                continue
            values.append(
                ('%s' % user,
                 '%s' % twoLetterCode,
                 '%s' % threeLetterCode,
                 '%s' % country))

        colNameTuple = ('anon_screen_name', 'two_letter_country',
                        'three_letter_country', 'country')
        self.db.bulkInsert(UserCountryTableCreator.DEST_TABLE, colNameTuple,
                           values)

    def makeIndex(self):
        '''Create (if missing) the indexes downstream queries rely on.'''
        self.db.execute(
            "CALL createIndexIfNotExists('UserCountryAnonIdx', 'UserCountry', 'anon_screen_name', 40);"
        )
        self.db.execute(
            "CALL createIndexIfNotExists('UserCountryThreeLetIdx', 'UserCountry', 'three_letter_country', 3);"
        )

    def close(self):
        '''Release the underlying database connection.'''
        self.db.close()
Ejemplo n.º 7
0
    def __init__(self):
        '''
        Initializes extractor object with credentials from .ssh directory,
        configures logging, and connects to the EdxQualtrics database.
        Exits the process if any credential file is missing.
        '''
        home = expanduser("~")
        userFile = home + '/.ssh/qualtrics_user'
        tokenFile = home + '/.ssh/qualtrics_token'
        dbFile = home + "/.ssh/mysql_user"
        # Fail fast on missing credential files ('not ...' replaces
        # the '== False' comparisons):
        if not os.path.isfile(userFile):
            sys.exit("User file not found: " + userFile)
        if not os.path.isfile(tokenFile):
            sys.exit("Token file not found: " + tokenFile)
        if not os.path.isfile(dbFile):
            sys.exit("MySQL user credentials not found: " + dbFile)

        self.apiuser = None
        self.apitoken = None
        dbuser = None  #@UnusedVariable
        dbpass = None  #@UnusedVariable

        with open(userFile, 'r') as f:
            self.apiuser = f.readline().rstrip()

        with open(tokenFile, 'r') as f:
            self.apitoken = f.readline().rstrip()

        with open(dbFile, 'r') as f:
            dbuser = f.readline().rstrip()
            dbpass = f.readline().rstrip()

        # Take one timestamp so the log file name cannot mix values from
        # different moments (the original called today()/now() five times,
        # which could straddle a minute boundary):
        now = dt.datetime.now()
        logging.basicConfig(
            filename="EdxQualtricsETL_%d%d%d_%d%d.log" %
            (now.year, now.month, now.day, now.hour, now.minute),
            level=logging.INFO)

        self.lookup = IpCountryDict()

        MySQLDB.__init__(self, db="EdxQualtrics", user=dbuser, passwd=dbpass)
 def __init__(self, user, pwd):
     '''
     Connect to the Edx database and (re)create the destination table.

     @param user: MySQL user name
     @type user: str
     @param pwd: MySQL password
     @type pwd: str
     '''
     self.ipCountryXlater = IpCountryDict()
     self.user = user
     self.pwd  = pwd
     self.db = MySQLDB(user=self.user, passwd=self.pwd, db='Edx')
     self.db.dropTable(UserCountryTableCreator.DEST_TABLE)
     # Column specs passed as a list of pairs: building an OrderedDict
     # from a dict literal loses the intended column order on
     # interpreters without ordered dicts (Python < 3.7):
     self.db.createTable(
         UserCountryTableCreator.DEST_TABLE,
         OrderedDict([
             ('anon_screen_name', 'varchar(40) NOT NULL DEFAULT ""'),
             ('two_letter_country', 'varchar(2) NOT NULL DEFAULT ""'),
             ('three_letter_country', 'varchar(3) NOT NULL DEFAULT ""'),
             ('country', 'varchar(255) NOT NULL DEFAULT ""')
         ]))
    def __init__(self, ip_list_file):
        '''
        Read IP addresses (one per line, possibly comma-separated)
        from ip_list_file and print each with its country to stdout.

        @param ip_list_file: path to the file of IP addresses
        @type ip_list_file: str
        '''
        countryDict = IpCountryDict()
        with open(ip_list_file, 'r') as in_fd:
            for raw_line in in_fd:
                candidate = raw_line.strip()
                # Blank line: emit a placeholder row and move on.
                if not candidate:
                    print('%s\t%s' % ('<empty>', 'n/a'))
                    continue

                # Loopback needs no lookup:
                if candidate == '127.0.0.1':
                    print('%s\t%s\t' % (candidate, 'localhost'))
                    continue
                # A line may hold several comma-separated IPs; resolve each:
                try:
                    for one_ip in candidate.split(','):
                        resolved = countryDict.get(one_ip.strip(), 'n/a')
                        sys.stdout.write('%s\t%s' % (one_ip, resolved))
                except ValueError:
                    print('%s\t%s' % (candidate, 'n/a'))
                    continue
                sys.stdout.write('\n')
 def __init__(self, ip_list_file):
     '''
     Read IP addresses (one per line, possibly comma-separated)
     from ip_list_file and print each with its country to stdout.

     @param ip_list_file: path to the file of IP addresses
     @type ip_list_file: str
     '''
     countryDict = IpCountryDict()
     with open(ip_list_file, 'r') as fd:
         for ip in fd:
             ip = ip.strip()
             # Blank line: emit a placeholder row:
             if len(ip) == 0:
                 print('%s\t%s' % ('<empty>','n/a'))
                 continue
              
             # Loopback needs no lookup:
             if ip == '127.0.0.1':
                 print('%s\t%s\t' % (ip,'localhost'))
                 continue
             # If there are multiple, comma-separated ips, do them all:
             ips = ip.split(',')
             try:
                 for the_ip in ips:
                     country = countryDict.get(the_ip.strip(), 'n/a')
                     sys.stdout.write('%s\t%s' % (the_ip,country))
             except ValueError as e:
                 # Bad address: report the whole original line as n/a.
                 print('%s\t%s' % (ip,'n/a'))
                 continue
             sys.stdout.write('\n')
Ejemplo n.º 11
0
 def __init__(self, user, pwd):
     '''
     Connect to the Edx database and rebuild the detailed
     location table (DEST_TABLE) from scratch.

     @param user: MySQL user name
     @type user: str
     @param pwd: MySQL password
     @type pwd: str
     '''
     self.ipCountryXlater = IpCountryDict()
     self.user = user
     self.pwd = pwd
     self.db = MySQLDB(user=self.user, passwd=self.pwd, db='Edx')
     # Make sure table exists. It should, and it should be filled
     # with all anon_screen_name and countries up the previous
     # load:
     createCmd = '''CREATE TABLE %s (
                      anon_screen_name varchar(40) NOT NULL DEFAULT "",
                      two_letter_country varchar(2) NOT NULL DEFAULT "",
                      three_letter_country varchar(3) NOT NULL DEFAULT "",
                      country varchar(255) NOT NULL DEFAULT "",
                      region varchar(255) NOT NULL DEFAULT "",
                      city varchar(255) NOT NULL DEFAULT "",
                      lat_long point NOT NULL
                      ) ENGINE=MyISAM;
                      ''' % UserDetailedLocationTableCreator.DEST_TABLE
     # Drop the same table we are about to create; the original
     # hard-coded 'UserCountry' here, which leaves a stale DEST_TABLE
     # in place whenever DEST_TABLE != 'UserCountry':
     self.db.dropTable(UserDetailedLocationTableCreator.DEST_TABLE)
     # Fixed: the original format string was "Creating table %...",
     # which raises ValueError (invalid format character) at runtime:
     print("Creating table %s..." %
           UserDetailedLocationTableCreator.DEST_TABLE)
     self.db.execute(createCmd)
     print("Done creating table %s." %
           UserDetailedLocationTableCreator.DEST_TABLE)
Ejemplo n.º 12
0
    def __init__(self):
        '''
        Initializes extractor object with credentials from .ssh directory,
        configures logging, and connects to the EdxQualtrics database.
        Exits the process if any credential file is missing.
        '''
        home = expanduser("~")
        userFile = home + '/.ssh/qualtrics_user'
        tokenFile = home + '/.ssh/qualtrics_token'
        dbFile = home + "/.ssh/mysql_user"
        # Fail fast on missing credential files ('not ...' replaces
        # the '== False' comparisons):
        if not os.path.isfile(userFile):
            sys.exit("User file not found: " + userFile)
        if not os.path.isfile(tokenFile):
            sys.exit("Token file not found: " + tokenFile)
        if not os.path.isfile(dbFile):
            sys.exit("MySQL user credentials not found: " + dbFile)

        self.apiuser = None
        self.apitoken = None
        dbuser = None
        dbpass = None

        with open(userFile, 'r') as f:
            self.apiuser = f.readline().rstrip()

        with open(tokenFile, 'r') as f:
            self.apitoken = f.readline().rstrip()

        with open(dbFile, 'r') as f:
            dbuser = f.readline().rstrip()
            dbpass = f.readline().rstrip()

        # Take one timestamp so the log file name cannot mix values from
        # different moments (the original called today()/now() five times,
        # which could straddle a minute boundary):
        now = dt.datetime.now()
        logging.basicConfig(
            filename="EdxQualtricsETL_%d%d%d_%d%d.log" %
            (now.year, now.month, now.day, now.hour, now.minute),
            level=logging.INFO)

        self.lookup = IpCountryDict()

        MySQLDB.__init__(self, db="EdxQualtrics", user=dbuser, passwd=dbpass)
Ejemplo n.º 13
0
class IpBatchLookup(object):
    '''
    Given a CSV file in which one column contains IP addresses,
    append a column at the end that contains the country of that
    IP address. The resulting CSV is written to standard out. 
    
    An arbitrary number of lines may be transferred to the destination
    without processing: used for headers. The string 'country' is 
    added to each of these rows.
    '''
    def __init__(self,
                 input_file,
                 ip_position,
                 output_file=None,
                 ignore_lines=0):
        '''
        Runs processing from beginning to end.
        
        @param input_file: path to CSV file containing the IP address column 
        @type input_file: str
        @param ip_position: zero-origin index of column that contains the IP address.
        @type ip_position: int
        @param output_file: optional output file for the result CSV; None writes to stdout
        @type output_file: {str | None}
        @param ignore_lines: number of lines to copy to destination unprocessed. Used for headers.
        @type ignore_lines: {int}
        '''

        if not os.access(input_file, os.R_OK):
            print("Input file %s not found or not readable." % input_file)
            sys.exit(1)

        self.country_dict = IpCountryDict()

        # Get lines to pass through, and array of row-arrays:
        (headers, rows) = self.read_csvlines(input_file, ignore_lines)
        self.write_country_state_city(headers, rows, ip_position, output_file)

    def read_csvlines(self, input_file, ignore_lines):
        '''
        Read all of the CSV file into memory, and return
        a tuple: the pass-through rows with 'country' appended
        to each, and the array of rows.
        
        @param input_file: path to file with CSV input
        @type input_file: str
        @param ignore_lines: how many lines to pass through
        @type ignore_lines: int
        @return: 2-tuple: a possibly empty array of pass-through rows
            ('country' attached), and the array of data rows.
        @rtype: ([[str]], [[str]])
        '''
        headers = []
        with open(input_file, 'r') as csvfile:
            csv_reader = reader(csvfile)
            try:
                # Get each header line, add 'country',
                # and add to the 'headers' result.
                # next() builtin instead of the Python-2-only
                # .next() method:
                for _ in range(ignore_lines):
                    row = next(csv_reader)
                    headers.append(row + ['country'])
            except StopIteration:
                # Fewer lines in file than supposed header lines.
                # Return a 2-tuple so the caller's unpacking works
                # (the original returned a bare [], which made
                # '(headers, rows) = ...' raise ValueError):
                return (headers, [])
            # Read the row-arrays, enclosing them
            # all as an array: [[col1,1, col1,2], [col2,1, col2,2]]:
            rows = [row for row in csv_reader]
            return (headers, rows)

    def write_country_state_city(self, header_rows, rows, ip_position,
                                 output_file):
        '''
        The actual lookup: append the country of each row's IP to the
        row and write it out.
        
        @param header_rows: possibly empty row of header lines
        @type header_rows: [[str]]
        @param rows: rows containing IP addresses to resolve to countries
        @type rows: [[str]]
        @param ip_position: zero-origin index to IP-containing column in each rows.
        @type ip_position: int
        @param output_file: output file; if None, write to stdout.
        @type output_file: {str | None}
        '''

        # Ensure the_output_file variable holds
        # an open file descriptor:

        if output_file is None:
            the_output_file = sys.stdout
        elif isinstance(output_file, str):
            the_output_file = open(output_file, 'w')
        else:
            # Assume an already-open file-like object:
            the_output_file = output_file

        # Ignore header rows: just copy them to the output:
        for header_row in header_rows:
            the_output_file.write(','.join(header_row) + '\n')

        # The actual work:
        for row in rows:
            try:
                (twoLetter, threeLetter, country) = self.country_dict.lookupIP(
                    row[ip_position])  #@UnusedVariable
            except ValueError:
                country = 'Bad ip %s' % row[ip_position]
            row.append(country)
            the_output_file.write(','.join(row) + '\n')

        # If output_file parameter was a string, we
        # opened a file; close it:
        if isinstance(output_file, str):
            the_output_file.close()
Ejemplo n.º 14
0
 def setUp(self):
     '''Build the IP lookup table once, shared by all tests in this class.'''
     super(IpToCountryTester, self).setUp()
     # Stored on the class so the lookup table is not rebuilt per test:
     IpToCountryTester.lookup = IpCountryDict()
class UserCountryTableCreator(object):

    DEST_TABLE = 'UserCountry'
    # Number of anon ids-country-2-letter-3-letter
    # tuples to accumulate before inserting into
    # UserCountry:
    INSERT_BULK_SIZE = 15000

    def __init__(self, user, pwd):
        self.ipCountryXlater = IpCountryDict()
        self.user = user
        self.pwd = pwd
        self.db = MySQLDB(user=self.user, passwd=self.pwd, db='Edx')
        # Make sure table exists. It should, and it should be filled
        # with all anon_screen_name and countries up the previous
        # load:
        createCmd = '''CREATE TABLE UserCountry (
                         anon_screen_name varchar(40) NOT NULL DEFAULT "",
                         two_letter_country varchar(2) NOT NULL DEFAULT "",
                         three_letter_country varchar(3) NOT NULL DEFAULT "",
                         country varchar(255) NOT NULL DEFAULT ""
                         ) ENGINE=MyISAM;
                         '''
        self.db.dropTable('UserCountry')
        print("Creating table UserCountry...")
        self.db.execute(createCmd)
        print("Done creating table UserCountry.")

    def fillTable(self):
        query = "SELECT DISTINCT anon_screen_name, ip_country FROM EventXtract"
        query_res_it = self.db.query(query)
        done = False
        # Order of columns for insert:
        colNameTuple = ('anon_screen_name', 'two_letter_country',
                        'three_letter_country', 'country')

        while not done:
            values = []
            print("%s: Starting one set of %s lookups..." %\
                  (str(datetime.datetime.today()),
                   UserCountryTableCreator.INSERT_BULK_SIZE))
            for _ in range(UserCountryTableCreator.INSERT_BULK_SIZE):
                try:
                    (anon_screen_name, ip3LetterCountry) = query_res_it.next()
                except StopIteration:
                    done = True
                    break
                # Try translating:
                try:
                    (twoLetterCode, threeLetterCode,
                     country) = self.ipCountryXlater.getBy3LetterCode(
                         ip3LetterCountry)
                except (ValueError, TypeError, KeyError):
                    twoLetterCode = 'XX'
                    threeLetterCode = 'XXX'
                    country = 'Not in lookup tbl'
                    #sys.stderr.write("Could not look up one country from (%s/%s): %s\n" % (user, ip3LetterCountry,`e`))
                values.append(
                    tuple([
                        '%s' % anon_screen_name,
                        '%s' % twoLetterCode,
                        '%s' % threeLetterCode,
                        '%s' % country
                    ]))

            # Insert this chunk into the UserCountry table
            print("%s: Inserting %s rows into UserCountry table..." %
                  (str(datetime.datetime.today()), len(values)))
            (errors,
             warnings) = self.db.bulkInsert(UserCountryTableCreator.DEST_TABLE,
                                            colNameTuple, values)
            if errors is not None:
                print('Error(s) during UserCountry insert: %s' % errors)
                sys.exit(1)
            if warnings is not None:
                print('Warning(s) during UserCountry insert: %s' % warnings)

            print("%s: Done inserting %s rows into UserCountry table..." %
                  (str(datetime.datetime.today()), len(values)))
            # ... and loop to process the next INSERT_BULK_SIZE batch

    def makeIndex(self):
        self.db.execute(
            "CALL createIndexIfNotExists('UserCountryAnonIdx', 'UserCountry', 'anon_screen_name', 40);"
        )
        self.db.execute(
            "CALL createIndexIfNotExists('UserCountryThreeLetIdx', 'UserCountry', 'three_letter_country', 3);"
        )

    def close(self):
        self.db.close()
Ejemplo n.º 16
0
class QualtricsExtractor(MySQLDB):

    def __init__(self):
        '''
        Initializes extractor object with credentials from .ssh directory,
        configures logging, and connects to the EdxQualtrics database.
        Exits the process if any credential file is missing.
        '''
        home = expanduser("~")
        userFile = home + '/.ssh/qualtrics_user'
        tokenFile = home + '/.ssh/qualtrics_token'
        dbFile = home + "/.ssh/mysql_user"
        # Fail fast on missing credential files ('not ...' replaces
        # the '== False' comparisons):
        if not os.path.isfile(userFile):
            sys.exit("User file not found: " + userFile)
        if not os.path.isfile(tokenFile):
            sys.exit("Token file not found: " + tokenFile)
        if not os.path.isfile(dbFile):
            sys.exit("MySQL user credentials not found: " + dbFile)

        self.apiuser = None
        self.apitoken = None
        dbuser = None
        dbpass = None

        with open(userFile, 'r') as f:
            self.apiuser = f.readline().rstrip()

        with open(tokenFile, 'r') as f:
            self.apitoken = f.readline().rstrip()

        with open(dbFile, 'r') as f:
            dbuser = f.readline().rstrip()
            dbpass = f.readline().rstrip()

        # Take one timestamp so the log file name cannot mix values from
        # different moments (the original called today()/now() five times,
        # which could straddle a minute boundary):
        now = dt.datetime.now()
        logging.basicConfig(
            filename="EdxQualtricsETL_%d%d%d_%d%d.log" %
            (now.year, now.month, now.day, now.hour, now.minute),
            level=logging.INFO)

        self.lookup = IpCountryDict()

        MySQLDB.__init__(self, db="EdxQualtrics", user=dbuser, passwd=dbpass)

## Database setup helper methods for client

    def resetSurveys(self):
        '''
        Drop and recreate the `choice` and `question` tables,
        leaving both empty.
        '''
        # Remove any existing copies first:
        for dropStmt in ("DROP TABLE IF EXISTS `choice`;",
                         "DROP TABLE IF EXISTS `question`;"):
            self.execute(dropStmt)

        choiceDdl = """
                            CREATE TABLE IF NOT EXISTS `choice` (
                              `SurveyId` varchar(50) DEFAULT NULL,
                              `QuestionId` varchar(50) DEFAULT NULL,
                              `ChoiceId` varchar(50) DEFAULT NULL,
                              `description` varchar(3000) DEFAULT NULL
                            ) ENGINE=MyISAM DEFAULT CHARSET=utf8;
                            """

        questionDdl = """
                            CREATE TABLE IF NOT EXISTS `question` (
                              `SurveyID` varchar(50) DEFAULT NULL,
                              `QuestionID` varchar(5000) DEFAULT NULL,
                              `QuestionDescription` varchar(5000) DEFAULT NULL,
                              `ForceResponse` varchar(50) DEFAULT NULL,
                              `QuestionType` varchar(50) DEFAULT NULL,
                              `QuestionNumber` varchar(50) DEFAULT NULL
                            ) ENGINE=MyISAM DEFAULT CHARSET=utf8;
                            """

        # Recreate both tables:
        self.execute(choiceDdl)
        self.execute(questionDdl)

    def resetResponses(self):
        '''
        Drop and recreate the `response` and `response_metadata`
        tables, leaving both empty.
        '''
        self.execute("DROP TABLE IF EXISTS `response`;")
        self.execute("DROP TABLE IF EXISTS `response_metadata`;")

        # One row per answered question:
        responseTbl = (     """
                            CREATE TABLE IF NOT EXISTS `response` (
                              `SurveyId` varchar(50) DEFAULT NULL,
                              `ResponseId` varchar(50) DEFAULT NULL,
                              `QuestionNumber` varchar(50) DEFAULT NULL,
                              `AnswerChoiceId` varchar(500) DEFAULT NULL,
                              `Description` varchar(5000) DEFAULT NULL
                            ) ENGINE=MyISAM DEFAULT CHARSET=utf8;
                            """ )

        # One row per survey response, with respondent metadata:
        responseMetaTbl = ( """
                            CREATE TABLE IF NOT EXISTS `response_metadata` (
                              `SurveyID` varchar(50) DEFAULT NULL,
                              `ResponseID` varchar(50) DEFAULT NULL,
                              `Name` varchar(1200) DEFAULT NULL,
                              `EmailAddress` varchar(50) DEFAULT NULL,
                              `IPAddress` varchar(50) DEFAULT NULL,
                              `StartDate` datetime DEFAULT NULL,
                              `EndDate` datetime DEFAULT NULL,
                              `ResponseSet` varchar(50) DEFAULT NULL,
                              `ExternalDataReference` varchar(200) DEFAULT NULL,
                              `a` varchar(200) DEFAULT NULL,
                              `UID` varchar(200) DEFAULT NULL,
                              `userid` varchar(200) DEFAULT NULL,
                              `anon_screen_name` varchar(200) DEFAULT NULL,
                              `advance` varchar(200) DEFAULT NULL,
                              `Country` varchar(50) DEFAULT NULL,
                              `Finished` varchar(50) DEFAULT NULL,
                              `Status` varchar (200) DEFAULT NULL
                            ) ENGINE=MyISAM DEFAULT CHARSET=utf8;
                            """ )

        self.execute(responseTbl)
        self.execute(responseMetaTbl)

    def resetMetadata(self):
        '''
        Drop and recreate the `survey_meta` table, leaving it empty.
        '''
        # Remove any existing copy first:
        self.execute("DROP TABLE IF EXISTS `survey_meta`;")

        metaDdl = """
                            CREATE TABLE IF NOT EXISTS `survey_meta` (
                              `SurveyId` varchar(50) DEFAULT NULL,
                              `PodioID` varchar(50) DEFAULT NULL,
                              `SurveyCreationDate` datetime DEFAULT NULL,
                              `UserFirstName` varchar(200) DEFAULT NULL,
                              `UserLastName` varchar(200) DEFAULT NULL,
                              `SurveyName` varchar(2000) DEFAULT NULL,
                              `responses` varchar(50) DEFAULT NULL,
                              `responses_actual` int DEFAULT NULL
                            ) ENGINE=MyISAM DEFAULT CHARSET=utf8;
                            """

        self.execute(metaDdl)

## API extractor methods

    def __getSurveyMetadata(self):
        '''
        Fetch survey metadata from the Qualtrics v2.4 API and return
        it as a parsed JSON object.
        '''
        endpoint = "https://stanforduniversity.qualtrics.com/WRAPI/ControlPanel/api.php?API_SELECT=ControlPanel&Version=2.4&Request=getSurveys&User=%s&Token=%s&Format=JSON&JSONPrettyPrint=1" % (self.apiuser, self.apitoken)
        rawReply = urllib2.urlopen(endpoint).read()
        return json.loads(rawReply)

    def __genSurveyIDs(self, forceLoad=False):
        '''
        Generator for Qualtrics survey IDs. Yields only IDs for surveys
        with new data to pull from Qualtrics, unless forceLoad is True,
        in which case every survey ID is yielded.

        @param forceLoad: yield every survey regardless of response counts
        @type forceLoad: bool
        '''
        data = self.__getSurveyMetadata()
        surveys = data['Result']['Surveys']
        total = len(surveys)
        logging.info("Extracting %d surveys from Qualtrics..." % total)

        for idx, sv in enumerate(surveys):
            svID = sv['SurveyID']
            logging.info("Processing survey %d out of %d total: %s" % (idx+1, total, svID))
            # 'if forceLoad:' replaces the '== True' comparison:
            if forceLoad:
                yield svID
                continue

            # Responses Qualtrics reports vs. responses already loaded
            # locally (None from __numResponses counts as 0):
            payload = int(sv.pop('responses', 0))
            logging.info(" Found %d responses." % payload)
            existing = (self.__numResponses(svID) or 0)
            logging.info(" Have %d responses already." % existing)
            # (The original re-tested 'forceLoad == True' here; that
            # disjunct was unreachable because forceLoad already
            # yielded above.)
            if existing < payload:
                yield svID
            else:
                logging.info("  Survey %s yielded no new data." % svID)

    def __getSurvey(self, surveyID):
        '''
        Fetch the survey definition for surveyID from the Qualtrics
        v2.4 API and return it parsed as an ElementTree element.
        '''
        endpoint = "https://stanforduniversity.qualtrics.com//WRAPI/ControlPanel/api.php?API_SELECT=ControlPanel&Version=2.4&Request=getSurvey&User=%s&Token=%s&SurveyID=%s" % (self.apiuser, self.apitoken, surveyID)
        rawXml = urllib2.urlopen(endpoint).read()
        return ET.fromstring(rawXml)

    def __getResponses(self, surveyID):
        '''
        Pull response data for the given surveyID from Qualtrics: request
        a JSON export, poll until it completes (up to 20 tries, 5 seconds
        apart), then download and unzip the result.

        @param surveyID: Qualtrics survey identifier
        @type surveyID: str
        @return: parsed JSON data, or None on timeout or when the survey
            has no responses
        '''

        urlTemp = Template("https://dc-viawest.qualtrics.com:443/API/v1/surveys/${svid}/responseExports?apiToken=${tk}&fileType=JSON")
        reqURL = urlTemp.substitute(svid=surveyID, tk=self.apitoken)
        req = json.loads(urllib2.urlopen(reqURL).read())

        statURL = req['result']['exportStatus'] + "?apiToken=" + self.apitoken
        percent, tries = 0, 0
        while percent != 100 and tries < 20:
            time.sleep(5) # Wait 5 seconds between attempts to acquire data
            try:
                stat = json.loads(urllib2.urlopen(statURL).read())
                percent = stat['result']['percentComplete']
            except Exception:
                # Narrowed from a bare 'except:' so SystemExit and
                # KeyboardInterrupt are no longer swallowed here:
                logging.warning(" Recovered from HTTP error.")
                continue
            finally:
                tries += 1
        # Time out only if the export never completed. The original
        # tested 'tries >= 20', which misreported a poll that succeeded
        # on the 20th try as a timeout:
        if percent != 100:
            logging.error("  Survey %s timed out." % surveyID)
            return None

        # 'stat' is bound here: percent == 100 implies at least one
        # successful poll.
        dataURL = stat['result']['fileUrl']
        remote = urllib2.urlopen(dataURL).read()
        dataZip = sio.StringIO(remote)
        archive = z.ZipFile(dataZip, 'r')
        dataFile = archive.namelist()[0]
        data = json.loads(archive.read(dataFile), object_pairs_hook=OrderedDict)

        if not data['responses']:
            return None
        else:
            return data


## Helper methods for interfacing with DB

    def __assignPodioID(self, survey, surveyID):
        '''
        Given a survey from Qualtrics, finds embedded field 'c' and returns
        field value. For mapping surveys to course names via Podio project IDs.
        '''
        podioID = "NULL"
        try:
            for field in survey.findall('./EmbeddedData/Field'):
                if field.find('Name').text == 'c':
                    podioID = field.find('Value').text
        except AttributeError as e:
            logging.warning("%s podioID getter failed with error: %s" % (surveyID, e))

        # Update DB with retrieved Podio ID
        query = "UPDATE survey_meta SET PodioID='%s' WHERE SurveyId='%s'" % (podioID, surveyID)
        self.execute(query.encode('UTF-8', 'ignore'))

    def __isLoaded(self, svID):
        '''
        Checks survey_meta table for given surveyID. Returns 1 if loaded, 0 otherwise.
        '''
        q = "SELECT count(*) FROM survey_meta WHERE SurveyID='%s'" % svID
        row = self.query(q).next()
        return row[0]

    def __numResponses(self, svID):
        '''
        Given a survey ID, fetches number of responses loaded from survey_meta table.
        '''
        q = "SELECT responses_actual FROM survey_meta WHERE SurveyID='%s'" % svID
        return self.query(q).next()[0]

    def __getAnonUserID(self, uid):
        '''
        Given a userID from Qualtrics, returns translated anon user ID from
        platform data. Returns the string 'ERROR' if the lookup fails, so one
        bad uid cannot abort an entire response load (this matches the
        guarded variant of this method used by the sibling extractor class).
        '''
        q = "SELECT edxprod.idExt2Anon('%s')" % uid
        try:
            return self.query(q).next()[0]
        except Exception as e:
            logging.warning("Anon ID lookup failed for uid %s: %s" % (uid, e))
            return 'ERROR'


## Transform methods

    def __parseSurveyMetadata(self, rawMeta):
        '''
        Given survey metadata for active user, returns a list of dicts mapping
        column names to values for each survey (the original docstring claimed
        a dict of dicts). Skips over previously loaded surveys.
        '''
        # Column list is loop-invariant; build it once
        keys = ['SurveyID', 'SurveyName', 'SurveyCreationDate', 'UserFirstName', 'UserLastName', 'responses']
        svMeta = []
        for sv in rawMeta:
            svID = sv['SurveyID']
            if self.__isLoaded(svID):
                continue
            data = dict()
            for key in keys:
                try:
                    data[key] = sv[key].replace('"', '')
                except KeyError as k:
                    # Set value to NULL if no data found. k.args[0] is the
                    # missing key; indexing the exception object (k[0]) is a
                    # deprecated Python 2 idiom.
                    data[k.args[0]] = 'NULL'
            svMeta.append(data)  # Finally, add row to result list
        return svMeta

    def __parseSurvey(self, svID):
        '''
        Given surveyID, parses survey from Qualtrics and returns:
         1. a dict mapping db column names to values corresponding to survey questions
         2. a dict of dicts mapping db column names to choices for each question
        Method expects an XML ElementTree object corresponding to a single survey.
        Returns (None, None) when the survey cannot be fetched (HTTP error).
        '''
        # Get survey from surveyID
        sv=None
        try:
            sv = self.__getSurvey(svID)
        except urllib2.HTTPError:
            logging.warning("Survey %s not found." % svID)
            return None, None

        masterQ = dict()  # question index -> parsed question row
        masterC = dict()  # qID+cID -> parsed choice row

        # Handle PodioID mapping in survey_meta table
        self.__assignPodioID(sv, svID)

        # Parse data for each question
        questions = sv.findall('./Questions/Question')
        for idx, q in enumerate(questions):
            parsedQ = dict()
            qID = q.attrib['QuestionID']
            parsedQ['SurveyID'] = svID
            parsedQ['QuestionID'] = qID
            parsedQ['QuestionNumber'] = q.find('ExportTag').text
            parsedQ['QuestionType'] = q.find('Type').text
            # Bare excepts below default missing/malformed XML elements to 'NULL'
            try:
                parsedQ['ForceResponse'] = q.find('Validation/ForceResponse').text
            except:
                parsedQ['ForceResponse'] = 'NULL'
            try:
                # Strip double quotes; truncate to 2000 chars to fit the DB column
                text = q.find('QuestionDescription').text.replace('"', '')
                if len(text) > 2000:
                    text = text[0:2000]
                parsedQ['QuestionDescription'] = text
            except:
                parsedQ['QuestionDescription'] = 'NULL'

            masterQ[idx] = parsedQ

            # For each question, load all choices
            choices = q.findall('Choices/Choice')
            for c in choices:
                parsedC = dict()
                cID = c.attrib['ID']
                parsedC['SurveyID'] = svID
                parsedC['QuestionID'] = qID
                parsedC['ChoiceID'] = cID
                # Strip quotes from the description; 'N/A' when the element is empty
                cdesc = c.find('Description').text
                parsedC['Description'] = cdesc.replace("'", "").replace('"', '') if (cdesc is not None) else 'N/A'
                masterC[qID+cID] = parsedC

        return masterQ, masterC

    def __parseResponses(self, svID):
        '''
        Given a survey ID, parses responses from Qualtrics and returns:
        1. A list of dicts containing question responses
        2. A list of dicts containing response metadata
        (in that order; the original docstring listed them reversed).
        Returns (None, None) if no response data could be retrieved.
        '''
        # Get responses from Qualtrics-- try multiple times to ensure API request goes through
        rsRaw = None
        for _ in range(0, 10):
            try:
                rsRaw = self.__getResponses(svID)
                break
            except urllib2.HTTPError as e:
                logging.error("  Survey %s gave error '%s'." % (svID, e))
                # Retry on HTTP 400; give up on anything else. HTTPError.code
                # is an int, so the original comparison to the string '400'
                # never matched and 400s aborted instead of retrying.
                if e.code == 400:
                    continue
                else:
                    return None, None

        # Return if API gave us no data
        if rsRaw is None:
            logging.info("  Survey %s gave no responses." % svID)
            return None, None

        logging.info(" Parsing %s responses from survey %s..." % (len(rsRaw['responses']), svID))

        responses = []
        respMeta = []

        for rs in rsRaw['responses']:
            rsID = rs.pop('ResponseID', 'NULL')
            # Build response metadata. The pops destructively remove metadata
            # fields so that only question answers remain in rs afterwards.
            rm = dict()
            rm['SurveyID'] = svID
            rm['ResponseID'] = rsID
            rm['Name'] = rs.pop('RecipientFirstName', 'NULL') + rs.pop('RecipientLastName', 'NULL')
            rm['EmailAddress'] = rs.pop('RecipientEmail', 'NULL')
            rm['IPAddress'] = rs.pop('IPAddress', 'NULL')
            rm['StartDate'] = rs.pop('StartDate', 'NULL')
            rm['EndDate'] = rs.pop('EndDate', 'NULL')
            rm['ResponseSet'] = rs.pop('ResponseSet', 'NULL')
            rm['ExternalDataReference'] = rs.pop('ExternalDataReference', 'NULL')
            rm['a'] = rs.pop('a', 'NULL')
            rm['UID'] = rs.pop('uid', 'NULL')
            rm['userid'] = rs.pop('user_id', 'NULL')  # NOTE: Not transformed, use unclear
            # Default these so every row carries an identical key set:
            # __loadDB derives the column list from a single row, so rows
            # missing keys previously broke the bulk insert.
            rm['anon_screen_name'] = 'NULL'
            rm['Country'] = 'NULL'
            if len(rm['UID']) >= 40:
                rm['anon_screen_name'] = rm['UID']
            elif len(rm['a']) >= 32:
                rm['anon_screen_name'] = self.__getAnonUserID(rm['a'])
            if len(rm['IPAddress']) in range(8, 16):
                rm['Country'] = self.lookup.lookupIP(rm['IPAddress'])[1]
            elif len(rm['UID']) in range(8, 16):
                rm['Country'] = self.lookup.lookupIP(rm['UID'])[1]
            rm['advance'] = rs.pop('advance', 'NULL')
            rm['Finished'] = rs.pop('Finished', 'NULL')
            rm['Status'] = rs.pop('Status', 'NULL')
            # pop() with default instead of del: these keys are not always
            # present, and del raised KeyError for such responses.
            rs.pop('LocationLatitude', None)
            rs.pop('LocationLongitude', None)
            respMeta.append(rm)

            # Parse remaining fields as question answers
            for q in rs.keys():
                qs = dict()
                # Split fields like 'Q1_2' into question number and choice ID.
                # The original condition `'Q' and '_' in q` parsed as
                # ('Q') and ('_' in q), silently ignoring the 'Q' test.
                if 'Q' in q and '_' in q:
                    qSplit = q.split('_')
                    qNum = qSplit[0]
                    cID = qSplit[1]
                else:
                    qNum = q
                    cID = 'NULL'
                qs['SurveyID'] = svID
                qs['ResponseID'] = rsID
                qs['QuestionNumber'] = qNum
                qs['AnswerChoiceID'] = cID
                desc = rs[q].replace('"', '').replace("'", "").replace('\\', '').lstrip('u')
                if len(desc) >= 5000:
                    desc = desc[:5000]  # trim past max field length
                qs['Description'] = desc
                responses.append(qs)

        return responses, respMeta


## Convenience method for handling query calls to MySQL DB.

    def __loadDB(self, data, tableName):
        '''
        Convenience function for writing data to named table. Expects data to be
        represented as a non-empty list of dicts mapping column names to values;
        all rows must share the same key set.
        '''
        if not data:
            return  # nothing to insert
        try:
            # Derive column names from the first row. The original indexed
            # data[20], which raised IndexError -- and therefore silently
            # dropped the entire insert -- for any batch of 20 rows or fewer.
            columns = tuple(data[0].keys())
            table = [tuple(row.values()) for row in data]
            self.bulkInsert(tableName, columns, table)
        except Exception as e:
            logging.error("  Insert query failed: %s" % e)


## Client data load methods

    def loadSurveyMetadata(self):
        '''
        Client method extracts and transforms survey metadata and loads to MySQL
        database using query interface inherited from MySQLDB class.
        '''
        rawMeta = self.__getSurveyMetadata()
        parsedSM = self.__parseSurveyMetadata(rawMeta['Result']['Surveys'])
        # Only touch the DB when there is at least one new survey
        if parsedSM:
            self.__loadDB(parsedSM, 'survey_meta')

    def loadSurveyData(self):
        '''
        Client method extracts and transforms survey questions and question
        choices and loads to MySQL database using MySQLDB class methods.
        '''
        for svID in self.__genSurveyIDs(forceLoad=True):
            questions, choices = self.__parseSurvey(svID)
            # __parseSurvey returns (None, None) when the survey is unavailable
            if (questions is None) and (choices is None):
                continue
            self.__loadDB(questions.values(), 'question')
            self.__loadDB(choices.values(), 'choice')

    def loadResponseData(self, startAfter=0):
        '''
        Client method extracts and transforms response data and response metadata
        and loads to MySQL database using MySQLDB class methods. User can specify
        where to start in the list of surveyIDs.
        '''
        for idx, svID in enumerate(self.__genSurveyIDs()):
            # Skip the first startAfter surveys if the caller asked us to
            if idx < startAfter:
                logging.info("  Skipped surveyID %s" % svID)
                continue
            responses, respMeta = self.__parseResponses(svID)
            retrieved = len(respMeta) if respMeta is not None else 0
            logging.info(" Inserting %d responses on survey %s to database." % (retrieved, svID))
            # Record the actual response count even when nothing was parsed
            self.execute("UPDATE survey_meta SET responses_actual='%d' WHERE SurveyID='%s'" % (retrieved, svID))
            if (responses is None) or (respMeta is None):
                continue
            self.__loadDB(responses, 'response')
            self.__loadDB(respMeta, 'response_metadata')
class QualtricsExtractor(MySQLDB):

    def __init__(self):
        '''
        Initializes extractor object with credentials from the user's .ssh
        directory (Qualtrics API user/token and MySQL user/password), sets up
        logging, and opens the EdxQualtrics database connection.
        Exits the process if any credentials file is missing.
        '''
        home = expanduser("~")
        userFile = home + '/.ssh/qualtrics_user'
        tokenFile = home + '/.ssh/qualtrics_token'
        dbFile = home + "/.ssh/mysql_user"
        if not os.path.isfile(userFile):
            sys.exit("User file not found: " + userFile)
        if not os.path.isfile(tokenFile):
            sys.exit("Token file not found: " + tokenFile)
        if not os.path.isfile(dbFile):
            sys.exit("MySQL user credentials not found: " + dbFile)

        # Credentials: first line of each file (second line of dbFile is the password)
        with open(userFile, 'r') as f:
            self.apiuser = f.readline().rstrip()

        with open(tokenFile, 'r') as f:
            self.apitoken = f.readline().rstrip()

        with open(dbFile, 'r') as f:
            dbuser = f.readline().rstrip()
            dbpass = f.readline().rstrip()

        # Capture the timestamp once: the original called datetime.today()/now()
        # five separate times, which could straddle a minute (or day) boundary.
        # Fields are zero-padded so e.g. Jan 11 (0111) and Nov 1 (1101) cannot
        # collide in the log file name.
        now = dt.datetime.now()
        logging.basicConfig(
            filename="EdxQualtricsETL_%d%02d%02d_%02d%02d.log" % (now.year, now.month, now.day, now.hour, now.minute),
            level=logging.INFO)

        # IP -> country resolver used when parsing response metadata
        self.lookup = IpCountryDict()

        MySQLDB.__init__(self, db="EdxQualtrics", user=dbuser, passwd=dbpass)

## Database setup helper methods for client

    def resetSurveys(self):
        '''
        Drop and recreate the `choice` and `question` tables.
        Destructive: any previously loaded survey definitions are lost.
        '''
        self.execute("DROP TABLE IF EXISTS `choice`;")
        self.execute("DROP TABLE IF EXISTS `question`;")

        choiceTbl = ("""
                        CREATE TABLE IF NOT EXISTS `choice` (
                          `SurveyId` varchar(50) DEFAULT NULL,
                          `QuestionId` varchar(50) DEFAULT NULL,
                          `ChoiceId` varchar(50) DEFAULT NULL,
                          `description` varchar(3000) DEFAULT NULL
                        ) ENGINE=MyISAM DEFAULT CHARSET=utf8;
                        """)

        # NOTE(review): QuestionID is varchar(5000) here but the matching
        # column in `choice` is varchar(50) -- looks like an oversight;
        # confirm before tightening.
        questionTbl = ("""
                        CREATE TABLE IF NOT EXISTS `question` (
                          `SurveyID` varchar(50) DEFAULT NULL,
                          `QuestionID` varchar(5000) DEFAULT NULL,
                          `QuestionDescription` varchar(5000) DEFAULT NULL,
                          `ForceResponse` varchar(50) DEFAULT NULL,
                          `QuestionType` varchar(50) DEFAULT NULL,
                          `QuestionNumber` varchar(50) DEFAULT NULL
                        ) ENGINE=MyISAM DEFAULT CHARSET=utf8;
                        """)

        self.execute(choiceTbl)
        self.execute(questionTbl)

    def resetResponses(self):
        '''
        Drop and recreate the `response` and `response_metadata` tables and
        the RespondentMetadata view built over response_metadata.
        Destructive: any previously loaded responses are lost.
        '''
        self.execute("DROP TABLE IF EXISTS `response`;")
        self.execute("DROP TABLE IF EXISTS `response_metadata`;")
        self.execute("DROP VIEW IF EXISTS `RespondentMetadata`;")

        responseTbl = ("""
                        CREATE TABLE IF NOT EXISTS `response` (
                          `SurveyId` varchar(50) DEFAULT NULL,
                          `ResponseId` varchar(50) DEFAULT NULL,
                          `QuestionNumber` varchar(50) DEFAULT NULL,
                          `AnswerChoiceId` varchar(500) DEFAULT NULL,
                          `Description` varchar(5000) DEFAULT NULL
                        ) ENGINE=MyISAM DEFAULT CHARSET=utf8;
                        """)

        responseMetaTbl = ("""
                            CREATE TABLE IF NOT EXISTS `response_metadata` (
                              `SurveyID` varchar(50) DEFAULT NULL,
                              `ResponseID` varchar(50) DEFAULT NULL,
                              `Name` varchar(1200) DEFAULT NULL,
                              `EmailAddress` varchar(50) DEFAULT NULL,
                              `IPAddress` varchar(50) DEFAULT NULL,
                              `StartDate` datetime DEFAULT NULL,
                              `EndDate` datetime DEFAULT NULL,
                              `ResponseSet` varchar(50) DEFAULT NULL,
                              `Language` varchar(50) DEFAULT NULL,
                              `ExternalDataReference` varchar(200) DEFAULT NULL,
                              `a` varchar(200) DEFAULT NULL,
                              `UID` varchar(200) DEFAULT NULL,
                              `userid` varchar(200) DEFAULT NULL,
                              `anon_screen_name` varchar(200) DEFAULT NULL,
                              `advance` varchar(200) DEFAULT NULL,
                              `Country` varchar(50) DEFAULT NULL,
                              `Finished` varchar(50) DEFAULT NULL,
                              `Status` varchar (200) DEFAULT NULL
                            ) ENGINE=MyISAM DEFAULT CHARSET=utf8;
                            """)

        # The original statement used CREATE VIEW IF NOT EXISTS (not valid
        # MySQL syntax) and had an unbalanced parenthesis before the column
        # list, so the view was never created. The view is dropped above, so
        # CREATE OR REPLACE VIEW is safe here.
        respondentView = ("""
                            CREATE OR REPLACE VIEW `RespondentMetadata`
                                (SurveyId, ResponseId, anon_screen_name, Country, StartDate, EndDate)
                            AS SELECT
                                SurveyID AS SurveyId,
                                ResponseID AS ResponseId,
                                anon_screen_name,
                                Country,
                                StartDate,
                                EndDate
                            FROM response_metadata;
                           """)

        self.execute(responseTbl)
        self.execute(responseMetaTbl)
        self.execute(respondentView)

    def resetMetadata(self):
        '''
        Drop and recreate the `survey_meta` table.
        Destructive: any previously loaded survey metadata is lost.
        '''
        self.execute("DROP TABLE IF EXISTS `survey_meta`;")

        createStmt = ("""
                        CREATE TABLE IF NOT EXISTS `survey_meta` (
                          `SurveyId` varchar(50) DEFAULT NULL,
                          `SurveyCreationDate` datetime DEFAULT NULL,
                          `UserFirstName` varchar(200) DEFAULT NULL,
                          `UserLastName` varchar(200) DEFAULT NULL,
                          `SurveyName` varchar(2000) DEFAULT NULL,
                          `responses` varchar(50) DEFAULT NULL,
                          `responses_actual` int DEFAULT NULL,
                          `course_display_name` varchar(255) DEFAULT NULL
                        ) ENGINE=MyISAM DEFAULT CHARSET=utf8;
                        """)
        self.execute(createStmt)

## API extractor methods

    def __getSurveyMetadata(self):
        '''
        Pull survey metadata for all surveys from the Qualtrics API v2.4
        and return the decoded JSON object.
        '''
        url = "https://stanforduniversity.qualtrics.com/WRAPI/ControlPanel/api.php?API_SELECT=ControlPanel&Version=2.4&Request=getSurveys&User=%s&Token=%s&Format=JSON&JSONPrettyPrint=1" % (self.apiuser, self.apitoken)
        return json.loads(urllib2.urlopen(url).read())

    def __genSurveyIDs(self, forceLoad=False):
        '''
        Generator for Qualtrics survey IDs. Generates only IDs for surveys with
        new data to pull from Qualtrics unless user specifies that load should be
        forced (forceLoad=True yields every survey ID).
        '''
        data = self.__getSurveyMetadata()
        surveys = data['Result']['Surveys']
        total = len(surveys)
        logging.info("Extracting %d surveys from Qualtrics..." % total)

        for idx, sv in enumerate(surveys):
            svID = sv['SurveyID']
            logging.info("Processing survey %d out of %d total: %s" % (idx + 1, total, svID))
            if forceLoad:
                yield svID
                continue

            # Yield only surveys with more responses than we have loaded.
            payload = int(sv.pop('responses', 0))
            logging.info(" Found %d responses." % payload)
            existing = (self.__numResponses(svID) or 0)
            logging.info(" Have %d responses already." % existing)
            # (the original re-tested `or forceLoad` here, which is
            # unreachable past the early yield above)
            if existing < payload:
                yield svID
            else:
                logging.info("  Survey %s yielded no new data." % svID)

    def __getSurvey(self, surveyID):
        '''
        Pull the survey definition for the given surveyID from the Qualtrics
        API v2.4 and return it as a parsed XML ElementTree element.
        '''
        url = "https://stanforduniversity.qualtrics.com//WRAPI/ControlPanel/api.php?API_SELECT=ControlPanel&Version=2.4&Request=getSurvey&User=%s&Token=%s&SurveyID=%s" % (self.apiuser, self.apitoken, surveyID)
        return ET.fromstring(urllib2.urlopen(url).read())

    def __getResponses(self, surveyID):
        '''
        Pull response data for given surveyID from Qualtrics. Kicks off a
        JSON responseExport, polls the status endpoint until the export
        completes (up to 20 tries, 5 seconds apart), then downloads and
        unzips the result.

        Returns the parsed JSON data (an OrderedDict), or None if the export
        timed out or contained no responses.
        '''
        # Request an asynchronous export of all responses for this survey
        urlTemp = Template("https://dc-viawest.qualtrics.com:443/API/v1/surveys/${svid}/responseExports?apiToken=${tk}&fileType=JSON")
        reqURL = urlTemp.substitute(svid=surveyID, tk=self.apitoken)
        req = json.loads(urllib2.urlopen(reqURL).read())

        # Poll the export status until it reports 100% complete
        statURL = req['result']['exportStatus'] + "?apiToken=" + self.apitoken
        stat = None
        percent, tries = 0, 0
        while percent != 100 and tries < 20:
            time.sleep(5)  # Wait 5 seconds between attempts to acquire data
            try:
                stat = json.loads(urllib2.urlopen(statURL).read())
                percent = stat['result']['percentComplete']
            except Exception:
                # Transient HTTP/JSON error; retry on the next iteration.
                # (Narrowed from a bare except, which also caught
                # KeyboardInterrupt/SystemExit.)
                logging.warning(" Recovered from HTTP error.")
                continue
            finally:
                tries += 1
        # Time out only when the export never actually completed. The original
        # tested tries >= 20, which also discarded an export that finished on
        # the 20th attempt, and could hit an unbound 'stat' in edge cases.
        if percent != 100 or stat is None:
            logging.error("  Survey %s timed out." % surveyID)
            return None

        # Download the zipped JSON export and parse the first archive member
        dataURL = stat['result']['fileUrl']
        remote = urllib2.urlopen(dataURL).read()
        dataZip = sio.StringIO(remote)
        archive = z.ZipFile(dataZip, 'r')
        dataFile = archive.namelist()[0]
        data = json.loads(archive.read(dataFile), object_pairs_hook=OrderedDict)

        if not data['responses']:
            return None
        else:
            return data


## Helper methods for interfacing with DB

    def __assignCDN(self, survey, surveyID):
        '''
        Given a survey from Qualtrics, finds embedded field 'c' and stores its
        value as course_display_name in survey_meta. Field contains unique
        course identifier from platform, for mapping to course log data.
        '''
        cdn = None
        try:
            for ef in survey.findall('./EmbeddedData/Field'):
                if ef.find('Name').text == 'c':
                    cdn = ef.find('Value').text
        except AttributeError as e:
            logging.warning("%s course identifier not resolved: %s" % (surveyID, e))
        # Write SQL NULL when no identifier was found; the original
        # interpolated the Python value and stored the literal string 'None'.
        if cdn is None:
            query = "UPDATE survey_meta SET course_display_name=NULL WHERE SurveyId='%s'" % surveyID
        else:
            query = "UPDATE survey_meta SET course_display_name='%s' WHERE SurveyId='%s'" % (cdn, surveyID)
        self.execute(query.encode('UTF-8', 'ignore'))

    def __isLoaded(self, svID):
        '''
        Checks survey_meta table for given surveyID. Returns 1 if loaded, 0 otherwise.
        '''
        q = "SELECT count(*) FROM survey_meta WHERE SurveyID='%s'" % svID
        row = self.query(q).next()
        return row[0]

    def __numResponses(self, svID):
        '''
        Given a survey ID, fetches number of responses loaded from survey_meta table.
        '''
        q = "SELECT responses_actual FROM survey_meta WHERE SurveyID='%s'" % svID
        return self.query(q).next()[0]

    def __getAnonUserID(self, uid):
        '''
        Given a userID from Qualtrics, returns translated anon user ID from
        platform data, or the string 'ERROR' if the lookup fails.
        '''
        q = "SELECT edxprod.idExt2Anon('%s')" % uid
        try:
            return self.query(q).next()[0]
        except Exception as e:
            # Narrowed from a bare except (which also swallowed SystemExit /
            # KeyboardInterrupt) and log the failure instead of hiding it.
            logging.warning("Anon ID lookup failed for uid %s: %s" % (uid, e))
            return 'ERROR'


## Transform methods

    def __parseSurveyMetadata(self, rawMeta):
        '''
        Given survey metadata for active user, returns a list of dicts mapping
        column names to values for each survey (the original docstring claimed
        a dict of dicts). Skips over previously loaded surveys.
        '''
        # Column list is loop-invariant; build it once
        keys = ['SurveyID', 'SurveyName', 'SurveyCreationDate', 'UserFirstName', 'UserLastName', 'responses']
        svMeta = []
        for sv in rawMeta:
            svID = sv['SurveyID']
            if self.__isLoaded(svID):
                continue
            data = dict()
            for key in keys:
                try:
                    data[key] = sv[key].replace('"', '')
                except KeyError as k:
                    # Set value to NULL if no data found. k.args[0] is the
                    # missing key; indexing the exception object (k[0]) is a
                    # deprecated Python 2 idiom.
                    data[k.args[0]] = 'NULL'
            svMeta.append(data)  # Finally, add row to result list
        return svMeta

    def __parseSurvey(self, svID):
        '''
        Given surveyID, parses survey from Qualtrics and returns:
         1. a dict mapping db column names to values corresponding to survey questions
         2. a dict of dicts mapping db column names to choices for each question
        Method expects an XML ElementTree object corresponding to a single survey.
        Returns (None, None) when the survey cannot be fetched (HTTP error).
        '''
        # Get survey from surveyID
        sv = None
        try:
            sv = self.__getSurvey(svID)
        except urllib2.HTTPError:
            logging.warning("Survey %s not found." % svID)
            return None, None

        masterQ = dict()  # question index -> parsed question row
        masterC = dict()  # qID+cID -> parsed choice row

        # Record the embedded course identifier in survey_meta
        self.__assignCDN(sv, svID)

        # Parse data for each question
        questions = sv.findall('./Questions/Question')
        for idx, q in enumerate(questions):
            parsedQ = dict()
            qID = q.attrib['QuestionID']
            parsedQ['SurveyID'] = svID
            parsedQ['QuestionID'] = qID
            parsedQ['QuestionNumber'] = q.find('ExportTag').text
            parsedQ['QuestionType'] = q.find('Type').text
            # Bare excepts below default missing/malformed XML elements to 'NULL'
            try:
                parsedQ['ForceResponse'] = q.find('Validation/ForceResponse').text
            except:
                parsedQ['ForceResponse'] = 'NULL'
            try:
                # Strip double quotes; truncate to 2000 chars to fit the DB column
                text = q.find('QuestionDescription').text.replace('"', '')
                if len(text) > 2000:
                    text = text[0:2000]
                parsedQ['QuestionDescription'] = text
            except:
                parsedQ['QuestionDescription'] = 'NULL'

            masterQ[idx] = parsedQ

            # For each question, load all choices
            choices = q.findall('Choices/Choice')
            for c in choices:
                parsedC = dict()
                cID = c.attrib['ID']
                parsedC['SurveyID'] = svID
                parsedC['QuestionID'] = qID
                parsedC['ChoiceID'] = cID
                # Strip quotes from the description; 'N/A' when the element is empty
                cdesc = c.find('Description').text
                parsedC['Description'] = cdesc.replace("'", "").replace('"', '') if (cdesc is not None) else 'N/A'
                masterC[qID + cID] = parsedC

        return masterQ, masterC

    def __parseResponses(self, svID):
        '''
        Given a survey ID, parses responses from Qualtrics and returns:
        1. A list of dicts containing question responses
        2. A list of dicts containing response metadata
        (in that order; the original docstring listed them reversed).
        Returns (None, None) if no response data could be retrieved.
        '''
        # Get responses from Qualtrics-- try multiple times to ensure API request goes through
        rsRaw = None
        for _ in range(0, 10):
            try:
                rsRaw = self.__getResponses(svID)
                break
            except urllib2.HTTPError as e:
                logging.error("  Survey %s gave error '%s'." % (svID, e))
                # Retry on HTTP 400; give up on anything else. HTTPError.code
                # is an int, so the original comparison to the string '400'
                # never matched and 400s aborted instead of retrying.
                if e.code == 400:
                    continue
                else:
                    return None, None

        # Return if API gave us no data
        if rsRaw is None:
            logging.info("  Survey %s gave no responses." % svID)
            return None, None

        logging.info(" Parsing %s responses from survey %s..." % (len(rsRaw['responses']), svID))

        responses = []
        respMeta = []

        for rs in rsRaw['responses']:
            rsID = rs.pop('ResponseID', 'NULL')
            # Build response metadata. The pops destructively remove metadata
            # fields so that only question answers remain in rs afterwards.
            rm = dict()
            rm['SurveyID'] = svID
            rm['ResponseID'] = rsID
            rm['Name'] = rs.pop('RecipientFirstName', 'NULL') + rs.pop('RecipientLastName', 'NULL')
            rm['EmailAddress'] = rs.pop('RecipientEmail', 'NULL')
            rm['IPAddress'] = rs.pop('IPAddress', 'NULL')
            rm['StartDate'] = rs.pop('StartDate', 'NULL')
            rm['EndDate'] = rs.pop('EndDate', 'NULL')
            rm['ResponseSet'] = rs.pop('ResponseSet', 'NULL')
            rm['Language'] = rs.pop('Q_Language', 'NULL')
            rm['ExternalDataReference'] = rs.pop('ExternalDataReference', 'NULL')
            rm['a'] = rs.pop('a', 'NULL')
            rm['UID'] = rs.pop('uid', 'NULL')
            rm['userid'] = rs.pop('user_id', 'NULL')  # NOTE: Not transformed, use unclear
            # Default these so every row carries an identical key set:
            # __loadDB derives the column list from a single row, so rows
            # missing keys previously broke the bulk insert.
            rm['anon_screen_name'] = 'NULL'
            rm['Country'] = 'NULL'
            if len(rm['UID']) >= 40:
                rm['anon_screen_name'] = rm['UID']
            elif len(rm['a']) >= 32:
                rm['anon_screen_name'] = self.__getAnonUserID(rm['a'])
            if len(rm['IPAddress']) in range(8, 16):
                rm['Country'] = self.lookup.lookupIP(rm['IPAddress'])[1]
            elif len(rm['UID']) in range(8, 16):
                rm['Country'] = self.lookup.lookupIP(rm['UID'])[1]
            rm['advance'] = rs.pop('advance', 'NULL')
            rm['Finished'] = rs.pop('Finished', 'NULL')
            rm['Status'] = rs.pop('Status', 'NULL')
            # pop() with default instead of del: these keys are not always
            # present, and del raised KeyError for such responses.
            rs.pop('LocationLatitude', None)
            rs.pop('LocationLongitude', None)
            respMeta.append(rm)

            # Parse remaining fields as question answers
            for q in rs.keys():
                qs = dict()
                # Split fields like 'Q1_2' into question number and choice ID.
                # The original condition `'Q' and '_' in q` parsed as
                # ('Q') and ('_' in q), silently ignoring the 'Q' test.
                if 'Q' in q and '_' in q:
                    qSplit = q.split('_')
                    qNum = qSplit[0]
                    cID = qSplit[1]
                else:
                    qNum = q
                    cID = 'NULL'
                qs['SurveyID'] = svID
                qs['ResponseID'] = rsID
                qs['QuestionNumber'] = qNum
                qs['AnswerChoiceID'] = cID
                desc = rs[q].replace('"', '').replace("'", "").replace('\\', '').lstrip('u')
                if len(desc) >= 5000:
                    desc = desc[:5000]  # trim past max field length
                qs['Description'] = desc
                responses.append(qs)

        return responses, respMeta


## Convenience method for handling query calls to MySQL DB.

    def __loadDB(self, data, tableName):
        '''
        Convenience function for writing data to named table. Expects data to be
        represented as a non-empty list of dicts mapping column names to values;
        all rows must share the same key set.
        '''
        if not data:
            return  # nothing to insert
        try:
            # Derive column names from the first row. The original indexed
            # data[5], which raised IndexError -- and therefore silently
            # dropped the entire insert -- for any batch of 5 rows or fewer.
            columns = tuple(data[0].keys())
            table = [tuple(row.values()) for row in data]
            self.bulkInsert(tableName, columns, table)
        except Exception as e:
            logging.error("  Insert query failed: %s" % e)


## Build indexes.

    def buildIndexes(self):
        '''
        Drop and rebuild indexes over the survey tables.

        NOTE(review): MySQL has no DROP FULLTEXT INDEX statement -- the only
        form is DROP INDEX name ON table -- so the two fulltext drops below
        are corrected to plain DROP INDEX. Also note the drops will error on
        a fresh database where the indexes do not yet exist, and executing a
        multi-statement string requires the connection to allow it; confirm
        against the MySQLDB wrapper's behavior.
        '''
        dropIndexes = """
                    DROP INDEX idxSurveyMetaSurveyId
                    ON EdxQualtrics.survey_meta;

                    DROP INDEX idxSurveyMetaSurvNm
                    ON EdxQualtrics.survey_meta;

                    DROP INDEX idxSurveyMetaCrsName
                    ON EdxQualtrics.survey_meta;

                    DROP INDEX idxQuestionSurveyId
                    ON EdxQualtrics.question;

                    DROP INDEX idxQuestionDescription
                    ON EdxQualtrics.question;

                    DROP INDEX idxRespMetaSurveyId
                    ON EdxQualtrics.response_metadata;

                    DROP INDEX idxRespSurveyId
                    ON EdxQualtrics.response;

                    DROP INDEX idxRespChoiceId
                    ON EdxQualtrics.response;

                    DROP INDEX idxRespMetaAnonScrnNm
                    ON EdxQualtrics.response_metadata;

                    DROP INDEX idResponseDescription
                    ON response;

                    DROP INDEX idResponseQuestionNum
                    ON response;"""

        buildIndexes = """
                    CREATE INDEX idxSurveyMetaSurveyId
                    ON EdxQualtrics.survey_meta (SurveyID);

                    CREATE INDEX idxSurveyMetaSurvNm
                    ON EdxQualtrics.survey_meta (SurveyName(100));

                    CREATE INDEX idxSurveyMetaCrsName
                    ON EdxQualtrics.survey_meta (course_display_name(255));

                    CREATE INDEX idxQuestionSurveyId
                    ON EdxQualtrics.question (SurveyID);

                    CREATE FULLTEXT INDEX idxQuestionDescription
                    ON EdxQualtrics.question (QuestionDescription);

                    CREATE INDEX idxRespMetaSurveyId
                    ON EdxQualtrics.response_metadata (SurveyID);

                    CREATE INDEX idxRespSurveyId
                    ON EdxQualtrics.response (SurveyID);

                    CREATE INDEX idxRespChoiceId
                    ON EdxQualtrics.response (AnswerChoiceId);

                    CREATE INDEX idxRespMetaAnonScrnNm
                    ON EdxQualtrics.response_metadata (anon_screen_name);

                    CREATE FULLTEXT INDEX idResponseDescription
                    ON response (Description);

                    CREATE INDEX idResponseQuestionNum
                    ON response (QuestionNumber);"""

        self.execute(dropIndexes)
        self.execute(buildIndexes)

## Client data load methods

    def loadSurveyMetadata(self):
        '''
        Pull survey metadata from Qualtrics, parse it, and insert the
        parsed rows into the survey_meta table through the query
        interface inherited from MySQLDB.
        '''
        rawResponse = self.__getSurveyMetadata()
        surveyList = rawResponse['Result']['Surveys']
        parsedRows = self.__parseSurveyMetadata(surveyList)
        # Nothing to insert when parsing produced no rows.
        if not parsedRows:
            return
        self.__loadDB(parsedRows, 'survey_meta')

    def loadSurveyData(self):
        '''
        Client method extracts and transforms survey questions and question
        choices and loads to MySQL database using MySQLDB class methods.

        Fixes over the original:
        - The skip condition used 'and', so when exactly one of
          questions/choices came back None the later .values() call
          raised AttributeError. A survey is now skipped when EITHER
          piece is missing.
        - The .values() results are materialized with list() so the
          DB loader (which indexes into its data argument, e.g. data[5])
          works on Python 3, where dict.values() returns a non-indexable
          view.
        '''
        sids = self.__genSurveyIDs(forceLoad=True)
        for svID in sids:
            questions, choices = self.__parseSurvey(svID)
            # Skip surveys that failed to parse, even partially.
            if (questions is None) or (choices is None):
                continue
            self.__loadDB(list(questions.values()), 'question')
            self.__loadDB(list(choices.values()), 'choice')

    def loadResponseData(self, startAfter=0):
        '''
        Fetch survey responses and response metadata for every survey and
        load them into the 'response' and 'response_metadata' tables via
        MySQLDB class methods. The first startAfter surveys in the ID
        sequence are skipped, allowing an interrupted run to be resumed.

        @param startAfter: number of leading surveys to skip
        @type startAfter: int
        '''
        for surveyNum, surveyID in enumerate(self.__genSurveyIDs()):
            # Resume support: skip the first startAfter surveys.
            if surveyNum < startAfter:
                logging.info("  Skipped surveyID %s" % surveyID)
                continue
            responses, responseMeta = self.__parseResponses(surveyID)
            numRetrieved = 0 if responseMeta is None else len(responseMeta)
            logging.info(" Inserting %d responses on survey %s to database." % (numRetrieved, surveyID))
            # Record the actual response count even when the load below is
            # skipped. NOTE(review): SQL is built by string interpolation;
            # surveyID is assumed to be a trusted Qualtrics-issued ID --
            # confirm, or use a parameterized query if execute() allows it.
            self.execute("UPDATE survey_meta SET responses_actual='%d' WHERE SurveyID='%s'" % (numRetrieved, surveyID))
            if (responses is None) or (responseMeta is None):
                continue
            self.__loadDB(responses, 'response')
            self.__loadDB(responseMeta, 'response_metadata')