def unify(cur):
    """
    Fold newly reduced tables into reduced_log.unified.

    After a new table has been processed by reduce_log(), call this to
    carve a matching partition out of the catch-all 'other' partition
    and copy the table's rows into unified.
    """
    # Tables in reduced_log that are bookkeeping, not per-period data
    special = set(['unified', 'users', 'servers', 'unified_users', 'unified_servers'])

    cur.execute("USE reduced_log")
    cur.execute('SHOW TABLES')
    existing = set(name for name, in cur.fetchall())

    cur.execute("""SELECT PARTITION_NAME
                   FROM INFORMATION_SCHEMA.PARTITIONS
                   WHERE TABLE_SCHEMA = 'reduced_log'
                         AND TABLE_NAME = 'unified'""")
    already_partitioned = set(name for name, in cur.fetchall())

    # Every data table not yet represented as a partition gets folded in
    for table in sorted(existing - already_partitioned - special):
        print_and_execute("""ALTER TABLE unified REORGANIZE PARTITION other INTO ({0},
                  PARTITION other VALUES LESS THAN MAXVALUE)""".format(partition_from_str(table)), cur)
        print_and_execute("INSERT INTO unified SELECT * FROM {0}".format(table), cur)
def create_unified(cur):
    """
    Build reduced_log.unified from scratch.

    Run this when the unified table does not exist yet, or after a
    schema change, to regenerate it from the first two reduced tables.

    @Precondition: all the tables in @initial_tables must be in the reduced log
    """

    cur.execute("USE reduced_log")

    # Seed the unified table from the first two reduced tables
    cur.execute("SHOW TABLES")
    initial_tables = [name for name, in cur.fetchall()][:2]

    union_sql = " UNION ALL ".join("SELECT * FROM {0}".format(t) for t in initial_tables)
    print_and_execute("CREATE TABLE unified {0}".format(union_sql), cur)

    # Index the columns that the profiling queries filter and group on
    for column in ('userid', 'serverid', 'event_time'):
        print_and_execute("ALTER TABLE unified ADD INDEX ({0})".format(column), cur)

    # Range-partition by day so each reduced table maps to one partition,
    # plus a catch-all 'other' partition for everything beyond
    print_and_execute("ALTER TABLE unified PARTITION BY RANGE( TO_DAYS(event_time) ) ( " +
                      ", ".join(partition_from_str(t) for t in initial_tables) +
                      ", PARTITION other VALUES LESS THAN MAXVALUE" + ")", cur)
Example #3
0
    def create_new_temp_table(self):
        """
        Materialize the current filter into a fresh temporary table.

        Applies self.fil's SQL to the current table, stores the result
        under the next table name with indexes on userid/serverid, and
        records which filter produced it in self.last_used_fil.
        """
        source = self.current_table_name()
        destination = self.next_table_name()

        # Clear out any stale table left over from a previous run
        self.cur.execute("DROP TABLE IF EXISTS {0}".format(destination))
        print_and_execute("CREATE TEMPORARY TABLE {0} AS {1}".format(destination,
                                                                     self.fil.sql(source)),
                          self.cur)
        for column in ('userid', 'serverid'):
            print_and_execute("ALTER TABLE {0} ADD INDEX ({1})".format(destination, column),
                              self.cur)

        self.last_used_fil = self.fil
def reduce_log(tablename, cur):
    """
    Reduce general_log.<tablename> into a same-named table in reduced_log.

    Selects every 'Execute'/'Query' event from the general log table,
    cleans and normalizes each query, assigns numeric ids to any new
    users/servers, classifies the query type, and bulk-loads the rows
    via a tab-separated temp file.

    @tablename - name of the source table in general_log. It is
                 interpolated directly into SQL, so it must come from a
                 trusted source (e.g. SHOW TABLES), never user input.
    @cur       - open MySQL cursor; the module-level @db is committed at
                 the end.
    """

    print >>sys.stderr, "Reducing general_log.{0} and storing into reduced_log".format(tablename)
    print >>sys.stderr, "Selecting results..."

    # Load the existing user/server id maps so new names continue the sequence
    cur.execute("USE reduced_log")
    print_and_execute("SELECT user, userid FROM users", cur)
    users = dict(cur.fetchall())
    usernum = max(users.values()) + 1 if users else 0 # first open usernum
    newusers = []

    print_and_execute("SELECT server, serverid FROM servers", cur)
    servers = dict(cur.fetchall())
    servernum = max(servers.values()) + 1 if servers else 0 # first open servernum
    newservers = []

    cur.execute('USE general_log')
    print_and_execute("""SELECT * FROM {0} WHERE command_type IN ('Execute', 'Query')""".format(tablename), cur)

    print >>sys.stderr, "Selected results, cleaning queries and writing temp file..."

    temp_filename = '{0}_reduced.tmp'.format(tablename)
    # 'with' guarantees the temp file is closed even if cleaning raises
    with open(temp_filename, 'w') as outfile:
        for event_time, user_host, thread_id, server_id, command_type, query in cur:

            cleaned_query = clean(query, reserved_words)
            # Clean the query some more: remove numlists, replace constants
            cleaned_query = numlist_re.sub(numlist_sub_fcn, cleaned_query)
            try:
                user, server, cleaned_query = reducer.accept(user_host, cleaned_query)
            except TypeError:
                # reducer.accept returned a non-unpackable value for a
                # row it rejects; skip the row
                continue
            cleaned_query, vals = repl_constants(cleaned_query)
            vals = ' ~ '.join(vals)

            # Assign the next free id to any user/server seen for the first time
            if user not in users:
                users[user] = usernum
                newusers.append(user)
                usernum += 1
            if server not in servers:
                servers[server] = servernum
                newservers.append(server)
                servernum += 1

            # Classify the query by its leading keyword(s)
            if cleaned_query.startswith('INSERT INTO'):
                query_type = 'INSERT'
                if values_re.search(cleaned_query):
                    #TODO: count number of rows inserted. Nontrivial because of parens, commas, quotes, etc.
                    cleaned_query = insert_re.match(cleaned_query).group(0) + ' <values>'
                    vals = ''
            elif cleaned_query.startswith('SELECT'):
                query_type = 'SELECT'
            elif cleaned_query.startswith('CREATE TABLE'):
                # Replacing schemas with length + hash doesn't help much.
                # There aren't many create table statements (~1%)
                query_type = 'CREATE_TABLE'
            elif cleaned_query.startswith('SET'):
                query_type = 'SET'
            elif cleaned_query.startswith('LOAD DATA'):
                query_type = 'LOAD'
            elif cleaned_query.startswith('ALTER'):
                query_type = 'ALTER'
            else:
                query_type = 'OTHER'

            #we ignore server_id because it's always 0...
            cleaned_query = repr(cleaned_query)[1:-1] #deal with \n and others
            final = event_time, users[user], servers[server], thread_id, query_type, cleaned_query, vals
            print >>outfile, '\t'.join(str(s) for s in final)

    print >>sys.stderr, "Wrote temp file, loading data into {0} table...".format(tablename)

    cur.execute("USE reduced_log")
    cur.execute("""CREATE TABLE {0} (event_time DATETIME,
                                     userid INT,
                                     serverid INT,
                                     thread_id INT(11),
                                     query_type ENUM{1},
                                     query MEDIUMTEXT,
                                     vals MEDIUMTEXT,
                                     INDEX (userid),
                                     INDEX (serverid),
                                     INDEX (event_time)
                                    )""".format(tablename, querytypes))

    try:
        cur.execute("LOAD DATA LOCAL INFILE '{0}' INTO TABLE {1}".format(temp_filename, tablename))
    finally:
        # Don't leave the temp file behind even if the load fails
        os.remove(temp_filename)

    print >>sys.stderr, "Loaded data and removed temp file. Adding into users table..."

    # Parameterized inserts: user/server names may contain quotes, which
    # would break (or inject into) a string-formatted statement
    for user in newusers:
        cur.execute("INSERT INTO users VALUES (%s, %s)", (user, users[user]))

    print >>sys.stderr, "Added into users table, adding into servers table..."

    for server in newservers:
        cur.execute("INSERT INTO servers VALUES (%s, %s)", (server, servers[server]))

    db.commit()

    print >>sys.stderr, "Added into servers table. Defining time functions..."

    # This redefines the time fcns for every table reduced, but that's a small cost
    define_time_functions(cur)

    print >>sys.stderr, "Defined time functions. Reduction complete"
def query_profile(tablename, numtop, period, cur):
    """
    Generate profiles of the queries in @tablename
    """

    # We are assuming the db already has the time functions defined.
    # This is one of the actions in the create reduced log table
    # define_time_functions(cur)

    print_and_execute("""SELECT user, time, query_type, count
                         FROM (SELECT userid, my_{1}(event_time) AS time,
                                      query_type, count(*) AS count
                               FROM {0}
                               GROUP BY userid, time, query_type
                              ) AS sth
                            NATURAL JOIN users
                      """.format(tablename, period), cur)

    peruser_divided = defaultdict(dict)
    peruser_alltime = dict()
    full_divided = dict()
    full_alltime = defaultdict(int)

    for user, time, query_type, count in cur.fetchall():
        # print user, time, query_type, count

        if time not in peruser_divided[user]:
            peruser_divided[user][time] = defaultdict(int)
        peruser_divided[user][time][query_type] = count
        
        if user not in peruser_alltime:
            peruser_alltime[user] = defaultdict(int)
        peruser_alltime[user][query_type] += count
        
        if time not in full_divided:
            full_divided[time] = defaultdict(int)
        full_divided[time][query_type] += count

        full_alltime[query_type] += count
    
    #sort them by time and make into (ordered) list of tuples
    full_divided = sorted([(k, v) for (k, v) in full_divided.iteritems()], key=itemgetter(0))
    for user in peruser_divided.keys():
        peruser_divided[user] = sorted([(k, v) for (k, v) in peruser_divided[user].iteritems()],
                                       key=itemgetter(0))

    print_and_execute("""SELECT user, query, vals FROM {0} NATURAL JOIN users
                      """.format(tablename), cur)

    full_topqueries = defaultdict(dict)
    peruser_topqueries = defaultdict(dict)
    for user, query, vals in cur.fetchall():
        full_query = full_topqueries[query]
        if vals not in full_query:
            full_query[vals] = 0
        full_query[vals] += 1

        peruser_user = peruser_topqueries[user]
        if query not in peruser_user:
            peruser_user[query] = {}
        if vals not in peruser_user[query]:
            peruser_user[query][vals] = 0
        peruser_user[query][vals] += 1

    print "stored queries"

    for user in peruser_topqueries: #takes forever
        peruser_topqueries[user] = map(lambda x: (x[0], sorted(x[1].iteritems(),
                                                               key=itemgetter(1),
                                                               reverse=True)),
                                       sorted(peruser_topqueries[user].iteritems(),
                                              key=lambda x: sum(ct for val, ct in x[1].iteritems()),
                                              reverse=True)
                                   )[:numtop]
    print "sorted each user"
    #takes no time:
    peruser_topqueries = sorted(peruser_topqueries.iteritems(),
                                key = lambda x: sum( sum(ct for val, ct in valcts) 
                                                    for query, valcts in x[1] ),
                                reverse = True)

    print "sorted peruser_topqueries"
    #takes a long time:
    full_topqueries = map(lambda x: (x[0], sorted(x[1].iteritems(), key=itemgetter(1), reverse=True)),
                          sorted([(q, c) for (q, c) in full_topqueries.iteritems()],
                                 key = lambda x: sum(count for vals, count in x[1].iteritems()),
                                 reverse=True))[:numtop]

    print "sorted full_topqueries"
    return peruser_divided, peruser_alltime, full_divided, full_alltime, full_topqueries, peruser_topqueries
Example #6
0
    def run(self, target_db, source_db = 'general_log'):

        sort_schema, sql_sort_by = self._sort_schema()
        sql_where = ( "WHERE " + " AND ".join('(' + sel + ')'
                                              for sel in self.selectors) ) \
                                                  if self.selectors else ''

        final_selector = itemgetter(*[colname for colname, coltype
                                      in self.outputs])

        conn = mysql.connect(**{"host": "localhost",
                                "user": "******",
                                "passwd": "",
                                "unix_socket": "/u1/vbar/mysql/thesock",
                                "cursorclass": DictCursor})
        cur = conn.cursor()
            
        for table in self._tables_to_reduce(target_db, source_db):
            cur.execute("USE {0}".format(source_db))
            sql_full = "SELECT * FROM {0} {1} {2}".format(table,
                                                          sql_where,
                                                          sql_sort_by)

            print_and_execute(sql_full, cur)

            rows = filter(self._all_prefilters_pass, cur.fetchall())

            # TODO: can avoid using DictCursor by changing the above line

            for group in sort_schema:
                newrows = []
                if group[0].sort_column:
                    rows.sort(key=itemgetter(group[0].sort_column),
                              reverse=group[0].sort_reverse)
                for row in rows:
                    try:
                        for processor in group:
                            row.update(zip(processor.outputs,
                                           processor.process(row)))
                        newrows.append(row)
                    except SkipRowException:
                        continue
                print "Done with one group of processors"
                rows = newrows

            print "All done processing, writing temp file and loading into table"
            temp_filename = '{0}.tmp'.format(table)
            with open(temp_filename, 'w') as outfile:
                print >>outfile, '\n'.join('\t'.join(str(x) for x in
                                                     final_selector(row))
                                           for row in rows)

            cur.execute("USE {0}".format(target_db))
            cur.execute("CREATE TABLE {0} (".format(table) + \
                        ',\n'.join("{0} {1}".format(col, typ)
                                   for col, typ in self.outputs) + \
                        ")")
            cur.execute("LOAD DATA LOCAL INFILE '{0}' INTO TABLE {1}"
                        .format(temp_filename, table))
            
            os.remove(temp_filename)

        conn.commit()
        cur.close()
        conn.close()
Example #7
0
    def create_checkbox_lists(self, initial=False):
        """
        Removes the current user and server checkbox panels from the window,
        if they exist (if they don't, they will be None, from __init__())
        creates new ones with data from the current table, then adds them
        to the window again. If @initial is True, only the last partition
        (month) of the 'unified' table will be used for counts, but all
        users and servers will still be shown

        @initial - if this is the first time the checkbox lists are being
                   generated, it works a bit differently: the counts are
                   taken from just the last partition of the unified table
                   so that startup doesn't take forever. We then also need
                   to grab the names of other users who didn't appear in
                   this first partition
        """

        # Remove current user and server checkbox panels from the window
        if self.user_panel:
            self.window.remove(self.user_panel)
        if self.server_panel:
            self.window.remove(self.server_panel)

        # Create user filter checkboxes
        if initial:
            # Ask the catalog which partitions unified currently has
            print_and_execute("""SELECT PARTITION_NAME
                                 FROM INFORMATION_SCHEMA.PARTITIONS
                                 WHERE TABLE_SCHEMA = 'reduced_log'
                                       AND TABLE_NAME = 'unified'""", self.cur)
            # there will be each month, then the 'other' partition, so we want to
            # select from the 2nd to last partition
            last_partition = [x for x, in self.cur.fetchall()][-2]
            table_to_use = "unified PARTITION({0})".format(last_partition)
        else:
            table_to_use = self.current_table_name()

        # Per-user row counts in the chosen table, busiest users first
        print_and_execute("""SELECT userid, user, count
                             FROM (SELECT userid, COUNT(*) AS count
                                   FROM {0} GROUP BY userid
                                  ) AS sth
                               NATURAL JOIN users
                             ORDER BY count DESC
                          """.format(table_to_use), self.cur)
        userlist = [x for x in self.cur.fetchall()]

        # Lay the checkboxes out in a single column, one per user
        x_pos = 0
        y_pos = 0
        y_spacing = 20
        self.user_checkboxes = {}
        size_width = 220
        extent_width = size_width  # widened below if any label overflows
        for userid, user, count in userlist:
            # replace('_', '_ ') presumably allows long names to wrap/break
            # at underscores — TODO confirm against the widget's behavior
            self.user_checkboxes[user] = CheckBox("{0} ({1})".format(user.replace('_', '_ '),
                                                                     count),
                                                  position = (x_pos, y_pos),
                                                  value = True)
            extent_width = max(self.user_checkboxes[user].size[0], extent_width)
            y_pos += y_spacing

        # Get users that didn't appear in the last partition, if @initial
        if initial:
            self.cur.execute("SELECT user, userid FROM users")
            for user, userid in self.cur.fetchall():
                if user in self.user_checkboxes:
                    continue
                # Users absent from the last partition are shown with count 0
                self.user_checkboxes[user] = CheckBox(user.replace('_', '_ ') + " (0)",
                                                      position = (x_pos, y_pos),
                                                      value = True)
                extent_width = max(self.user_checkboxes[user].size[0], extent_width)
                y_pos += y_spacing

        # Add the user checkboxes to a ScrollableView:
        self.user_panel = ScrollableView(size = (size_width, 150),
                                         extent = (extent_width,
                                                   max(150, y_pos)),
                                         scrolling = 'v' if extent_width <= size_width else 'hv')
        for cbox in self.user_checkboxes.values():
            self.user_panel.add(cbox)
        # Add the panel to the window
        # NOTE(review): `top` and `horiz_sp` appear to be module-level layout
        # constants, and panel + offset looks like the toolkit's relative
        # placement syntax — confirm against the GUI library's docs
        self.window.place(self.user_panel, top = top,
                          left = self.query_type_panel + horiz_sp)

        # Create server filter checkboxes
        # Same counting query as for users, but grouped by serverid
        print_and_execute("""SELECT serverid, server, count
                             FROM (SELECT serverid, COUNT(*) AS count
                                   FROM {0} GROUP BY serverid
                                  ) as sth
                               NATURAL JOIN servers
                             ORDER BY count DESC
                         """.format(table_to_use), self.cur)
        serverlist = [x for x in self.cur.fetchall()]

        # Single-column layout again, with a wider panel for server names
        x_pos = 0
        y_pos = 0
        y_spacing = 20
        self.server_checkboxes = {}
        size_width = 300
        extent_width = size_width
        for serverid, server, count in serverlist:
            self.server_checkboxes[server] = CheckBox("{0} ({1})".format(server,
                                                                         count),
                                                      position = (x_pos, y_pos),
                                                      value = True)
            extent_width = max(self.server_checkboxes[server].size[0], extent_width)
            y_pos += y_spacing

        # Servers absent from the last partition get zero-count checkboxes
        if initial:
            self.cur.execute("SELECT server FROM servers")
            for server, in self.cur.fetchall():
                if server in self.server_checkboxes:
                    continue
                self.server_checkboxes[server] = CheckBox(server.replace('_', '_ ') + " (0)",
                                                          position = (x_pos, y_pos),
                                                          value = True)
                extent_width = max(self.server_checkboxes[server].size[0], extent_width)
                y_pos += y_spacing

        # Add the server checkboxes to a ScrollableView
        self.server_panel = ScrollableView(size = (size_width, 150),
                                           extent = (extent_width,
                                                     max(150, y_pos)),
                                           scrolling = 'v' if extent_width <= size_width else 'hv')
        for cbox in self.server_checkboxes.values():
            self.server_panel.add(cbox)
        # Add the server panel to the window
        self.window.place(self.server_panel, top = top,
                          left=self.user_panel + 10)
Example #8
0
    def run(self, target_db, source_db='general_log'):

        sort_schema, sql_sort_by = self._sort_schema()
        sql_where = ( "WHERE " + " AND ".join('(' + sel + ')'
                                              for sel in self.selectors) ) \
                                                  if self.selectors else ''

        final_selector = itemgetter(
            *[colname for colname, coltype in self.outputs])

        conn = mysql.connect(
            **{
                "host": "localhost",
                "user": "******",
                "passwd": "",
                "unix_socket": "/u1/vbar/mysql/thesock",
                "cursorclass": DictCursor
            })
        cur = conn.cursor()

        for table in self._tables_to_reduce(target_db, source_db):
            cur.execute("USE {0}".format(source_db))
            sql_full = "SELECT * FROM {0} {1} {2}".format(
                table, sql_where, sql_sort_by)

            print_and_execute(sql_full, cur)

            rows = filter(self._all_prefilters_pass, cur.fetchall())

            # TODO: can avoid using DictCursor by changing the above line

            for group in sort_schema:
                newrows = []
                if group[0].sort_column:
                    rows.sort(key=itemgetter(group[0].sort_column),
                              reverse=group[0].sort_reverse)
                for row in rows:
                    try:
                        for processor in group:
                            row.update(
                                zip(processor.outputs, processor.process(row)))
                        newrows.append(row)
                    except SkipRowException:
                        continue
                print "Done with one group of processors"
                rows = newrows

            print "All done processing, writing temp file and loading into table"
            temp_filename = '{0}.tmp'.format(table)
            with open(temp_filename, 'w') as outfile:
                print >> outfile, '\n'.join('\t'.join(
                    str(x) for x in final_selector(row)) for row in rows)

            cur.execute("USE {0}".format(target_db))
            cur.execute("CREATE TABLE {0} (".format(table) + \
                        ',\n'.join("{0} {1}".format(col, typ)
                                   for col, typ in self.outputs) + \
                        ")")
            cur.execute("LOAD DATA LOCAL INFILE '{0}' INTO TABLE {1}".format(
                temp_filename, table))

            os.remove(temp_filename)

        conn.commit()
        cur.close()
        conn.close()