コード例 #1
0
class RandomSubset(object):
    def __init__(self):
        self.mysql = MySQLConnector()
        self.error = ErrorHandling()
        self.execute_one_col = ExecuteGetListOneColumn()
        self.execute = Execute()

    def random_subset_table(self, db, source_table_name, destination_table_name, field, percent=.10,
                            distinct=True):
        """

        This method selects a random sample of records from source table to destination table
    
        :param db: 
        :param source_table_name: 
        :param destination_table_name: 
        :param field: 
        :param percent: 
        :param distinct: 
        """
        self.error.warn("making TABLE %s, a %2.2f percent random subset of TABLE %s on unique key %s..." % (
            destination_table_name, percent, source_table_name, field))

        self.error.warn("connecting to DATABASE %s..." % db)
        (db_conn, db_cursor, dict_cursor) = self.mysql.db_connect(db)

        self.error.warn("removing destination table if it exists...")
        sql = 'DROP TABLE IF EXISTS %s' % destination_table_name
        self.execute.execute_(db, db_cursor, sql, True)

        self.error.warn("cloning structure of table...")
        sql = 'CREATE TABLE %s LIKE %s' % (destination_table_name, source_table_name)
        self.execute.execute_(db, db_cursor, sql, True)

        is_distinct_text = ' distinct' if distinct else ''
        self.error.warn('grabbing a%s subset (%2.6f percent) of the keys on which to base the new table' % (
            is_distinct_text, 100 * percent))
        sql = 'SELECT DISTINCT(%s) FROM %s' % (field, source_table_name) if distinct else 'SELECT %s FROM %s' % (
            field, source_table_name)
        unique_key_list = self.execute_one_col.execute_get_list_one_col(db, db_cursor, sql, True)

        self.error.warn(str(unique_key_list[1:5]))

        new_keys = sample(unique_key_list, int(floor(len(unique_key_list) * percent)))
        new_keys = map(str, new_keys)

        self.error.warn("populating newly created table, TABLE %s" % destination_table_name)
        populate_query = "INSERT INTO %s SELECT * FROM %s WHERE %s IN (%s)" % (
            destination_table_name, source_table_name, field, ','.join(new_keys))
        self.execute.execute_(db, db_cursor, populate_query, False)

        self.error.warn("finished making TABLE %s, a %2.2f percent random subset of TABLE %s on unique key %s!" % (
            destination_table_name, percent, source_table_name, field))
コード例 #2
0
class QuickExecuteGetListOneCol(object):
    def __init__(self):
        self.mysql = MySQLConnector()
        self.execute = ExecuteGetListOneColumn()

    def quick_execute_get_list_one_col(self, db, sql, warn_query=False):
        """
        performs the db connect and execute in the same call, equivalent to execute_get_list1
        :param db:
        :param sql:
        :param warn_query:
        """
        (db_conn, db_cursor, dict_cursor) = self.mysql.db_connect(db)
        return self.execute.execute_get_list_one_col(db, db_cursor, sql, warn_query)
コード例 #3
0
 def __init__(self):
     self.mysql = MySQLConnector()
     self.error = ErrorHandling()
     self.execute_one_col = ExecuteGetListOneColumn()
     self.execute = Execute()
コード例 #4
0
 def __init__(self):
     self.execute_one = ExecuteGetListOneColumn()
     self.execute = ExecuteGetList()
コード例 #5
0
class DensifyTable(object):
    def __init__(self):
        self.execute_one = ExecuteGetListOneColumn()
        self.execute = ExecuteGetList()

    def table_to_dense_csv(self, db, table, row_column, col_column, value_column, output_csv_filename=None,
                           compress_csv=True):
        """
        Take a mysql table to convert a long table (e.g. feature table) to a dense 2x2 contingency matrix 
        (size N by M where N is the number of distinct rows and M is the number of distinct columns). 
        Efficient (uses lookups instead of a single iteration through all entries of the contingency matrix -- 
        could be more slightly more efficient if it used the dbCursor pointer).
        
        Arguments:
        - `db`: database to pull from
        - `table`: table to pull from
        - `row_column`: `table` column that will populate the rows of the contingency csv
        - `col_column`: `table` column that will populate the columns of the contingency csv
        - `value_column`: `table` column that will populate the values at the intersection of the rows and columns 
            of the contingency csv
        - `output_csv_filename`: the name of the output file -- if empty is created based on the values provided
        - `compress_csv`: whether to gzip the csv
        
        """

        if not output_csv_filename:
            output_csv_filename = 'dense.{db}.{table}.{row}-by-{col}.{value}.csv'.format(db=db, table=table,
                                                                                         row=row_column, col=col_column,
                                                                                         value=value_column)

        sorted_row_values = list(self.execute_one.execute_get_list_one_col(db, 'SELECT DISTINCT {row} FROM {table} '
                                                                               'ORDER BY {row}'.format(row=row_column,
                                                                                                       table=table)))
        sorted_col_values = list(
            self.execute_one.execute_get_list_one_col(db, 'SELECT DISTINCT {col} FROM {table} ORDER BY {col}'.format(
                col=col_column, table=table)))

        sorted_values = self.execute.execute_get_list(db,
                                                      'SELECT {row}, {col}, {value} FROM {table} ORDER BY {row}, {col}'.format(
                                                          row=row_column, col=col_column, value=value_column,
                                                          table=table))

        N = len(sorted_row_values)
        M = len(sorted_col_values)

        with open(output_csv_filename, 'wb') as output_csv:
            csv_writer = csv.writer(output_csv)
            csv_writer.writerow([row_column] + sorted_col_values)

            current_row = sorted_values[0][0]
            current_column_data = [None] * M
            num_row_writes = 0

            for tablerow, tablecol, tablevalue in sorted_values:
                # if a new row, write our current column data
                # and reset local variables
                if current_row != tablerow:
                    print "adding %s" % str(current_row)
                    csv_writer.writerow([current_row] + current_column_data)
                    current_column_data = [None] * M
                    num_row_writes += 1
                    current_row = tablerow
                    print '{n} out of {N} rows complete'.format(n=num_row_writes, N=N)
                column_index = sorted_col_values.index(tablecol)
                current_column_data[column_index] = tablevalue

            csv_writer.writerow([current_row] + current_column_data)
            print "adding %s" % str(current_row)
            print "wrote %d features over %d rows" % (N, num_row_writes)

        if compress_csv:
            # os.system("gzip {output_filename}".format(output_filename=output_csv_filename)
            pass

    def main(self):
        parser = argparse.ArgumentParser(description='Create a dense csv given a db, table, and three columns.')
        parser.add_argument('-d', '--db', dest='db', type=str, default='',
                            help='database')
        parser.add_argument('-t', '--table', dest='table', type=str,
                            help='the table to make dense')
        parser.add_argument('-r', '--row', dest='row', type=str,
                            help='the row to use from table')
        parser.add_argument('-c', '--col', dest='col', type=str,
                            help='the col to use from table')
        parser.add_argument('-v', '--value', dest='value', type=str,
                            help='the value to use from table')
        parser.add_argument('-f', '--csv_filename', dest='csvfilename', type=str, default='',
                            help='optional, the output filename.')
        args = parser.parse_args()

        if not args.db or not args.table or not args.row or not args.col or not args.value:
            parser.error('Must specify db, table, row, col, and value to execute.')

        self.table_to_dense_csv(args.db, args.table, args.row, args.col, args.value, args.csvfilename)

        # this is an example use case
        # table_to_dense_csv('twitterGH', 'z_featureExport', 'group_id', 'feat', 'group_norm', 'test.csv')

        return 0

    if __name__ == '__main__':
        sys.exit(main())
コード例 #6
0
 def __init__(self):
     self.mysql = MySQLConnector()
     self.execute = ExecuteGetListOneColumn()