class RandomSubset(object):
    """Create a table that is a random subset of another table's rows."""

    def __init__(self):
        # Project-level DB / logging helpers.
        self.mysql = MySQLConnector()
        self.error = ErrorHandling()
        self.execute_one_col = ExecuteGetListOneColumn()
        self.execute = Execute()

    def random_subset_table(self, db, source_table_name, destination_table_name,
                            field, percent=.10, distinct=True):
        """
        Select a random sample of records from the source table into a
        freshly (re)created destination table.

        :param db: database name
        :param source_table_name: table to sample from
        :param destination_table_name: table to create (dropped first if it exists)
        :param field: key column on which rows are sampled
        :param percent: fraction of keys to keep (0.10 == 10 percent)
        :param distinct: if True, sample over DISTINCT values of `field`
        """
        # FIX: the original passed the raw fraction (e.g. 0.10) into a message
        # that says "percent"; scale by 100 to match the message text (the
        # 'grabbing a subset' message below already did this).
        self.error.warn("making TABLE %s, a %2.2f percent random subset of TABLE %s on unique key %s..." % (
            destination_table_name, 100 * percent, source_table_name, field))
        self.error.warn("connecting to DATABASE %s..." % db)
        (db_conn, db_cursor, dict_cursor) = self.mysql.db_connect(db)

        self.error.warn("removing destination table if it exists...")
        # NOTE(review): table and field names are interpolated directly into
        # SQL -- callers must never pass untrusted identifiers here.
        sql = 'DROP TABLE IF EXISTS %s' % destination_table_name
        self.execute.execute_(db, db_cursor, sql, True)

        self.error.warn("cloning structure of table...")
        sql = 'CREATE TABLE %s LIKE %s' % (destination_table_name, source_table_name)
        self.execute.execute_(db, db_cursor, sql, True)

        is_distinct_text = ' distinct' if distinct else ''
        self.error.warn('grabbing a%s subset (%2.6f percent) of the keys on which to base the new table' % (
            is_distinct_text, 100 * percent))
        sql = 'SELECT DISTINCT(%s) FROM %s' % (field, source_table_name) if distinct \
            else 'SELECT %s FROM %s' % (field, source_table_name)
        unique_key_list = self.execute_one_col.execute_get_list_one_col(db, db_cursor, sql, True)
        self.error.warn(str(unique_key_list[1:5]))  # debug preview of a few keys

        new_keys = sample(unique_key_list, int(floor(len(unique_key_list) * percent)))
        # List comprehension instead of map() so the join below works on
        # Python 3 as well (map returns an iterator there).
        new_keys = [str(k) for k in new_keys]

        # FIX: guard against an empty sample -- "... IN ()" is a MySQL syntax error.
        if new_keys:
            self.error.warn("populating newly created table, TABLE %s" % destination_table_name)
            populate_query = "INSERT INTO %s SELECT * FROM %s WHERE %s IN (%s)" % (
                destination_table_name, source_table_name, field, ','.join(new_keys))
            self.execute.execute_(db, db_cursor, populate_query, False)
        else:
            self.error.warn("sample is empty; skipping INSERT into TABLE %s" % destination_table_name)

        self.error.warn("finished making TABLE %s, a %2.2f percent random subset of TABLE %s on unique key %s!" % (
            destination_table_name, 100 * percent, source_table_name, field))
class QuickExecuteGetListOneCol(object): def __init__(self): self.mysql = MySQLConnector() self.execute = ExecuteGetListOneColumn() def quick_execute_get_list_one_col(self, db, sql, warn_query=False): """ performs the db connect and execute in the same call, equivalent to execute_get_list1 :param db: :param sql: :param warn_query: """ (db_conn, db_cursor, dict_cursor) = self.mysql.db_connect(db) return self.execute.execute_get_list_one_col(db, db_cursor, sql, warn_query)
def __init__(self): self.mysql = MySQLConnector() self.error = ErrorHandling() self.execute_one_col = ExecuteGetListOneColumn() self.execute = Execute()
def __init__(self): self.execute_one = ExecuteGetListOneColumn() self.execute = ExecuteGetList()
class DensifyTable(object):
    """Convert a long (row, column, value) MySQL table into a dense CSV matrix."""

    def __init__(self):
        # Project query helpers: one-column lists and full row lists.
        self.execute_one = ExecuteGetListOneColumn()
        self.execute = ExecuteGetList()

    def table_to_dense_csv(self, db, table, row_column, col_column, value_column,
                           output_csv_filename=None, compress_csv=True):
        """
        Take a mysql table to convert a long table (e.g. feature table) to a dense
        2x2 contingency matrix (size N by M where N is the number of distinct rows
        and M is the number of distinct columns). Cells with no entry in the source
        table are left empty in the CSV.

        Arguments:
        - `db`: database to pull from
        - `table`: table to pull from
        - `row_column`: `table` column that will populate the rows of the contingency csv
        - `col_column`: `table` column that will populate the columns of the contingency csv
        - `value_column`: `table` column that will populate the values at the
          intersection of the rows and columns of the contingency csv
        - `output_csv_filename`: the name of the output file -- if empty is created
          based on the values provided
        - `compress_csv`: whether to gzip the csv (currently a no-op, see below)
        """
        if not output_csv_filename:
            output_csv_filename = 'dense.{db}.{table}.{row}-by-{col}.{value}.csv'.format(
                db=db, table=table, row=row_column, col=col_column, value=value_column)

        sorted_row_values = list(self.execute_one.execute_get_list_one_col(
            db, 'SELECT DISTINCT {row} FROM {table} ORDER BY {row}'.format(
                row=row_column, table=table)))
        sorted_col_values = list(self.execute_one.execute_get_list_one_col(
            db, 'SELECT DISTINCT {col} FROM {table} ORDER BY {col}'.format(
                col=col_column, table=table)))
        sorted_values = self.execute.execute_get_list(
            db, 'SELECT {row}, {col}, {value} FROM {table} ORDER BY {row}, {col}'.format(
                row=row_column, col=col_column, value=value_column, table=table))

        N = len(sorted_row_values)
        M = len(sorted_col_values)
        # FIX: precompute column -> index once; the original called
        # sorted_col_values.index() (O(M)) for every entry of the table.
        col_index = {col: i for i, col in enumerate(sorted_col_values)}

        # FIX: 'w' instead of py2-only 'wb' so csv.writer works on Python 3 too.
        with open(output_csv_filename, 'w') as output_csv:
            csv_writer = csv.writer(output_csv)
            csv_writer.writerow([row_column] + sorted_col_values)
            # FIX: guard against an empty source table; the original crashed
            # on sorted_values[0][0].
            if sorted_values:
                current_row = sorted_values[0][0]
                current_column_data = [None] * M
                num_row_writes = 0
                for tablerow, tablecol, tablevalue in sorted_values:
                    # if a new row, write our current column data
                    # and reset local variables
                    if current_row != tablerow:
                        print("adding %s" % str(current_row))
                        csv_writer.writerow([current_row] + current_column_data)
                        current_column_data = [None] * M
                        num_row_writes += 1
                        current_row = tablerow
                        print('{n} out of {N} rows complete'.format(n=num_row_writes, N=N))
                    current_column_data[col_index[tablecol]] = tablevalue
                # flush the final (still-buffered) row
                csv_writer.writerow([current_row] + current_column_data)
                print("adding %s" % str(current_row))
                print("wrote %d features over %d rows" % (N, num_row_writes))
        if compress_csv:
            # gzip was disabled in the original; re-enable when desired:
            # os.system("gzip {output_filename}".format(output_filename=output_csv_filename))
            pass

    def main(self):
        """CLI entry point: parse arguments and write the dense CSV. Returns 0."""
        parser = argparse.ArgumentParser(
            description='Create a dense csv given a db, table, and three columns.')
        parser.add_argument('-d', '--db', dest='db', type=str, default='', help='database')
        parser.add_argument('-t', '--table', dest='table', type=str, help='the table to make dense')
        parser.add_argument('-r', '--row', dest='row', type=str, help='the row to use from table')
        parser.add_argument('-c', '--col', dest='col', type=str, help='the col to use from table')
        parser.add_argument('-v', '--value', dest='value', type=str, help='the value to use from table')
        parser.add_argument('-f', '--csv_filename', dest='csvfilename', type=str, default='',
                            help='optional, the output filename.')
        args = parser.parse_args()
        if not args.db or not args.table or not args.row or not args.col or not args.value:
            parser.error('Must specify db, table, row, col, and value to execute.')
        self.table_to_dense_csv(args.db, args.table, args.row, args.col, args.value, args.csvfilename)
        # this is an example use case
        # table_to_dense_csv('twitterGH', 'z_featureExport', 'group_id', 'feat', 'group_norm', 'test.csv')
        return 0


if __name__ == '__main__':
    # FIX: `main` is a method of DensifyTable, not a module-level function;
    # the original `sys.exit(main())` raised NameError.
    sys.exit(DensifyTable().main())
def __init__(self): self.mysql = MySQLConnector() self.execute = ExecuteGetListOneColumn()