def get_data(self, q=None, nrow=0, table=True, **kwargs):
    '''
    Note: q can be a table name.
    Ref: http://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#io-sql

    Options: **kwargs can include the pandas options
        parse_dates=['Date'] - explicitly force columns to be parsed as dates
        parse_dates={'Date': '%Y-%m-%d'}
        index_col='id' - the name of the column to use as the DataFrame index
        columns=['Col_1', 'Col_2'] - specify a subset of columns to be read
    '''
    if q is None:
        if self.cursorisopen:
            # assign the result so the return below does not raise UnboundLocalError
            res = self.get_data_from_cursor(nrow=nrow)
        else:
            print('WARNING: Cursor is not open and no query is passed.')
            res = []
    else:
        # get data by doing a database query
        if table:
            res = psql.read_sql(q, self.engine, **kwargs)
        else:
            res = psql.read_sql_table(q, self.engine, **kwargs)
        #cur = self.get_cursor(q)
        #res = self.get_data_from_cursor(cur=cur, nrow=nrow)
        #cur.close()
    return res
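# A hypothetical call of get_data, assuming an instance of the surrounding
# class (here named `db`) with a live SQLAlchemy engine; the keyword
# arguments are exactly the pandas read_sql options listed in the docstring.
# The table and column names are made up for illustration.
df = db.get_data(
    "SELECT id, Date, Col_1, Col_2 FROM some_table",
    parse_dates={'Date': '%Y-%m-%d'},  # parse Date with an explicit format
    index_col='id',                    # use id as the DataFrame index
)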
def test_default_date_load(self):
    df = sql.read_sql_table("types_test_data", self.conn)

    # IMPORTANT - sqlite has no native date type, so shouldn't parse, but
    # MySQL SHOULD be converted.
    self.assertTrue(issubclass(df.DateCol.dtype.type, np.datetime64),
                    "DateCol loaded with incorrect type")
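# When the driver has no native date type (e.g. sqlite), parsing can still
# be forced explicitly with parse_dates. A minimal sketch, assuming the same
# "types_test_data" table as in the tests and a connection named conn.
import pandas.io.sql as sql

df = sql.read_sql_table("types_test_data", conn, parse_dates=['DateCol'])
assert df.DateCol.dtype.kind == 'M'  # datetime64 after explicit parsing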
def test_read_table_columns(self):
    # test columns argument in read_table
    sql.to_sql(self.test_frame1, 'test_frame', self.conn)

    cols = ['A', 'B']
    result = sql.read_sql_table('test_frame', self.conn, columns=cols)
    self.assertEqual(result.columns.tolist(), cols,
                     "Columns not correctly selected")
def test_date_parsing(self):
    # No Parsing
    df = sql.read_sql_table("types_test_data", self.conn)

    df = sql.read_sql_table("types_test_data", self.conn,
                            parse_dates=['DateCol'])
    self.assertTrue(issubclass(df.DateCol.dtype.type, np.datetime64),
                    "DateCol loaded with incorrect type")

    df = sql.read_sql_table("types_test_data", self.conn,
                            parse_dates={'DateCol': '%Y-%m-%d %H:%M:%S'})
    self.assertTrue(issubclass(df.DateCol.dtype.type, np.datetime64),
                    "DateCol loaded with incorrect type")

    df = sql.read_sql_table("types_test_data", self.conn,
                            parse_dates={'DateCol': {'format': '%Y-%m-%d %H:%M:%S'}})
    self.assertTrue(issubclass(df.DateCol.dtype.type, np.datetime64),
                    "DateCol loaded with incorrect type")

    df = sql.read_sql_table("types_test_data", self.conn,
                            parse_dates=['IntDateCol'])
    self.assertTrue(issubclass(df.IntDateCol.dtype.type, np.datetime64),
                    "IntDateCol loaded with incorrect type")

    df = sql.read_sql_table("types_test_data", self.conn,
                            parse_dates={'IntDateCol': 's'})
    self.assertTrue(issubclass(df.IntDateCol.dtype.type, np.datetime64),
                    "IntDateCol loaded with incorrect type")

    df = sql.read_sql_table("types_test_data", self.conn,
                            parse_dates={'IntDateCol': {'unit': 's'}})
    self.assertTrue(issubclass(df.IntDateCol.dtype.type, np.datetime64),
                    "IntDateCol loaded with incorrect type")
def test_read_table_index_col(self):
    # test index_col argument in read_table
    sql.to_sql(self.test_frame1, 'test_frame', self.conn)

    result = sql.read_sql_table('test_frame', self.conn, index_col="index")
    self.assertEqual(result.index.names, ["index"],
                     "index_col not correctly set")

    result = sql.read_sql_table('test_frame', self.conn,
                                index_col=["A", "B"])
    self.assertEqual(result.index.names, ["A", "B"],
                     "index_col not correctly set")

    result = sql.read_sql_table('test_frame', self.conn,
                                index_col=["A", "B"], columns=["C", "D"])
    self.assertEqual(result.index.names, ["A", "B"],
                     "index_col not correctly set")
    self.assertEqual(result.columns.tolist(), ["C", "D"],
                     "columns not set correctly with index_col")
def test_read_sql_delegate(self):
    # read_sql with a SQL query should give the same result as read_sql_query
    # (assert_frame_equal takes no message argument, so the intent is noted here)
    iris_frame1 = sql.read_sql_query("SELECT * FROM iris", self.conn)
    iris_frame2 = sql.read_sql("SELECT * FROM iris", self.conn)
    tm.assert_frame_equal(iris_frame1, iris_frame2)

    # read_sql with a table name should delegate to read_sql_table
    iris_frame1 = sql.read_sql_table('iris', self.conn)
    iris_frame2 = sql.read_sql('iris', self.conn)
    tm.assert_frame_equal(iris_frame1, iris_frame2)
def test_mixed_dtype_insert(self):
    # see GH6509
    s1 = Series(2**25 + 1, dtype=np.int32)
    s2 = Series(0.0, dtype=np.float32)
    df = DataFrame({'s1': s1, 's2': s2})

    # write and read again
    df.to_sql("test_read_write", self.conn, index=False)
    df2 = sql.read_sql_table("test_read_write", self.conn)

    tm.assert_frame_equal(df, df2, check_dtype=False, check_exact=True)
def test_default_type_conversion(self):
    df = sql.read_sql_table("types_test_data", self.conn)

    self.assertTrue(issubclass(df.FloatCol.dtype.type, np.floating),
                    "FloatCol loaded with incorrect type")
    self.assertTrue(issubclass(df.IntCol.dtype.type, np.integer),
                    "IntCol loaded with incorrect type")
    self.assertTrue(issubclass(df.BoolCol.dtype.type, np.bool_),
                    "BoolCol loaded with incorrect type")

    # Int column with NA values stays as float
    self.assertTrue(issubclass(df.IntColWithNull.dtype.type, np.floating),
                    "IntColWithNull loaded with incorrect type")

    # Bool column with NA values becomes object
    self.assertTrue(issubclass(df.BoolColWithNull.dtype.type, np.object_),
                    "BoolColWithNull loaded with incorrect type")
def test_default_type_conversion(self):
    df = sql.read_sql_table("types_test_data", self.conn)

    self.assertTrue(issubclass(df.FloatCol.dtype.type, np.floating),
                    "FloatCol loaded with incorrect type")
    self.assertTrue(issubclass(df.IntCol.dtype.type, np.integer),
                    "IntCol loaded with incorrect type")

    # MySQL has no real BOOL type (it's an alias for TINYINT)
    self.assertTrue(issubclass(df.BoolCol.dtype.type, np.integer),
                    "BoolCol loaded with incorrect type")

    # Int column with NA values stays as float
    self.assertTrue(issubclass(df.IntColWithNull.dtype.type, np.floating),
                    "IntColWithNull loaded with incorrect type")

    # Bool column with NA = int column with NA values => becomes float
    self.assertTrue(issubclass(df.BoolColWithNull.dtype.type, np.floating),
                    "BoolColWithNull loaded with incorrect type")
def test_default_type_conversion(self):
    df = sql.read_sql_table("types_test_data", self.conn)

    self.assertTrue(issubclass(df.FloatCol.dtype.type, np.floating),
                    "FloatCol loaded with incorrect type")
    self.assertTrue(issubclass(df.IntCol.dtype.type, np.integer),
                    "IntCol loaded with incorrect type")

    # sqlite has no boolean type, so integer type is returned
    self.assertTrue(issubclass(df.BoolCol.dtype.type, np.integer),
                    "BoolCol loaded with incorrect type")

    # Int column with NA values stays as float
    self.assertTrue(issubclass(df.IntColWithNull.dtype.type, np.floating),
                    "IntColWithNull loaded with incorrect type")

    # Non-native Bool column with NA values stays as float
    self.assertTrue(issubclass(df.BoolColWithNull.dtype.type, np.floating),
                    "BoolColWithNull loaded with incorrect type")
# assumes module-level imports: import psycopg2; import pandas.io.sql as sqlio
def migrate_db_to_rds(self, tb_name):
    src_conn = self.engine_transform_data.connect()
    dst_conn = self.engine_source.connect()
    try:
        df = sqlio.read_sql_table(tb_name, src_conn, schema='public')
        text_field = self.check_type(tb_name)
        df.to_sql(tb_name, dst_conn, schema='public', index=False,
                  if_exists='append', method='multi', dtype=text_field)
    except psycopg2.OperationalError:
        print('OperationalError!!!!')
    finally:
        self.engine_transform_data.dispose()
        self.engine_source.dispose()
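# check_type is called above but not defined in this snippet; a minimal
# sketch of what it might return, assuming its job is to force text-like
# columns to an explicit TEXT dtype for to_sql's dtype= argument. The
# column names below are hypothetical.
import sqlalchemy


def check_type(self, tb_name):
    # dtype mapping passed through to DataFrame.to_sql
    return {'remarks': sqlalchemy.types.Text,
            'description': sqlalchemy.types.Text}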
def encrypt_sql_database_values(base_db_name='', base_db_user='', base_db_password='',
                                enc_db_name='', enc_db_user='', enc_db_password=''):
    """
    Encryption of database to allow posting and using on AWS server for a
    personal portfolio. This database protects Hyundai's private emission
    report data, while still allowing the demo to visualize similar trends
    in the data.

    Two types of encryption:
        - Encrypt text fields by creating a dictionary to randomly shuffle
          letters/numbers for each value.
        - Encrypt numbers by selecting some factor based on vehicle name.

    :param base_db_name: name of database to be encrypted
    :param base_db_user: name of user for database
    :param base_db_password: password for database user
    :param enc_db_name: name of new database containing encrypted values
    :param enc_db_user: name of user for encrypted database
    :param enc_db_password: password for encrypted database user
    :return:
    """
    # Step 0: Import required libraries
    import psycopg2
    import sqlalchemy
    import numpy as np
    import pandas as pd
    import pandas.io.sql as pdsql
    from random import uniform as rand_unf

    # Step 1: Connect to the source database.
    connection_str = "{0}:{1}@localhost/{2}".format(
        base_db_user, base_db_password, base_db_name.replace("\\", ""))
    engine = sqlalchemy.create_engine("postgresql+psycopg2://" + connection_str)

    # Step 2: Load data as pandas DataFrames to minimize calls to the SQL database
    bag_em = pdsql.read_sql_table('bag_em', engine)
    dynamometer = pdsql.read_sql_table('dynamometer', engine)
    fuel = pdsql.read_sql_table('fuel', engine)
    fuel_economy = pdsql.read_sql_table('fuel_economy', engine)
    general = pdsql.read_sql_table('general', engine)
    pm = pdsql.read_sql_table('pm', engine)
    test_info = pdsql.read_sql_table('test_info', engine)
    time_info = pdsql.read_sql_table('time_info', engine)
    vehicle = pdsql.read_sql_table('vehicle', engine)
    j2951 = pdsql.read_sql_table('j2951', engine)

    # Step 3: Create link of test _id to vehicle
    id_veh_map = {}
    for _, row in vehicle.iterrows():
        id_veh_map[row['_id']] = row['vin']

    # Step 4: Loop through bag emissions and create a factor for each vehicle
    # print bag_em.head()
    veh_em_offset = {}
    for key, veh_name in id_veh_map.items():  # Python 3: items(), not iteritems()
        if veh_name not in veh_em_offset:
            veh_em_offset[veh_name] = rand_unf(0.6, 1.4)
    for col_name in bag_em.columns:
        if col_name not in ['_id', 'phase', 'emission_type']:
            bag_em[col_name] = bag_em[col_name] * \
                bag_em['_id'].apply(lambda x: veh_em_offset[id_veh_map[x]])
    # print bag_em.head()

    # Step 5: Create new random numbers for dynamometer coefficients
    # (different random numbers for inertia and the other coefficients)
    # print dynamometer.head()
    veh_dyno_offset = {}
    for key in veh_em_offset:
        veh_dyno_offset[key] = {}
        veh_dyno_offset[key]['inertia'] = rand_unf(0.6, 1.4)
        veh_dyno_offset[key]['coef'] = rand_unf(0.7, 1.3)
    for col_name in dynamometer.columns:
        if col_name == 'inertia':
            dynamometer[col_name] = dynamometer[col_name] * \
                dynamometer['_id'].apply(
                    lambda x: veh_dyno_offset[id_veh_map[x]]['inertia'])
        elif col_name not in ['_id', 'inertia']:
            dynamometer[col_name] = dynamometer[col_name] * \
                dynamometer['_id'].apply(
                    lambda x: veh_dyno_offset[id_veh_map[x]]['coef'])
    # print dynamometer.head()

    # Step 6: Randomize fuel numbers (multiply each numeric column by its
    # own random factor) and anonymize the fuel names
    # print fuel.head()
    fuel_name_map = {}
    i = 1
    for fuel_name in fuel['fuel'].unique():
        fuel_name_map[fuel_name] = 'Fuel #' + str(i)
        i += 1
    # print fuel_name_map
    fuel['fuel'] = fuel['fuel'].apply(lambda x: fuel_name_map[x])
    for col_name in fuel.columns:
        if col_name not in ['_id', 'fuel', 'r_f']:
            fuel_factor = rand_unf(0.7, 1.4)
            fuel[col_name] = fuel[col_name] * fuel_factor
    # print fuel.head()

    # Step 7: Randomize general info:
    #  - modify nmog_ratio, hcho_ratio, and driver/operator/requestor names
    #  - remove test_options, shift_list, event_list, and remarks
    # print general.head()
    general['nmog_ratio'] = general['nmog_ratio'] * rand_unf(0.7, 1.3)
    general['hcho_ratio'] = general['hcho_ratio'] * rand_unf(0.7, 1.3)
    driver_map = {}
    requestor_map = {}
    i = 1
    for driver in sorted(general['driver'].unique()):
        driver_map[driver] = 'Technician #' + str(i)
        i += 1
    for operator in sorted(general['operator'].unique()):
        if operator not in driver_map:
            driver_map[operator] = 'Technician #' + str(i)
            i += 1
    # print driver_map
    i = 1
    for requestor in general['requestor'].unique():
        requestor_map[requestor] = 'Engineer #' + str(i)
        i += 1
    # print requestor_map
    general['driver'] = general['driver'].apply(lambda x: driver_map[x])
    general['operator'] = general['operator'].apply(lambda x: driver_map[x])
    general['requestor'] = general['requestor'].apply(lambda x: requestor_map[x])
    general['test_description'] = ''
    general['event_list'] = ''
    general['remarks_pre'] = ''
    general['shift_list'] = ''
    general['remarks_post'] = ''
    general['test_options'] = ''
    # print general.head()

    # Step 8: Randomize pm data (mass_emission and mass_per_distance -
    # select the random factor by vehicle)
    for col_name in pm.columns:
        if col_name not in ['_id', 'phase']:
            pm[col_name] = pm[col_name] * \
                pm['_id'].apply(lambda x: veh_em_offset[id_veh_map[x]])

    # Step 9: Randomize vehicle info (all of it, except odometer and tire pressure)
    vehicle['axle_ratio'] = vehicle['axle_ratio'].apply(lambda x: np.nan)
    vehicle['curb_weight'] = dynamometer['inertia']
    vehicle['engine_catalyst'] = None
    vehicle['engine_code'] = None
    engine_desc_map = {}
    i = 1
    for desc in vehicle['engine_description'].unique():
        engine_desc_map[desc] = "Engine Type #" + str(i)
        i += 1
    vehicle['engine_description'] = vehicle['engine_description'].apply(
        lambda x: engine_desc_map[x])
    rand_factor = rand_unf(0.7, 1.3)
    vehicle['engine_displacement'] = vehicle['engine_displacement'].apply(
        lambda x: round(x * rand_factor, 1))
    vehicle['gross_weight'] = dynamometer['inertia']
    vehicle['shift_point'] = None
    test_group_map = {}
    i = 1
    for group in vehicle['test_group'].unique():
        if group is None:
            test_group_map[group] = None
        else:
            test_group_map[group] = "Test Group #" + str(i)
            i += 1
    vehicle['test_group'] = vehicle['test_group'].apply(lambda x: test_group_map[x])
    transmission_map = {}
    i = 1
    for trans in vehicle['transmission'].unique():
        transmission_map[trans] = "T" + str(i)
        i += 1
    vehicle['transmission'] = vehicle['transmission'].apply(lambda x: transmission_map[x])
    vehicle['trim_level'] = None
    vin_map = {}
    for vin in vehicle['vin'].unique():
        vin_map[vin] = id_generator(size=8)
    vehicle['vin'] = vehicle['vin'].apply(lambda x: vin_map[x])
    veh_map = {}
    i = 1
    for model in vehicle['vehicle_model'].unique():
        veh_map[model] = "Vehicle" + str(i)
        i += 1
    vehicle['vehicle_model'] = vehicle['vehicle_model'].apply(lambda x: veh_map[x])
    vehicle['tire_size'] = None

    # Step 10: Randomize j2951 data
    for col_name in j2951.columns:
        if col_name not in ['_id', 'cycle', 'cycle_id']:
            j2951[col_name] = j2951[col_name] * rand_unf(0.7, 1.3)

    # Step 11: Randomize _id names among all tables
    for key in id_veh_map:
        id_veh_map[key] = (id_generator(size=8) + '_' + id_generator(size=8)
                           + '_' + id_generator(size=3))
    bag_em['_id'] = bag_em['_id'].apply(lambda x: id_veh_map[x])
    dynamometer['_id'] = dynamometer['_id'].apply(lambda x: id_veh_map[x])
    fuel['_id'] = fuel['_id'].apply(lambda x: id_veh_map[x])
    fuel_economy['_id'] = fuel_economy['_id'].apply(lambda x: id_veh_map[x])
    general['_id'] = general['_id'].apply(lambda x: id_veh_map[x])
    pm['_id'] = pm['_id'].apply(lambda x: id_veh_map[x])
    test_info['_id'] = test_info['_id'].apply(lambda x: id_veh_map[x])
    time_info['_id'] = time_info['_id'].apply(lambda x: id_veh_map[x])
    vehicle['_id'] = vehicle['_id'].apply(lambda x: id_veh_map[x])
    j2951['_id'] = j2951['_id'].apply(lambda x: id_veh_map[x])

    # Write new tables to the new database (enc_db_name/user/password)

    ## Reset: clear the existing tables first
    table_names = pdsql.read_sql_query(
        "select table_name from information_schema.tables "
        "where table_schema='public'", engine)
    reset_query = ""
    for table in table_names.values:
        # each row of .values is a 1-element array, so take its first entry
        reset_query += "DELETE FROM " + table[0] + "; "
    conn = psycopg2.connect(
        host="postgres-instance.cm2vo1ykiz5s.us-east-2.rds.amazonaws.com",
        port=5432, dbname=enc_db_name, user=enc_db_user,
        password=enc_db_password)
    cur = conn.cursor()
    # execute the full accumulated query (reset_query[0] would be just its
    # first character)
    cur.execute(reset_query)
    conn.commit()
    cur.close()
    conn.close()

    connection_str = ("postgresql://{0}:{1}@postgres-instance.cm2vo1ykiz5s"
                      ".us-east-2.rds.amazonaws.com:5432/{2}").format(
        enc_db_user, enc_db_password, enc_db_name.replace("\\", ""))
    engine_aws = sqlalchemy.create_engine(connection_str)
    print('general...')
    general.to_sql('general', engine_aws, if_exists='append', index=False)
    print('dynamometer...')
    dynamometer.to_sql('dynamometer', engine_aws, if_exists='append', index=False)
    print('fuel...')
    fuel.to_sql('fuel', engine_aws, if_exists='append', index=False)
    print('fuel_economy...')
    fuel_economy.to_sql('fuel_economy', engine_aws, if_exists='append', index=False)
    print('pm...')
    pm.to_sql('pm', engine_aws, if_exists='append', index=False)
    print('test_info...')
    test_info.to_sql('test_info', engine_aws, if_exists='append', index=False)
    print('time_info...')
    time_info.to_sql('time_info', engine_aws, if_exists='append', index=False)
    print('vehicle...')
    vehicle.to_sql('vehicle', engine_aws, if_exists='append', index=False)
    print('j2951...')
    j2951.to_sql('j2951', engine_aws, if_exists='append', index=False)
    print('bag_em...')
    bag_em.to_sql('bag_em', engine_aws, if_exists='append', index=False)
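# id_generator is used in Steps 9 and 11 above but not defined in this
# snippet; a minimal sketch of what it might look like, assuming it returns
# a random alphanumeric string of the requested length.
import random
import string


def id_generator(size=6, chars=string.ascii_uppercase + string.digits):
    # draw `size` characters uniformly at random from `chars`
    return ''.join(random.choice(chars) for _ in range(size))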
import pandas as pd
from sqlalchemy import create_engine
from pandas.io import sql


def to_xml(df, filename=None, mode='w', keys=None):
    # restrict the exported fields to `keys` when given, otherwise export all
    def row_to_xml(row):
        fields = keys if keys is not None else row.index
        xml = ['<item>']
        for field in fields:
            xml.append('<field name="{0}"><![CDATA[{1}]]></field>'.format(field, row[field]))
        xml.append('</item>')
        return '\n'.join(xml)

    res = ('<?xml version="1.0" encoding="ISO-8859-1"?>\n'
           + '<data>\n'
           + '\n'.join(df.apply(row_to_xml, axis=1))
           + '\n</data>')
    if filename is None:
        return res
    with open(filename, mode) as f:
        f.write(res)

pd.DataFrame.to_xml = to_xml

engine = create_engine('mysql://*****:*****@localhost/alterrefront', echo=True)
#cnx = engine.connect()
#query = "SELECT url, title, description, media, price, period, category, subcategory FROM ads LIMIT 100"
#results = cnx.execute(query)
#fetchall = results.fetchall()

df = sql.read_sql_table("ads", con=engine)
# '^(.*rouen.*)$' matches the same rows as a plain substring search
dataframe = df[df['location'].str.contains('rouen')]
#cnx.close()

keys = ["url", "title", "description", "media", "price", "period",
        "category", "subcategory"]
dataframe.to_xml('flux.xml', keys=keys)
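# A small self-contained check of the to_xml helper above; the DataFrame and
# URL here are made up for illustration.
sample = pd.DataFrame({'url': ['http://example.com/1'],
                       'title': ['First ad'],
                       'price': [100]})
print(sample.to_xml(keys=['url', 'title']))
# Expected shape of the output:
# <?xml version="1.0" encoding="ISO-8859-1"?>
# <data>
# <item>
# <field name="url"><![CDATA[http://example.com/1]]></field>
# <field name="title"><![CDATA[First ad]]></field>
# </item>
# </data>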
def test_read_table(self):
    iris_frame = sql.read_sql_table("iris", con=self.conn)
    self._check_iris_loaded_frame(iris_frame)
def test_read_table_columns(self):
    iris_frame = sql.read_sql_table(
        "iris", con=self.conn, columns=['SepalLength', 'SepalLength'])
    tm.equalContents(iris_frame.columns.values,
                     ['SepalLength', 'SepalLength'])