def load_order(conn, filename): # this function writes to the `account_order` table # "order_id";"account_id";"bank_to";"account_to";"amount";"k_symbol" """ CREATE TABLE account_order ( order_id number NOT NULL, account_id number NOT NULL, bank_to char(2) NOT NULL, account_to number NOT NULL, amount decimal(10,2) NOT NULL, category char(3) NOT NULL, PRIMARY KEY (order_id), FOREIGN KEY(account_id) REFERENCES account (account_id), FOREIGN KEY(account_to) REFERENCES account (account_id) ); """ df = pd.read_csv(filename, sep=';', low_memory=False, nrows=NROWS) cols = {'k_symbol': 'category'} df.rename(columns=cols, inplace=True) log.info("account_order columns: {}".format(df.columns)) # Apply english appreviations # 'POJISTNE' - INS => stands for Insurance Payment # 'SIPO' - HSE => stands for Household Payment # 'LEASING' - LSE => stands for Leasing Payment # 'UVER' - LOA => stands for Loan Payment # ' ' - UNK => stands for Unknown df['category'].replace({'POJISTNE': 'INS'}, inplace=True) df['category'].replace({'SIPO': 'HSE'}, inplace=True) df['category'].replace({'LEASING': 'LSN'}, inplace=True) df['category'].replace({'UVER': 'LOA'}, inplace=True) df['category'].replace({' ': 'UNK'}, inplace=True) pprint(df) log.info('Starting data import for: {} ({} rows)'.format( filename, len(df))) # dtype : dict of column name to SQL type, default None dtype = { 'order_id': sqlt.Integer, 'account_id': sqlt.Integer, 'bank_to': sqlt.NCHAR(2), 'account_to': sqlt.Integer, 'amount': sqlt.Numeric, 'category': sqlt.NCHAR(2), } df.to_sql('account_order', con=conn, if_exists='append', index=False, dtype=dtype) log.info('Finished data import for: {}'.format(filename))
def test_reflect_nvarchar(self): metadata = self.metadata Table( "tnv", metadata, Column("nv_data", sqltypes.NVARCHAR(255)), Column("c_data", sqltypes.NCHAR(20)), ) metadata.create_all() m2 = MetaData(testing.db) t2 = Table("tnv", m2, autoload=True) assert isinstance(t2.c.nv_data.type, sqltypes.NVARCHAR) assert isinstance(t2.c.c_data.type, sqltypes.NCHAR) if testing.against("oracle+cx_oracle"): assert isinstance( t2.c.nv_data.type.dialect_impl(testing.db.dialect), cx_oracle._OracleUnicodeStringNCHAR, ) assert isinstance( t2.c.c_data.type.dialect_impl(testing.db.dialect), cx_oracle._OracleNChar, ) data = u("m’a réveillé.") with testing.db.connect() as conn: conn.execute(t2.insert(), dict(nv_data=data, c_data=data)) nv_data, c_data = conn.execute(t2.select()).first() eq_(nv_data, data) eq_(c_data, data + (" " * 7)) # char is space padded assert isinstance(nv_data, util.text_type) assert isinstance(c_data, util.text_type)
def test_reflect_nvarchar(self, metadata, connection): Table( "tnv", metadata, Column("nv_data", sqltypes.NVARCHAR(255)), Column("c_data", sqltypes.NCHAR(20)), ) metadata.create_all(connection) m2 = MetaData() t2 = Table("tnv", m2, autoload_with=connection) assert isinstance(t2.c.nv_data.type, sqltypes.NVARCHAR) assert isinstance(t2.c.c_data.type, sqltypes.NCHAR) if testing.against("oracle+cx_oracle"): assert isinstance( t2.c.nv_data.type.dialect_impl(connection.dialect), cx_oracle._OracleUnicodeStringNCHAR, ) assert isinstance( t2.c.c_data.type.dialect_impl(connection.dialect), cx_oracle._OracleNChar, ) data = "m’a réveillé." connection.execute(t2.insert(), dict(nv_data=data, c_data=data)) nv_data, c_data = connection.execute(t2.select()).first() eq_(nv_data, data) eq_(c_data, data + (" " * 7)) # char is space padded assert isinstance(nv_data, str) assert isinstance(c_data, str)
def load_account(conn, filename): df = pd.read_csv(filename, sep=';', low_memory=False, nrows=NROWS) df['date'] = pd.to_datetime(df['date'], format="%y%m%d") df.rename(columns={"date": "created_date"}, inplace=True) df.sort_values(by=['account_id'], inplace=True) log.info("columns: {}".format(df.columns)) """ Replace `frequency` strings with abbreviations - POPLATEK MESICNE changed to MONTHLY ISSUANCE (MO) - POPLATEK TYDNE changed to WEEKLY ISSUANCE (WE) - POPLATEK PO OBRATU change to ISSUANCE AFTER TRANSACTION (AT) """ df['frequency'].replace({'POPLATEK MESICNE': 'MO'}, inplace=True) df['frequency'].replace({'POPLATEK TYDNE': 'WE'}, inplace=True) df['frequency'].replace({'POPLATEK PO OBRATU': 'AT'}, inplace=True) log.info('Starting data import for: {} ({} rows)'.format( filename, len(df))) """ CREATE TABLE account ( account_id number NOT NULL, district_id number NOT NULL, frequency char(2) NOT NULL, created_date date NOT NULL, PRIMARY KEY (account_id), FOREIGN KEY(district_id) REFERENCES district (district_id) INITIALLY DEFERRED DEFERRABLE ) ; CREATE INDEX created_date ON account(created_date); """ dtype = { 'account_id': sqlt.Integer, 'district_id': sqlt.Integer, 'frequency': sqlt.NCHAR(2), 'created_date': sqlt.Date, } df.to_sql('account', con=conn, if_exists='append', index=False, dtype=dtype) # df.to_sql('account', con=conn, if_exists='append', index=False) log.info('Finished data import for: {}'.format(filename)) data = conn.execute("SELECT * FROM account WHERE ROWNUM <= 10").fetchall() pprint(data)
def load_loan(conn, filename): """ CREATE TABLE loan ( loan_id number NOT NULL, account_id number NOT NULL, loan_date date NOT NULL, amount decimal(10, 2) NOT NULL, duration number, payments decimal(10, 2) NOT NULL, status char(1) NOT NULL, PRIMARY KEY (loan_id), FOREIGN KEY(account_id) REFERENCES account (account_id) ); """ df = pd.read_csv(filename, sep=';', low_memory=False, nrows=NROWS) print("== loan df size: {}".format(len(df))) cols = { 'date': 'loan_date', } df.rename(columns=cols, inplace=True) log.info("{} columns: {}".format(filename, df.columns)) df['loan_date'] = pd.to_datetime(df['loan_date'], format="%y%m%d") dtype = { 'loan_id': sqlt.Integer, 'account_id': sqlt.Integer, 'loan_date': sqlt.Date, 'amount': sqlt.Numeric(10, 2), 'duration': sqlt.Integer, 'payments': sqlt.Numeric(10, 2), 'status': sqlt.NCHAR(1), } log.info('Starting data import for: {} ({} rows)'.format( filename, len(df))) df.to_sql('loan', con=conn, if_exists='append', index=False, dtype=dtype) log.info('Finished data import for: {}'.format(filename))
def load_transaction(conn, filename): """ "trans_id";"account_id";"date";"type";"operation";"amount";"balance";"k_symbol";"bank";"account" CREATE TABLE transaction ( trans_id number NOT NULL, account_id number NOT NULL, created_date date NOT NULL, trans_type char(2) NOT NULL, -- CR+=credit, DB=-debit operation char(3) NOT NULL, amount decimal(10,2) NOT NULL, balance decimal(10,2) NOT NULL, category char(3) NOT NULL, bank_to char(2), account_to number, PRIMARY KEY (trans_id), FOREIGN KEY(account_id) REFERENCES account (account_id) ); """ # df = pd.read_csv(filename, sep=';', low_memory=False, nrows=NROWS) df = pd.read_csv(filename, sep=';', low_memory=False) print("== df size: {}".format(len(df))) cols = { 'date': 'created_date', 'type': 'trans_type', 'k_symbol': 'category', 'bank': 'bank_to', 'account': 'account_to', } df.rename(columns=cols, inplace=True) log.info("{} columns: {}".format(filename, df.columns)) df['created_date'] = pd.to_datetime(df['created_date'], format="%y%m%d") df['trans_type'].replace({'PRIJEM': 'CR'}, inplace=True) # credit df['trans_type'].replace({'VYDAJ': 'DB'}, inplace=True) # debit df['trans_type'].replace({'VYBER': 'DB'}, inplace=True) # debit # operation # PREVOD NA UCET 208283 - remittance to another bank (REM) # PREVOD Z UCTU 65226 - collection from another bank (COL) # VKLAD 156743 - credit in cash (CRE) # VYBER 434918 - withdrawal in cash (WCA) # VYBER KARTOU 8036 - credit card withdrawal (WCC) df['operation'].replace({'PREVOD NA UCET': 'REM'}, inplace=True) df['operation'].replace({'PREVOD Z UCTU': 'COL'}, inplace=True) df['operation'].replace({'VKLAD': 'CRE'}, inplace=True) df['operation'].replace({'VYBER': 'WCA'}, inplace=True) df['operation'].replace({'VYBER KARTOU': 'WCC'}, inplace=True) # category # DUCHOD 30338 - old-age pension # POJISTNE 18500 - insurance payment # SANKC. UROK 1577 - sanction interest if negative balance # SIPO 118065 - household # SLUZBY 155832 - payment for statement # UROK 183114 - interest credited # UVER 13580 - loan payment df['category'].replace({'DUCHOD': 'PEN'}, inplace=True) # pension df['category'].replace({'POJISTNE': 'INS'}, inplace=True) # insurance df['category'].replace({'SANKC. UROK': 'INB'}, inplace=True) # interest negative balance df['category'].replace({'SIPO': 'HSE'}, inplace=True) # household df['category'].replace({'SLUZBY': 'PST'}, inplace=True) # payment for statement df['category'].replace({'UROK': 'INC'}, inplace=True) # interest credited df['category'].replace({'UVER': 'LOA'}, inplace=True) # loan payment # print("operation: {}".format(df.groupby('operation').size())) # print("category: {}".format(df.groupby('category').size())) df['category'] = df['category'].replace(np.nan, 'UNK') df['category'] = df['category'].replace(' ', 'UNK') df['category'] = df['category'].replace('', 'UNK') df['operation'] = df['operation'].replace(np.nan, 'UNK') df['operation'] = df['operation'].replace(' ', 'UNK') df['operation'] = df['operation'].replace('', 'UNK') print("trans_type: {}".format(df.groupby('trans_type').size())) print("operation: {}".format(df.groupby('operation').size())) print("category: {}".format(df.groupby('category').size())) dtype = { 'trans_id': sqlt.Integer, 'account_id': sqlt.Integer, 'create_date': sqlt.Date, 'trans_type': sqlt.NCHAR(2), 'operation': sqlt.NCHAR(3), 'amount': sqlt.Numeric(10, 2), 'balance': sqlt.Numeric(10, 2), 'category': sqlt.NCHAR(3), 'bank_to': sqlt.NCHAR(2), 'account_to': sqlt.Integer, } log.info('Starting data import for: {} ({} rows)'.format( filename, len(df))) # df2 = df.iloc[269000:270000] # print("== df2 size: {}".format(len(df2))) # df2.to_sql('transaction2', con=conn, if_exists='append', index=False, dtype=dtype, chunksize=200) df.to_sql('transaction', con=conn, if_exists='append', index=False, dtype=dtype, chunksize=200) log.info('Finished data import for: {}'.format(filename))