def csv_load_to_db(destination_folder, filename, insert_query_file, by_rows_batch=10000):
    """Load one batch of rows from a CSV file into the MySQL ``sales`` table.

    Resumes from the table's current row count, so each invocation loads the
    next ``by_rows_batch`` rows of the file.

    Args:
        destination_folder: downloaded-files directory, e.g. 'data/'.
        filename: name of the csv file, e.g. 'filename.csv'.
        insert_query_file: path to a .sql file, e.g. 'path/local/query.sql'.
        by_rows_batch: rows to insert per run (default 10000).
    """
    conn = MySqlHook(mysql_conn_id='mysql_localhost').get_conn()
    try:
        with open(insert_query_file, 'r') as sql_file:
            # The second ';'-separated statement in the file is the INSERT.
            insert_query = sql_file.read().split(';')[1]

        cur = conn.cursor()
        cur.execute('use sales_records_airflow')
        cur.execute('select count(*) from sales LIMIT 1')
        # Add one so the csv header line is skipped along with rows already
        # loaded by previous runs.
        row_count = cur.fetchone()[0] + 1

        # BUG FIX: the original used `row_count is 1` (identity test on an
        # int — undefined behavior outside CPython's small-int cache) and
        # duplicated the identical loading loop in both branches; it also had
        # an unreachable `elif row_count == 50001` after `row_count > 1`.
        print('empty' if row_count == 1 else 'not empty')

        with open(destination_folder + filename, 'r') as csv_file:
            # Skip header + previously loaded rows, take one batch.
            for row in islice(csv_file, row_count, row_count + by_rows_batch):
                val = row.rstrip().split(',')
                # Columns 5 and 7 hold m/d/Y dates; convert for DATE columns.
                val[5] = datetime.strptime(val[5], '%m/%d/%Y').date()
                val[7] = datetime.strptime(val[7], '%m/%d/%Y').date()
                cur.execute(insert_query, val)
                conn.commit()
    finally:
        # Original leaked the connection (and both file handles) on error.
        conn.close()
def check_data(task_instance, create_table_query_file):
    """Airflow branch callable: pick the next task from the sales-table state.

    Pushes the current row count to XCom, then returns the id of the task to
    run next. If the table does not exist yet, executes the setup statements
    from *create_table_query_file* and signals that the csv must be fetched.

    Args:
        task_instance: Airflow task instance (used for ``xcom_push``).
        create_table_query_file: path to a .sql file with setup statements.

    Returns:
        'check_dataset' when the table already holds all 50000 rows,
        'csv_file_exist' when more rows must be appended,
        'csv_file_not_exist' when the table had to be created from scratch.
    """
    conn = MySqlHook(mysql_conn_id='mysql_localhost').get_conn()
    cur = conn.cursor()
    try:
        cur.execute('use sales_records_airflow')
        cur.execute('select count(*) from sales')
        total_rows = cur.fetchone()[0]
        task_instance.xcom_push(key='mysql_total_rows', value=total_rows)
        # BUG FIX: the original tested `type(total_rows) is int` FIRST, which
        # is always true for COUNT(*), so the `total_rows == 50000` branch was
        # unreachable and the DAG could never report 'up to date'.
        if total_rows == 50000:
            print('up to date')
            return 'check_dataset'
        print('appending new data')
        return 'csv_file_exist'
    except conn.OperationalError:
        # BUG FIX: PEP 249 exposes exception classes on the CONNECTION, not
        # the cursor — `except cur.OperationalError` itself raised
        # AttributeError instead of catching the missing-table error.
        print('sql_file execute')
        with open(create_table_query_file, 'r') as sql_file:
            sql_query = sql_file.read()
        for query in sql_query.split(';', maxsplit=2):
            cur.execute('{}'.format(query))
        conn.commit()
        return 'csv_file_not_exist'