"dim_date": dim_date, "dim_weather": dim_weather, "dim_location": dim_location, "dim_crime": dim_crime, } return star_tables if __name__ == "__main__": folder_path = "C:\\Users\\SSrih\\OneDrive\\UChicago\\DEP\\Project\\data" \ "\\Crime and Weather\\" # data_file_name = "CrimeWeather2010.csv" # data_file_name = "Crime2010Raw.csv" data_file_name = "Crime20161718.csv" data_file_path = os.path.join(folder_path, data_file_name) data_extractor = DataExtractor() data_frame = data_extractor.read_csv(fpath=data_file_path, nrows_to_read=5000) # print(data_frame.head()) data_worker = DataWorker() print(data_frame.isnull().sum().sum()) data_worker.process_pipeline(data_frame) print(data_frame.isnull().sum().sum())
def main():
    """Load the raw crime/weather CSV into the database and notify the hub.

    Phases, in order:
      1. Connect to the database using the module-level ``DB_IP``, ``DB``,
         ``DB_UNAME`` and ``DB_PWD`` settings.
      2. Read the CSV found at ``FOLDER_PATH``/``data_file_name``.
      3. Upload the resulting data frame into ``RAW_TABLE_NAME``.
      4. Disconnect from the database.
      5. Publish an update message to the data hub exchange.

    Returns:
        The status returned by ``DataLoader.connect`` when it is anything
        other than 1 (connection not established). ``None`` in every other
        case, including when the upload reports failure.
    """
    total_start_time = time.time()

    # -------------------------------------------------------------------- #
    # 0. RESOLVE INPUT FILE
    # -------------------------------------------------------------------- #
    data_file_name = "Crime_Weather_Cleaned_2017.csv"
    data_file_path = os.path.join(FOLDER_PATH, data_file_name)

    # -------------------------------------------------------------------- #
    # 1. ESTABLISH DATABASE CONNECTION
    # -------------------------------------------------------------------- #
    print("\n\n\t\t **** 1. DATABASE CONNECTION **** ")
    port = '3306'
    data_loader = DataLoader()
    ret = data_loader.connect(host=DB_IP,
                              database=DB,
                              user=DB_UNAME,
                              password=DB_PWD,
                              port=port)
    if ret != 1:
        print(" Connection not established. Try again")
        print(" Check internet connectivity")
        return ret

    # From here on the connection is open; the ``finally`` clause guarantees
    # it is released on success, on upload failure, and when extraction or
    # loading raises.
    try:
        # ---------------------------------------------------------------- #
        # 2. DATA EXTRACTION PHASE
        # ---------------------------------------------------------------- #
        print("\n\n\t\t **** 2. DATA EXTRACTION **** ")
        data_extractor = DataExtractor()
        # NOTE(review): nrows_to_read=-1 presumably means "read every row";
        # confirm against DataExtractor.read_csv.
        data_frame = data_extractor.read_csv(fpath=data_file_path,
                                             nrows_to_read=-1)

        # ---------------------------------------------------------------- #
        # 3. DATA LOADING PHASE
        # ---------------------------------------------------------------- #
        print("\n\n\t\t **** 3. DATA LOADING **** ")
        ret = data_loader.load_full_table(data_frame,
                                          table_name=RAW_TABLE_NAME)
        if ret == -1:
            print(" Could not upload to database ")
            return
        print("Successfully populated database")
    finally:
        # ---------------------------------------------------------------- #
        # 4. DISCONNECT THE DATABASE AND CLEAN UP MEMORY
        # ---------------------------------------------------------------- #
        data_loader.disconnect()

    # -------------------------------------------------------------------- #
    # 5. SEND A MESSAGE TO THE DATA HUB AS AN UPDATE
    # -------------------------------------------------------------------- #
    print(" Sending message to data hub for update....", end="")
    messenger = Messenger()

    # Connect to the data hub
    messenger.connect(host=DATA_HUB_IP,
                      uname=DATA_HUB_UNAME,
                      pwd=DATA_HUB_PWD)

    # Connect to the exchange
    messenger.connect_to_exchange(ex_name=EX_NAME)

    # Send update
    message = "Database updated with latest rows"
    messenger.send_message_to_exchange(ex_name=EX_NAME,
                                       message=message,
                                       topic=TOPIC)
    print("sent")

    total_end_time = time.time()
    print(" Total time taken :", total_end_time - total_start_time)