def ingest(record_type, **kwargs):
    """
    Run the ingestion flow for the given :record_type:.

    Pipeline: acquire -> (optionally) validate & prune empty keys ->
    (optionally) write data files -> (optionally) load.

    kwargs:
        no_validate: truthy to skip validation entirely.
        ref: reference object passed through to validate_data.
        output: directory to write data files into (only used if it exists).
        start_time / end_time: passed through to output_data.
        no_load: truthy to skip the final load step.
    """
    datasource = acquire_data(record_type, **kwargs)

    if not kwargs.get("no_validate"):
        datasource = validate_data(datasource, record_type, kwargs.get("ref"))
        # Prune keys whose validated payload is empty/falsy.  The original
        # also tested len(v) < 1, which is redundant with the truthiness
        # check (and raises TypeError on unsized values); `if v` covers it.
        datasource = {k: v for k, v in datasource.items() if v}
    else:
        print("Skipping data validation")

    # Write data files only when an existing output directory was supplied.
    # NOTE(review): a missing directory is silently skipped — presumably
    # intentional best-effort behavior; confirm with callers.
    output = kwargs.get("output")
    if output and os.path.exists(output):
        print(f"Writing data files to {output}")
        output_data(output, datasource, record_type,
                    kwargs.get("start_time"), kwargs.get("end_time"))

    # Load only when requested and there is something left to load.
    if not kwargs.get("no_load") and datasource:
        load_data(datasource, record_type, **kwargs)

    print(f"{record_type} complete")
def prepare_data():
    """Acquire the raw data and push it through every cleaning step in order."""
    frame = acquire_data()
    # Ordered pipeline: each step consumes and returns the DataFrame.
    pipeline = (
        lambda d: drop_useless_columns(d, .3),
        drop_duplicated_observation,
        drop_ineffecitve_columns,
        drop_null,
        create_new_features,
        creat_dummy_var,
        regassign_dtypes,
        drop_outliers,
    )
    for step in pipeline:
        frame = step(frame)
    return frame
def prepare_data():
    """Run all of the preparation functions defined above, in order, and
    return the fully cleaned DataFrame."""
    # Same sequence as the step-by-step version, written as one composition:
    # acquire -> drop useless -> dedupe -> drop ineffective -> drop nulls ->
    # new features -> dummies -> dtypes -> outliers.
    return drop_outliers(
        regassign_dtypes(
            creat_dummy_var(
                create_new_features(
                    drop_null(
                        drop_ineffecitve_columns(
                            drop_duplicated_observation(
                                drop_useless_columns(acquire_data(), .3)
                            )
                        )
                    )
                )
            )
        )
    )
def prep_data():
    """Acquire the data, drop redundant columns/rows, and encode yes/no columns."""
    df = acquire_data()

    # customer_id and gender are not useful features; total_charges is highly
    # correlated with monthly_charges, so it is redundant.
    df = df.drop(columns=['customer_id', 'gender', 'total_charges'])

    # Drop customers with tenure under one month.
    df = df[df.tenure != 0]

    # Every column except these numeric/id ones holds 'yes'/'no' style answers
    # ("no phone"/"no internet service" count as no) — run them through encoder.
    encode_cols = df.drop(columns=['tenure',
                                   'senior_citizen',
                                   'monthly_charges',
                                   'internet_service_type_id',
                                   'contract_type_id',
                                   'payment_type_id']).columns
    for col in encode_cols:
        df[col] = df[col].apply(encoder)

    return df
def prepare_data():
    """Acquire the pokemon data, drop unused columns, and fix capture_rate."""
    print("Getting data...")
    df = acquire_data()
    print("Data Acquired.")
    print("Preparing Data...")

    # Columns we never use: abilities, all of the against_* matchup columns,
    # plus classification/japanese name.  ('classfication' is the dataset's
    # actual, misspelled column name — do not "fix" it here.)
    against_cols = ['against_' + t for t in (
        'bug', 'dark', 'dragon', 'electric', 'fairy', 'fight', 'fire',
        'flying', 'ghost', 'grass', 'ground', 'ice', 'normal', 'poison',
        'psychic', 'rock', 'steel', 'water')]
    df = df.drop(columns=['abilities', *against_cols,
                          'classfication', 'japanese_name'])

    # Row 773 has a bad capture_rate value — patch it before the cast.
    df.at[773, 'capture_rate'] = '30'
    # capture_rate arrives as strings; make it numeric.
    df['capture_rate'] = df['capture_rate'].astype(int)

    print("Finished preparing.")
    return df
# - Explore the pandas DataFrame.diff() function. Create a new column that is
#   the result of the current sales - the previous days sales.
def add_sales_difference(df):
    """Return a copy of df with a `diff_from_last_day` column.

    The column holds the day-over-day change in `sale_total`
    (current row minus previous row); the first row is NaN.
    The input DataFrame is not mutated.
    """
    df = df.copy()
    df['diff_from_last_day'] = df.sale_total.diff()
    return df


# - Write a function to set the index to be the datetime variable.
#   (done in the return of parse_date)
def prep_store_data(df):
    """Prepare the store data: parse the sales date, then clean the sales
    columns.  Returns a new DataFrame; the input is not mutated.

    NOTE(review): an earlier version also ran add_date_parts /
    add_sales_difference here; those steps were deliberately dropped.
    """
    df = df.copy()
    df = parse_sales_date(df)
    df = improve_sales_data(df)
    return df


if __name__ == '__main__':
    df = acquire_data()
    df = prep_store_data(df)
    # The original computed head(3) and discarded it (notebook residue);
    # print it so the smoke-check output is actually visible.
    print(df.head(3))