Exemple #1
0
def ingest(record_type, **kwargs):
    """
    Run the ingestion flow for the given :record_type:.

    The flow is: acquire -> validate (unless ``no_validate``) -> write to
    files (only when ``output`` names an existing path) -> load (unless
    ``no_load`` or nothing is left to load).

    All keyword arguments are forwarded to ``acquire_data`` and
    ``load_data``. The ones read directly here are:

    :no_validate: truthy to skip validation and the empty-entry cleanup.
    :ref: passed through to ``validate_data``.
    :output: path to write data files to; ignored unless it already exists.
    :start_time: passed through to ``output_data``.
    :end_time: passed through to ``output_data``.
    :no_load: truthy to skip the load step.
    """
    datasource = acquire_data(record_type, **kwargs)

    if kwargs.get("no_validate"):
        print("Skipping data validation")
    else:
        datasource = validate_data(datasource, record_type, kwargs.get("ref"))

        # Prune entries whose value is empty. The keys are snapshotted first
        # because entries are deleted while walking them.
        for key in list(datasource.keys()):
            entry = datasource[key]
            if not entry or len(entry) < 1:
                del datasource[key]

    # Files are only written when a target was given and it already exists.
    output = kwargs.get("output")
    if output and os.path.exists(output):
        print(f"Writing data files to {output}")
        output_data(
            output,
            datasource,
            record_type,
            kwargs.get("start_time"),
            kwargs.get("end_time"),
        )

    # Loading is skipped on request, or when there is nothing to load.
    skip_load = kwargs.get("no_load")
    if not skip_load and len(datasource) > 0:
        load_data(datasource, record_type, **kwargs)

    print(f"{record_type} complete")
Exemple #2
0
def prepare_data():
    """Acquire the data and run it through every preparation step in order.

    Each step receives the value returned by the previous step; the value
    returned by the last step is returned to the caller.
    """
    # The first step is the only one that takes an extra argument
    # (.3 -- presumably a threshold; confirm against drop_useless_columns).
    data = drop_useless_columns(acquire_data(), .3)

    # The remaining steps all share the same one-argument shape, so they are
    # applied in sequence.
    remaining_steps = (
        drop_duplicated_observation,
        drop_ineffecitve_columns,
        drop_null,
        create_new_features,
        creat_dummy_var,
        regassign_dtypes,
        drop_outliers,
    )
    for step in remaining_steps:
        data = step(data)
    return data
def prepare_data():
    '''Run every preparation function, in order, on freshly acquired data.

    Returns whatever the final step (drop_outliers) returns.
    '''
    # Calls are nested in pairs; the innermost call of each line runs first,
    # so the overall order of the steps is unchanged.
    data = drop_useless_columns(acquire_data(), .3)
    data = drop_ineffecitve_columns(drop_duplicated_observation(data))
    data = create_new_features(drop_null(data))
    data = regassign_dtypes(creat_dummy_var(data))
    return drop_outliers(data)
def prep_data():
    """Acquire the data, drop unused columns and rows, and encode the rest.

    Steps, in order:
      1. Drop the 'customer_id', 'gender' and 'total_charges' columns.
      2. Drop every row whose tenure is 0.
      3. Apply ``encoder`` to every column except the ones listed in
         ``not_encoded`` below.

    Returns the resulting frame.
    """
    df = acquire_data()

    # Drop unnecessary columns.
    # total_charges is highly correlated with monthly_charges, so it is
    # redundant.
    df = df.drop(columns=['customer_id', 'gender', 'total_charges'])

    # Drop rows with a tenure of 0 (less than a month). Take an explicit
    # copy so the column assignments below act on an independent frame
    # rather than on a filtered selection, which would otherwise raise
    # SettingWithCopyWarning on pandas versions without Copy-on-Write.
    df = df[df.tenure != 0].copy()

    # Encode all the columns with 'yes' and 'no' answers.
    # Per the original note, values meaning "no phone service" or
    # "no internet service" are all counted as no (that mapping lives in
    # ``encoder``, which is defined elsewhere).
    not_encoded = [
        'tenure',
        'senior_citizen',
        'monthly_charges',
        'internet_service_type_id',
        'contract_type_id',
        'payment_type_id',
    ]
    for column in df.drop(columns=not_encoded).columns:
        df[column] = df[column].apply(encoder)
    return df
Exemple #5
0
def prepare_data():
    """Acquire the pokemon data, drop unused columns and fix capture_rate.

    Progress messages are printed before and after each phase. Returns the
    prepared frame, with 'capture_rate' converted to int.
    """
    print("Getting data...")
    pokemon = acquire_data()
    print("Data Acquired.")
    print("Preparing Data...")

    # Columns that are not needed downstream. Names are kept exactly as the
    # original code spells them.
    unused_columns = [
        'abilities',
        'against_bug', 'against_dark', 'against_dragon', 'against_electric',
        'against_fairy', 'against_fight', 'against_fire', 'against_flying',
        'against_ghost', 'against_grass', 'against_ground', 'against_ice',
        'against_normal', 'against_poison', 'against_psychic', 'against_rock',
        'against_steel', 'against_water',
        'classfication',
        'japanese_name',
    ]
    pokemon = pokemon.drop(columns=unused_columns)

    # Overwrite the capture rate at row label 773 by hand (presumably the
    # raw value cannot be converted to int -- confirm against the data).
    pokemon.at[773, 'capture_rate'] = '30'

    # With that value fixed, the whole column is converted to int.
    pokemon['capture_rate'] = pokemon['capture_rate'].astype(int)

    print("Finished preparing.")
    return pokemon

# - Explore the pandas DataFrame.diff() function. Create a new column that is the result of the current day's sales minus the previous day's sales.
def add_sales_difference(df):
    """Return a copy of ``df`` with a 'diff_from_last_day' column added.

    The new column holds ``df.sale_total.diff()``: each row's sale_total
    minus the previous row's. The first row has no predecessor, so its
    value is NaN. The frame passed in is left unmodified.
    """
    # assign() works on a copy, so the caller's frame is not touched.
    return df.assign(diff_from_last_day=df.sale_total.diff())


# - Write a function to set the index to be the datetime variable.
#     - done in the return of parse_date
# def prep_store_data(df):
#     df = df.copy()
#     df = parse_sales_date(df)
#     df = add_date_parts(df)
#     df = improve_sales_data(df)
#     df = add_sales_difference(df)
#     return(df)


def prep_store_data(df):
    """Return a prepared version of ``df`` without modifying the input.

    A copy of ``df`` is passed through ``parse_sales_date`` and then
    ``improve_sales_data``; the result of the latter is returned.
    """
    # Work on a copy so the caller's frame is left as it was.
    return improve_sales_data(parse_sales_date(df.copy()))


if __name__ == '__main__':
    # Script entry point: acquire the data, prepare it, and show a preview.
    df = acquire_data()
    df = prep_store_data(df)
    # A bare `df.head(3)` only displays in an interactive session; its value
    # is discarded when the file is run as a script, so print it explicitly.
    print(df.head(3))