Example #1
def create_mysql_connection():

    # Recover the DB connection parameters
    hostname = get_param('DB', 'mysql_hostname')
    db_name = get_param('DB', 'db_name')
    user_name = get_param('DB', 'user_name')
    pwd = get_param('DB', 'pwd')

    # Build the connection string and create the SQLAlchemy engine
    db_connection_str = f'mysql+mysqlconnector://{user_name}:{pwd}@{hostname}/{db_name}'
    db_connection = sqlalchemy.create_engine(db_connection_str)

    return db_connection
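Note: this snippet assumes the mysql-connector-python driver, a module-level import sqlalchemy, and a get_param helper that is not shown in these examples. A minimal sketch of such a helper, assuming an INI-style config.ini with a [DB] section (the file name and layout are assumptions, not part of the original code):

import configparser

def get_param(section, key, config_path='config.ini'):
    # Hypothetical helper: read a single value from an INI-style config file.
    # The config.ini name and the section/key layout are assumptions for illustration.
    parser = configparser.ConfigParser()
    parser.read(config_path)
    return parser[section][key]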
Example #2
def read_table_to_df(dim):
    # Recover the DB parameters and open a raw connection
    # (alternatively: db_connection = create_mysql_connection() from Example #1)
    hostname = get_param('DB', 'mysql_hostname')
    db_name = get_param('DB', 'db_name')
    user_name = get_param('DB', 'user_name')
    pwd = get_param('DB', 'pwd')

    db_connection = mysql.connector.connect(host=hostname,
                                            user=user_name,
                                            password=pwd,
                                            database=db_name)
    query = "select * FROM " + dim
    return pd.read_sql(query, con=db_connection)
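Since Example #1 already builds a SQLAlchemy engine, the same query could reuse it instead of opening a second raw mysql.connector connection (newer pandas versions also warn when read_sql is given a DBAPI connection other than sqlite3). A sketch of that variant, assuming the table name in dim comes from trusted configuration:

import pandas as pd

def read_table_to_df(dim):
    # Variant sketch: reuse the engine from create_mysql_connection() (Example #1)
    # instead of opening a separate mysql.connector connection.
    engine = create_mysql_connection()
    query = "SELECT * FROM " + dim  # dim is assumed to be an internal, trusted table name
    return pd.read_sql(query, con=engine)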
Example #3
def load_compound_dims(source_data):
    
    dim_file = get_param('DIM','compound')
    compound_dim = get_dictionary(dim_file)

    created_dims = []

    for key in compound_dim:
        i = 1
        new_column_name = key + '_desc'  # description column for this dimension (also needed when the dim has a single source column)
        for column_name in compound_dim[key]:
            if column_name in source_data:
                if i == 1:
                    df_acum = dim_by_column(source_data, key, column_name, 0)
                else:
                    base_id = len(df_acum.index)
                    df_acum = pd.concat([df_acum, dim_by_column(source_data, key, column_name, base_id)], ignore_index=True)
                    df_acum = df_acum[new_column_name].drop_duplicates().sort_values().reset_index(drop=True).dropna().to_frame()
                i += 1
            else:
                raise Exception("ERROR: The column "+ column_name + " does not exist - Verify and correct " + dim_file)
        clean_df = df_acum.dropna()
        clean_df['Id'] = clean_df.index + 1
        column_list = ['Id', new_column_name]
        final_df = clean_df[column_list]
        load_table('dim_' + key, final_df, 'replace')
        created_dims.append(key)

    if len(compound_dim) != len(created_dims):
        raise Exception("ERROR: there were some dimensions that couldn't been created - \n \
            Dimension List: "+ ','.join(list(compound_dim.keys()))+ "\nCreated Dimensions" \
                + ','.join(created_dims))
    else:
        print("Compound dims created: " + ', '.join(created_dims))
        return created_dims
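The dim_by_column helper used above is not included in these examples. A purely illustrative version, consistent with how it is called (source frame, dimension key, source column name, starting offset) and with the '<key>_desc' and 'Id' columns expected afterwards, might look like this:

import pandas as pd

def dim_by_column(source_data, key, column_name, base_id):
    # Illustrative sketch only -- the real helper is not shown in these examples.
    # Build a dimension frame from the distinct values of one source column,
    # exposing them as '<key>_desc' with Ids starting after base_id.
    desc = (source_data[column_name]
            .dropna()
            .drop_duplicates()
            .sort_values()
            .reset_index(drop=True)
            .rename(key + '_desc'))
    dim = desc.to_frame()
    dim['Id'] = dim.index + 1 + base_id
    return dim[['Id', key + '_desc']]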
Example #4
def load_default_dims(source_data):
    
    dim_file = get_param('DIM','default')
    default_dim = get_dictionary(dim_file)

    created_dims = []

    for key in default_dim:
        if default_dim[key] in source_data:
            df = dim_by_column(source_data, key, default_dim[key], 0)
            clean_df = df.dropna()
            try:
                load_table('dim_' + key, clean_df, 'replace')
                created_dims.append(key)
            except Exception as exc:
                print("An error occurred while loading default dimension " + key + ": " + str(exc))
        else:
            raise Exception("ERROR: The column "+ default_dim[key] + " does not exist - Verify and correct " + dim_file)

    if len(default_dim) != len(created_dims):
        raise Exception("ERROR: there were some dimensions that couldn't been created - \n \
            Dimension List: "+ ','.join(list(default_dim.keys()))+ "\nCreated Dimensions" \
                + ','.join(created_dims))
    else:
        print("Default dims created: " + ', '.join(created_dims))
        return created_dims     
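Examples #3 and #4 both call a load_table helper that is not shown here. Assuming it simply writes a DataFrame to MySQL, an illustrative version built on the engine from Example #1 and pandas.DataFrame.to_sql could be:

def load_table(table_name, df, if_exists_mode):
    # Illustrative sketch only -- the real helper is not shown in these examples.
    # Write df to the given MySQL table through the SQLAlchemy engine
    # from create_mysql_connection() (Example #1).
    engine = create_mysql_connection()
    df.to_sql(table_name, con=engine, if_exists=if_exists_mode, index=False)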
Example #5
def lookup_compound_dim(df):
    
    dim_file = get_param('DIM','compound')
    compound_dim = get_dictionary(dim_file)

    replaced_dims = []
    i = 1
    for key in compound_dim:
        table_name = "dim_" + key
        dim_df = read_table_to_df(table_name)
        j = 1    
        for column_name in compound_dim[key]:
            if column_name in df:
                if i == 1 and j == 1:
                    replaced_df = replace_attrib_for_id(df, dim_df, column_name, key + "_desc")
                else:
                    replaced_df = replace_attrib_for_id(replaced_df, dim_df, column_name, key + "_desc")
                j += 1        
            else:
                raise Exception("ERROR: The column "+ column_name + " does not exist - Verify and correct " + dim_file)
        replaced_dims.append(key)
        i += 1
    if len(compound_dim) != len(replaced_dims):
        raise Exception("ERROR: there were some dimensions that couldn't been created - \n \
                Dimension List: "+ ','.join(list(compound_dim.keys()))+ "\nCreated Dimensions: " \
                    + ','.join(replaced_dims))
    
    return replaced_df
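Similarly, replace_attrib_for_id is not shown in these examples. An illustrative version consistent with its call signature (fact frame, dimension frame, source column, dimension description column) could swap the textual attribute for the dimension Id via a left join:

import pandas as pd

def replace_attrib_for_id(df, dim_df, column_name, desc_column):
    # Illustrative sketch only -- the real helper is not part of these examples.
    # Replace the text in df[column_name] with the matching Id from dim_df,
    # joining df[column_name] against dim_df[desc_column].
    merged = df.merge(dim_df[['Id', desc_column]],
                      how='left',
                      left_on=column_name,
                      right_on=desc_column)
    merged[column_name] = merged['Id']
    return merged.drop(columns=['Id', desc_column])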
Example #6
def lookup_default_dim(df):
    
    dim_file = get_param('DIM','default')
    default_dim = get_dictionary(dim_file)

    replaced_dims = []
    i = 1
    for key in default_dim:
        
        if default_dim[key] in df:
            table_name = "dim_" + key
            dim_df = read_table_to_df(table_name)
            if i == 1:
                replaced_df = replace_attrib_for_id(df, dim_df, default_dim[key], key + "_desc")
            else:
                replaced_df = replace_attrib_for_id(replaced_df, dim_df, default_dim[key], key + "_desc")
            replaced_dims.append(key)
        else:
            raise Exception("ERROR: The column "+ default_dim[key] + " does not exist - Verify and correct " + dim_file)
        i +=1
    if len(default_dim) != len(replaced_dims):
        raise Exception("ERROR: there were some dimensions that couldn't been created - \n \
            Dimension List: "+ ','.join(list(default_dim.keys()))+ "\nCreated Dimensions" \
                + ','.join(replaced_dims))
    
    return replaced_df
Example #7
def extract_data():
    # Recover the API parameters
    client_name = get_param('API', 'client_name')
    file_name = get_param('API', 'file_name')
    limit_size = get_param('API', 'limit_size')
    #limit_size = 20000
    client = Socrata(client_name, None)
    try:
        results = client.get(file_name, limit=limit_size)
        #results = client.get_all(file_name)

        df = pd.DataFrame.from_records(results)

        # Normalize column names (spaces to underscores, fix the 'sytem' typo in the source data)
        df.columns = df.columns.str.replace(' ', '_')
        df.columns = df.columns.str.replace('sytem', 'system')

    except Exception as exc:
        print("Something went wrong while retrieving data: " + str(exc))
        raise

    return df
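Taken together, the examples suggest a small ETL flow along these lines. The orchestration below is an assumption (the actual entry point is not shown), and 'fact_table' is a placeholder name:

def run_pipeline():
    # Hypothetical wiring of the functions shown above.
    source_data = extract_data()                 # Example #7: pull raw records from the Socrata API
    load_default_dims(source_data)               # Example #4: single-column dimensions
    load_compound_dims(source_data)              # Example #3: dimensions built from several columns
    fact_df = lookup_default_dim(source_data)    # Example #6: swap attributes for dimension Ids
    fact_df = lookup_compound_dim(fact_df)       # Example #5
    load_table('fact_table', fact_df, 'replace') # table name is a placeholder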