Code Example #1
def download__update_chart_by_parameters(signal, start_date, end_date, limit):
    if signal is None:
        raise PreventUpdate

    logging.info(f'download__update_chart_by_parameters - start_date: {start_date}; '
                 f'end_date: {end_date}; limit: {limit}')

    start_date, end_date = __convert_dates_from_str_to_date(start_date, end_date)

    # if start date is greater than end date or limit is None,
    # then the callback returns an empty object
    if start_date > end_date or limit is None:
        return {'data': [], 'layout': {}, 'frames': []}

    sub_df = __create_sub_df_based_on_parameters(
        # get data from cache
        cache.get('download:df_download'), start_date, end_date, limit
    )

    # group the previous dataframe to get the number of downloaded scenes by date only
    sub_df = sub_df.groupby(['date'])['number'].sum().to_frame('number').reset_index()
    sub_df = sub_df.sort_values(['date'], ascending=True)

    return __get_figure_of_number_of_downloaded_scenes_time_series(
        sub_df,
        title='Time Series: Number of Download Scenes by Date'
    )
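
Note: several of these callbacks use the private helper `__convert_dates_from_str_to_date`, which is not shown on this page. A minimal sketch of what it presumably does, assuming the DatePickerRange delivers ISO-formatted 'YYYY-MM-DD' strings (the body below is an assumption, not the project's actual code):

from datetime import datetime


def __convert_dates_from_str_to_date(start_date, end_date, date_format='%Y-%m-%d'):
    # hypothetical sketch: strip a possible time suffix and parse the
    # 'YYYY-MM-DD' strings into `date` objects so they can be compared directly
    start_date = datetime.strptime(start_date.split('T')[0], date_format).date()
    end_date = datetime.strptime(end_date.split('T')[0], date_format).date()

    return start_date, end_date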
Code Example #2
def download__update_information_table(signal, start_date, end_date):
    if signal is None:
        raise PreventUpdate

    logging.info(f'download__update_information_table - start_date: {start_date}; '
                 f'end_date: {end_date}')

    start_date, end_date = __convert_dates_from_str_to_date(start_date, end_date)

    # if start date is greater than end date,
    # then the callback returns an empty object
    if start_date > end_date:
        return []

    sub_df = __create_sub_df_based_on_parameters(
        # get data from cache
        cache.get('download:df_base'), start_date, end_date
    )

    # update `Number of downloaded scenes (by range)` data
    df_information.loc[2, 'value'] = len(sub_df.index)

    # update `Number of downloaded assets (by range)` data
    df_information.loc[3, 'value'] = sub_df['nofbi'].sum()

    # return the new information table
    return df_information.to_dict('records')
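
The filtering helper `__create_sub_df_based_on_parameters` is not listed here either. Judging by how it is called (with and without `limit`), it probably slices the cached dataframe by the selected date range and optionally keeps only the first `limit` rows; the sketch below is a guess under those assumptions:

def __create_sub_df_based_on_parameters(df, start_date, end_date, limit=None):
    # hypothetical sketch: keep only the rows whose `date` falls inside the selected range
    sub_df = df[(df['date'] >= start_date) & (df['date'] <= end_date)]

    # when a limit is given, keep just the first `limit` rows
    # (the cached df is expected to be sorted by `number` in descending order)
    if limit is not None:
        sub_df = sub_df.head(int(limit))

    return sub_df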
Code Example #3
def scene__update_information_table(signal, start_date, end_date):
    if signal is None:
        raise PreventUpdate

    logging.info(
        f'scene__update_information_table - start_date: {start_date};  end_date: {end_date}'
    )

    start_date, end_date = __convert_dates_from_str_to_date(
        start_date, end_date)

    # if start date is greater than end date, then the callback returns an empty object
    if start_date > end_date:
        return []

    sub_df = __create_sub_df_based_on_parameters(
        # get data from cache
        cache.get('scene:df_base'),
        start_date,
        end_date)

    # update `Number of items (by range)` data
    df_information.loc[3, 'value'] = len(sub_df.index)

    # update `Number of distinct scenes (by range)` data
    df_information.loc[4, 'value'] = len(sub_df.item_id.unique())

    # return the new information table
    return df_information.to_dict('records')
Code Example #4
    def execute(self, query, is_transaction=False):
        logging.info('MySQLConnection.execute()')

        try:
            logging.info('MySQLConnection.execute() - query: %s\n', query)

            self.try_to_connect()

            if not is_transaction:
                return read_sql(query, con=self.engine)

            # if it is a transaction
            with self.engine.begin() as connection:  # runs a transaction
                connection.execute(query)

        except SQLAlchemyError as error:
            # self.rollback()
            error_message = 'An error occurred during query execution'

            logging.error('MySQLConnection.execute() - error.code: %s', error.code)
            logging.error('MySQLConnection.execute() - error.args: %s', error.args)
            logging.error('MySQLConnection.execute() - %s: %s\n', error_message, error)

            error_message += ': ' + str(error.args)

            raise Exception(error_message)

        # finally is always executed (after both try and except)
        finally:
            self.close()
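
For reference, a small usage sketch of `execute()`: a plain query returns a pandas dataframe through `read_sql`, while `is_transaction=True` runs the statement inside `engine.begin()` and returns nothing. The SELECT below is illustrative only; the DELETE is the one used in Code Example #16:

db_mysql = MySQLConnection()  # constructor arguments (host, user, ...) omitted

# SELECT: the result comes back as a pandas DataFrame
df_download = db_mysql.execute('SELECT * FROM Download LIMIT 10;')

# DELETE inside a transaction: nothing is returned
db_mysql.execute(
    'DELETE FROM Download WHERE CHAR_LENGTH(sceneId) < 10;',
    is_transaction=True
)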
Code Example #5
def scene__update_map_by_parameters(signal, start_date, end_date):
    if signal is None:
        raise PreventUpdate

    logging.info(
        f'scene__update_map_by_parameters - start_date: {start_date}; end_date: {end_date}'
    )

    start_date, end_date = __convert_dates_from_str_to_date(
        start_date, end_date)

    # if start date is greater than end date, then the callback returns an empty object
    if start_date > end_date:
        return dicts_to_geojson([])

    # convert the dates from datetime to str again in order to pass the xaxis range to build the figure
    xaxis_range = [
        start_date.strftime('%Y-%m-%d'),
        end_date.strftime('%Y-%m-%d')
    ]

    # get data from cache
    df_sd_ds_ym_long_lat = cache.get('scene:df_sd_ds_ym_long_lat')

    # when the user selects an invalid range, the df is empty,
    # so the callback returns an empty object to avoid an error message
    if len(df_sd_ds_ym_long_lat.index) == 0:
        return dicts_to_geojson([])

    # get a sub set from the df according to the selected date range
    sub_df = df_sd_ds_ym_long_lat[__get_logical_date_range(
        df_sd_ds_ym_long_lat, xaxis_range)]

    # build the geojson object with a list of markers
    return __get_geojson_data(sub_df)
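
`__get_geojson_data` is another helper whose definition is not included here. Assuming it builds marker dicts from the long/lat columns and feeds them to `dicts_to_geojson` from `dash_leaflet.express` (the same function used for the empty case above), a minimal sketch could look like this; the marker fields are assumptions:

from dash_leaflet.express import dicts_to_geojson


def __get_geojson_data(sub_df):
    # hypothetical sketch: one marker dict per row, with the lat/lon keys
    # expected by `dicts_to_geojson` plus a tooltip text
    markers = [
        {
            'lat': row['latitude'],
            'lon': row['longitude'],
            'tooltip': f"{row['collection']} - {row['year_month']}"
        }
        for _, row in sub_df.iterrows()
    ]

    return dicts_to_geojson(markers)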
Code Example #6
def scene__update_graph_x_number_of_scenes_based_on_date_picker_range(
        signal, start_date, end_date):
    if signal is None:
        raise PreventUpdate

    logging.info(
        f'update_graph_number_of_scenes - start_date: {start_date}; end_date: {end_date}'
    )

    start_date, end_date = __convert_dates_from_str_to_date(
        start_date, end_date)

    # if start date is greater than end date, then the callback returns an empty object
    if start_date > end_date:
        return {'data': [], 'layout': {}, 'frames': []}

    # convert the dates from datetime to str again in order to pass the xaxis range to build the figure
    xaxis_range = [
        start_date.strftime('%Y-%m-%d'),
        end_date.strftime('%Y-%m-%d')
    ]

    return get_figure_of_graph_bar_plot_number_of_scenes(
        # get data from cache
        cache.get('scene:df_sd_dataset_year_month'),
        xaxis_range=xaxis_range,
        title='Number of Scenes by Dataset and Year-Month')
Code Example #7
def get_figure_of_graph_bar_plot_number_of_scenes(df, xaxis_range=None, title=None):

    figure_height = 800
    df_copy = df.copy()

    logging.info(f'get_figure_of_graph_bar_plot_number_of_scenes - df_copy.head(): \n{df_copy.head()}\n')
    logging.info(f'get_figure_of_graph_bar_plot_number_of_scenes - xaxis_range: {xaxis_range}\n')

    logical_date_range = __get_logical_date_range(df_copy, xaxis_range)

    # I'm going to build the `data` parameter of `Figure`
    data = []

    # I would like to build each `bar` based on each dataset
    for dataset in df_copy['collection'].unique():
        sub_df = df_copy[(df_copy['collection'] == dataset) & logical_date_range]

        hovertext = 'Number of Scenes: ' + sub_df['number'].map(str) + '<br>' + \
                    'Period: ' + sub_df['year_month'].map(str) + '<br>' + \
                    'Dataset: ' + sub_df['collection'].map(str)

        data.append(Bar({
            'x': sub_df['year_month'],
            'y': sub_df['number'],
            'name': dataset,
            'text': sub_df['number'],  # text inside the bar
            'textposition': 'auto',
            'hovertext': hovertext,
        }))

    fig = Figure({
        'data': data,
        'layout': {
            'title': title,
            'xaxis': {'title': 'Period'},
            'yaxis': {'title': 'Number of scenes'},
            'plot_bgcolor': colors['background'],
            'paper_bgcolor': colors['background'],
            'font': {
                'color': colors['text']
            }
        }
    })

    fig.update_layout(
        barmode='group',
        height=figure_height,
        xaxis_tickangle=-45
    )

    return fig
Code Example #8
    def __configure_df_user(self):
        logging.info('**************************************************')
        logging.info('*              __configure_df_user               *')
        logging.info('**************************************************')

        # rename columns
        self.df_user.rename(columns={
            'userId': 'username',
            'addressId': 'address_id',
            'fullname': 'name',
            'registerDate': 'created_at',
            'areaCode': 'ddd',
            'company': 'company_name',
            'companyType': 'company_type',
            'activity': 'company_activity'
        }, inplace=True)

        # delete unnecessary columns
        del self.df_user['CNPJ_CPF']
        del self.df_user['compCNPJ']
        del self.df_user['fax']
        del self.df_user['userType']
        del self.df_user['userStatus']
        del self.df_user['unblockDate']
        del self.df_user['siape']

        # fix cases and replace spaces in usernames with underscores
        self.df_user['username'] = self.df_user['username'].str.lower().str.replace(' ', '_', regex=False)
        self.df_user['name'] = self.df_user['name'].str.title()

        # remove single quotes before saving to the postgres db
        self.df_user.replace("'", "", regex=True, inplace=True)
        # remove unnecessary chars
        self.df_user.replace("%", "", regex=True, inplace=True)
        self.df_user.replace(r"\\", "", regex=True, inplace=True)

        self.__configure_df_user__fix_columns_types()

        # if there is an invalid `created_at` cell, then I copy the value from the row above
        for i in range(1, len(self.df_user.index)):
            if self.df_user.loc[i, 'created_at'] == '0000-00-00 00:00:00' or \
                    self.df_user.loc[i, 'created_at'] == '':
                self.df_user.loc[i, 'created_at'] = self.df_user.loc[i - 1, 'created_at']

        # if username is empty, then fill it with email data
        self.df_user['username'] = self.df_user.apply(
            lambda row: row['username']
            if row['username'] != '' else row['email'],
            axis=1)

        self.__fix_duplicated_user()

        # generate INSERT clause for each row
        # self.df_user['insert'] = self.df_user.apply(generate_user_insert_clause, axis=1)

        logging.info(f'df_user: \n{self.df_user.head()}\n')
Code Example #9
    def __main__get_dfs_configure_dfs_and_save_dfs(self,
                                                   is_to_get_dfs_from_db=True):
        logging.info('**************************************************')
        logging.info('*                 main - settings                *')
        logging.info('**************************************************')

        if is_to_get_dfs_from_db:
            # delete and recreate `assets` folder
            delete_and_recreate_folder(DATA_PATH)
            logging.info(
                f'`{DATA_PATH}` folder has been recreated successfully!\n')

            # remove invalid rows from database, if they exist
            self.__remove_invalid_rows_from_database()

            # get dataframes from database and save them in CSV files
            self.__get_dfs_from_mysqldb()
            self.__save_dfs()

        # get the saved dataframes
        self.__get_dfs_from_csv_files()

        # configure dataframes
        self.__configure_df_address()
        self.__configure_df_user()
        self.__configure_df_user_address()
        self.__configure_df_location()
        self.__configure_df_download()

        # save a new version of the dataframes after modifications
        self.__save_dfs(address_file_name='address_configured.csv',
                        user_file_name='user_configured.csv',
                        user_address_file_name='user_address_configured.csv',
                        location_file_name='location_configured.csv',
                        download_file_name='download_configured.csv')
Code Example #10
def scene__date_picker_range__event__button_submit(n_clicks, start_date,
                                                   end_date):
    """This event is called after user submit button in order to update the caching data."""

    logging.info(
        f'scene__date_picker_range__event__button_submit - n_clicks: {n_clicks}'
    )
    logging.info(
        f'scene__date_picker_range__event__button_submit - start_date: {start_date}; end_date: {end_date}'
    )

    _start_date, _end_date = __convert_dates_from_str_to_date(
        start_date, end_date)

    # if start date is greater than end date, then it prevents the update
    if _start_date > _end_date:
        raise PreventUpdate

    # create df_base and save it in the cache
    df_base = create_df_base(start_date, end_date)

    logging.info(
        'scene__date_picker_range__event__button_submit - df_base has been created!'
    )

    # create auxiliary dfs and save them in the cache
    create_df_sd_dataset_year_month(df_base)
    create_df_sd_ds_ym_long_lat(df_base)

    logging.info(
        'scene__date_picker_range__event__button_submit - auxiliary dfs have been created!'
    )

    return n_clicks
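
These functions are registered as Dash callbacks, but the decorators are omitted from the snippets. A sketch of how the submit callback above and the chart callback from Code Example #6 might be wired together; the component ids and the `app` object are hypothetical, only the signal pattern is the point:

from dash.dependencies import Input, Output, State

# the submit callback writes `n_clicks` into a hidden "signal" component...
@app.callback(
    Output('scene--signal', 'children'),
    [Input('scene--button--submit', 'n_clicks')],
    [State('scene--date-picker-range', 'start_date'),
     State('scene--date-picker-range', 'end_date')])
def scene__date_picker_range__event__button_submit(n_clicks, start_date, end_date):
    ...

# ...and the other callbacks listen to that component as their first Input
@app.callback(
    Output('scene--graph--number-of-scenes', 'figure'),
    [Input('scene--signal', 'children'),
     Input('scene--date-picker-range', 'start_date'),
     Input('scene--date-picker-range', 'end_date')])
def scene__update_graph_x_number_of_scenes_based_on_date_picker_range(signal, start_date, end_date):
    ...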
Code Example #11
def create_df_sd_ds_ym_long_lat(df_base):
    # I group my df by 'collection', 'year_month', 'longitude' and 'latitude' to build the map
    df_sd_ds_ym_long_lat = filter_df_by(
        df_base,
        group_by=['collection', 'year_month', 'longitude', 'latitude'],
        sort_by=['year_month', 'collection', 'longitude', 'latitude']
    )

    logging.info(f'create_df_sd_ds_ym_long_lat - df_sd_ds_ym_long_lat.head(): \n{df_sd_ds_ym_long_lat.head()}\n')

    # save the variable inside the cache
    cache.set('scene:df_sd_ds_ym_long_lat', df_sd_ds_ym_long_lat)

    return df_sd_ds_ym_long_lat
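
`filter_df_by` appears in several snippets (Code Examples #11, #13 and #15) but its definition is not included. From its call sites it seems to group the dataframe, put the count into a `number` column and sort the result; the reconstruction below is an assumption, not the project's implementation:

def filter_df_by(df, group_by=None, sort_by=None, count=None, ascending=True):
    # hypothetical sketch of the grouping helper used above
    if count is not None:
        # count the occurrences of `count` (e.g. 'item_id') per group
        grouped = df.groupby(group_by)[count].count()
    else:
        # otherwise just count the rows per group
        grouped = df.groupby(group_by).size()

    df = grouped.to_frame('number').reset_index()

    return df.sort_values(sort_by, ascending=ascending)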
Code Example #12
def __get_logical_date_range(df, xaxis_range=None):
    # if there are values, then get a boolean df according to the selected date range
    if xaxis_range:
        # [:-3] - extract the string without the last 3 chars, in other words, I get just the year and month
        start_date = xaxis_range[0][:-3]
        end_date = xaxis_range[1][:-3]

        logging.info(f'__get_logical_date_range - start_date: {start_date}')
        logging.info(f'__get_logical_date_range - end_date: {end_date}\n')

        # extract a boolean df from the original one by the selected date range
        return ((df['year_month'] >= start_date) & (df['year_month'] <= end_date))
    else:
        raise CatalogDashException('Invalid `xaxis_range`, it is empty!')
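
A tiny, self-contained illustration of the boolean mask this helper returns when called from the same module; the toy dataframe is made up purely for the example:

from pandas import DataFrame

df = DataFrame({
    'year_month': ['2020-01', '2020-02', '2020-03'],
    'number': [10, 20, 30]
})

# '2020-01-15'[:-3] == '2020-01', so the mask keeps January and February only
mask = __get_logical_date_range(df, ['2020-01-15', '2020-02-20'])
print(df[mask])
#   year_month  number
# 0    2020-01      10
# 1    2020-02      20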
Code Example #13
def create_df_sd_dataset_year_month(df_base):
    # I group my df by 'collection' and 'year_month' to build the table
    df_sd_dataset_year_month = filter_df_by(
        df_base,
        group_by=['collection', 'year_month'],
        sort_by=['year_month', 'collection'],
        ascending=False
    )

    logging.info(f'create_df_sd_dataset_year_month - df_sd_dataset_year_month.head(): \n{df_sd_dataset_year_month.head()}\n')

    # save the variable inside the cache
    cache.set('scene:df_sd_dataset_year_month', df_sd_dataset_year_month)

    return df_sd_dataset_year_month
Code Example #14
        def wrapper(*args, **kwargs):

            attempts = 1

            while attempts <= max_retries:
                logging.info(f'max retries to connect: `{max_retries}`.')
                logging.info(f'wait `{wait}` secs.')

                try:
                    return function(*args, **kwargs)
                except SQLAlchemyError:
                    logging.error(
                        f'try to reconnect... attempts: `{attempts}`')
                    attempts += 1
                    sleep(wait)

            logging.error('Max retries exceeded.')
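
The snippet above shows only the inner `wrapper`; the enclosing decorator factory is missing. A plausible reconstruction, assuming a factory that receives `max_retries` and `wait` (the outer names are guesses):

from functools import wraps
from time import sleep
import logging

from sqlalchemy.exc import SQLAlchemyError


def retry_on_sqlalchemy_error(max_retries=3, wait=5):
    # hypothetical reconstruction of the decorator factory around `wrapper`
    def decorator(function):
        @wraps(function)
        def wrapper(*args, **kwargs):
            attempts = 1

            while attempts <= max_retries:
                logging.info(f'max retries to connect: `{max_retries}`.')
                logging.info(f'wait `{wait}` secs.')

                try:
                    return function(*args, **kwargs)
                except SQLAlchemyError:
                    logging.error(f'try to reconnect... attempts: `{attempts}`')
                    attempts += 1
                    sleep(wait)

            logging.error('Max retries exceeded.')

        return wrapper
    return decorator


# usage sketch:
# @retry_on_sqlalchemy_error(max_retries=5, wait=10)
# def try_to_connect(self):
#     ...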
Code Example #15
File: layout.py    Project: inpe-cdsr/catalog-dash
def create_df_download(df_base):
    # df_download - number of downloaded scenes by user, date and long/lat
    # this df contains all columns I need to build the tables and charts
    df_download = filter_df_by(
        df_base,
        count='item_id',
        group_by=['collection', 'satellite_sensor', 'email', 'name', 'date', 'longitude', 'latitude'],
        sort_by=['number'],
        ascending=False
    )

    logging.info(f'create_df_download - df_download.head(): \n{df_download.head()}\n')

    # save the variable inside the cache
    cache.set('download:df_download', df_download)

    return df_download
Code Example #16
    def __remove_invalid_rows_from_database(self):
        logging.info('**************************************************')
        logging.info('*      __remove_invalid_rows_from_database       *')
        logging.info('**************************************************')

        # remove invalid rows from Download table, if they exist
        self.db_mysql.execute(
            'DELETE FROM Download WHERE CHAR_LENGTH(sceneId) < 10;',
            is_transaction=True)

        logging.info('Invalid rows have been removed.\n')
Code Example #17
    def __recreate_tables_in_the_database(self):
        """Recreate the tables in the PostgreSQL database"""

        logging.info('**************************************************')
        logging.info('*       __recreate_tables_in_the_database        *')
        logging.info('**************************************************')

        self.db_postgres.init_db()

        logging.info(
            'All tables have been recreated in the database successfully!\n')
Code Example #18
def download__update_tables_by_parameters(signal, start_date, end_date, limit):
    if signal is None:
        raise PreventUpdate

    logging.info(f'download__update_tables_by_parameters - start_date: {start_date}; '
                 f'end_date: {end_date}; limit: {limit}')

    start_date, end_date = __convert_dates_from_str_to_date(start_date, end_date)

    # if start date is greater than end date or limit is None,
    # then it prevents to update the tables
    if start_date > end_date or limit is None:
        raise PreventUpdate

    # filter base dataframe based on start date, end date and limit
    sub_df = __create_sub_df_based_on_parameters(
        # get data from cache
        cache.get('download:df_download'), start_date, end_date, limit
    )

    # group the sub dataframe to get the number of downloaded items by collection
    sub_df_nodib_collection = sub_df.groupby(['collection'])['number'] \
                                        .sum().to_frame('number').reset_index() \
                                        .sort_values(['number'], ascending=False)

    # group the sub dataframe to get the number of downloaded items by satellite and sensor
    sub_df_nodib_satellite_sensor = sub_df.groupby(['satellite_sensor'])['number'] \
                                            .sum().to_frame('number').reset_index() \
                                            .sort_values(['number'], ascending=False)

    # group the sub dataframe to get the number of downloaded items by user and date
    sub_df_nodib_user_date = sub_df.groupby(['email', 'name', 'date'])['number'] \
                                        .sum().to_frame('number').reset_index() \
                                        .sort_values(['number'], ascending=False)

    # group the previous dataframe to get the number of downloaded items by date only
    sub_df_nodib_date = sub_df_nodib_user_date.groupby(['date'])['number'] \
                                                .sum().to_frame('number').reset_index() \
                                                .sort_values(['number'], ascending=False)

    return sub_df_nodib_collection.to_dict('records'), \
            sub_df_nodib_satellite_sensor.to_dict('records'), \
            sub_df_nodib_date.to_dict('records'), \
            sub_df_nodib_user_date.to_dict('records')
Code Example #19
File: layout.py    Project: inpe-cdsr/catalog-dash
def get_data_from_db(start_date=None, end_date=None):
    """Gets data from database."""

    # postgres connection
    db = PostgreSQLRegister(database=PGDB_REGISTER)

    # df_base - `number of downloaded files by items` dataframe
    df_base = db.select_from_download_nofbi(start_date, end_date)

    logging.info(f'get_data_from_db - df_base size: {len(df_base.index)}')
    logging.info(
        'get_data_from_db - df_base.head(): \n'
        f"{df_base[['nofbi', 'item_id', 'username', 'date', 'longitude', 'latitude']].head()}\n"
    )

    # save the variable inside the cache
    cache.set('download:df_base', df_base)

    return df_base
Code Example #20
    def __fix_sequences_in_the_database(self):
        """Fix the tables sequences in the PostgreSQL database."""

        logging.info('**************************************************')
        logging.info('*        __fix_sequences_in_the_database         *')
        logging.info('**************************************************')

        self.db_postgres.fix_sequences()

        logging.info(
            'All sequences have been recreated in the database successfully!\n'
        )
Code Example #21
def download__update_map_by_parameters(signal, start_date, end_date, limit):
    if signal is None:
        raise PreventUpdate

    logging.info(f'download__update_map_by_parameters - start_date: {start_date}; '
                 f'end_date: {end_date}; limit: {limit}')

    start_date, end_date = __convert_dates_from_str_to_date(start_date, end_date)

    # if start date is greater than end date or limit is None,
    # then the callback returns an empty object
    if start_date > end_date or limit is None:
        return dicts_to_geojson([])

    sub_df = __create_sub_df_based_on_parameters(
        # get data from cache
        cache.get('download:df_download'), start_date, end_date, limit
    )

    # build the geojson object with a list of markers
    return __get_geojson_data(sub_df)
Code Example #22
    def __main__clear_and_insert_values_in_the_database(self):
        logging.info('**************************************************')
        logging.info('*__main__clear_and_insert_values_in_the_database *')
        logging.info('**************************************************')

        # initialize database before inserting records
        self.__recreate_tables_in_the_database()

        # insert rows
        self.__insert_df_into_database(self.df_location, df_name='df_location')
        # sort the df by address_id for efficient deletes afterwards
        df_sorted_by_address_id = self.df_user_address.sort_values(
            'address_id')
        self.__insert_df_into_database(df_sorted_by_address_id,
                                       df_name='df_user_address',
                                       insert_column='insert_address')
        self.__insert_df_into_database(self.df_user_address,
                                       df_name='df_user_address',
                                       insert_column='insert_user')
        self.__insert_df_into_database(self.df_download,
                                       df_name='df_download',
                                       chunks=40000)

        # fix sequences after inserting records
        self.__fix_sequences_in_the_database()
Code Example #23
    def __get_dfs_from_mysqldb(self):
        logging.info('**************************************************')
        logging.info('*            __get_dfs_from_mysqldb              *')
        logging.info('**************************************************')

        # get the dfs from database
        self.df_download = self.db_mysql.select_from_download()
        self.df_user = self.db_mysql.select_from_user()
        self.df_address = self.db_mysql.select_from_address()
        self.df_location = self.db_mysql.select_from_location()

        # create a placeholder df just to avoid errors
        self.df_user_address = DataFrame({'test': [1]})

        logging.info('Dataframes have been loaded successfully.\n')
Code Example #24
    def __configure_df_location(self):
        logging.info('**************************************************')
        logging.info('*            __configure_df_location             *')
        logging.info('**************************************************')

        # rename columns
        self.df_location.rename(columns={'timestamp': 'created_at'},
                                inplace=True)

        # escape single quotes before saving to the postgres db
        self.df_location.replace("'", "''", regex=True, inplace=True)

        self.__configure_df_location__fix_columns_types()

        # generate INSERT clause for each row
        self.df_location['insert'] = self.df_location.apply(
            generate_location_insert_clause, axis=1)

        logging.info(f'df_location: \n{self.df_location.head()}\n')
Code Example #25
    def __main_read_dataframes_from_csv_files(self):
        logging.info('**************************************************')
        logging.info('*      __main_read_dataframes_from_csv_files     *')
        logging.info('**************************************************')

        # read CSV files
        self.__get_dfs_from_csv_files(
            address_file_name='address_configured.csv',
            user_file_name='user_configured.csv',
            user_address_file_name='user_address_configured.csv',
            location_file_name='location_configured.csv',
            download_file_name='download_configured.csv')

        # configure dataframes
        self.__configure_df_address__fix_columns_types()
        self.__configure_df_user__fix_columns_types()
        self.__configure_df_user_address__fix_columns_types()
        self.__configure_df_location__fix_columns_types()
        self.__configure_df_download__fix_columns_types()

        logging.info('Dataframes have been initialized successfully.\n')
Code Example #26
    def __configure_df_user_address(self):
        logging.info('**************************************************')
        logging.info('*          __configure_df_user_address           *')
        logging.info('**************************************************')

        # merge dataframes
        self.df_user_address = merge(self.df_user,
                                     self.df_address,
                                     how='left',
                                     on='address_id')

        self.__fix_duplicated_address_id()

        # generate INSERT clause for each row
        self.df_user_address['insert_address'] = self.df_user_address.apply(
            generate_address_insert_clause, axis=1)
        self.df_user_address['insert_user'] = self.df_user_address.apply(
            generate_user_insert_clause, axis=1)

        logging.info(
            f'self.df_user_address: \n{self.df_user_address.head()}\n')
Code Example #27
def download__date_picker_range__event__button_submit(n_clicks, start_date, end_date):
    """This event is called after user submit button in order to update the caching data."""

    logging.info(f'download__date_picker_range__event__button_submit - n_clicks: {n_clicks}')
    logging.info(f'download__date_picker_range__event__button_submit - start_date: {start_date}; end_date: {end_date}')

    _start_date, _end_date = __convert_dates_from_str_to_date(start_date, end_date)

    # if start date is greater than end date, then it prevents the update
    if _start_date > _end_date:
        raise PreventUpdate

    # get df_base from the database and save it in the cache
    df_base = get_data_from_db(start_date, end_date)
    logging.info('download__date_picker_range__event__button_submit - df_base has been created!')

    # create the auxiliary df_download and save it in the cache
    create_df_download(df_base)
    logging.info('download__date_picker_range__event__button_submit - df_download has been created!')

    return n_clicks
Code Example #28
def create_df_base(start_date=None, end_date=None):
    logging.info(f'create_df_base - start_date: {start_date}; end_date: {end_date}')

    df_base = copy_and_organize_df(get_data_from_db(start_date=start_date, end_date=end_date))

    logging.info(f'create_df_base - df_base size: {len(df_base.index)}')
    logging.info(f'create_df_base - df_base.head(): \n{df_base.head()}\n')
    # memory_usage = df_base.memory_usage(index=True).sum()
    # logging.info(f'create_df_base - df_base (df) memory_usage: {bytesto(memory_usage, to="m")} MB\n')

    # save the variable inside the cache
    cache.set('scene:df_base', df_base)

    return df_base
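
All of these snippets exchange dataframes through a module-level `cache`. In a Dash app this is typically a `flask_caching.Cache` bound to the underlying Flask server; a minimal configuration sketch, with the cache directory and timeout as assumptions:

from flask_caching import Cache

# hypothetical configuration: a filesystem cache shared by all callbacks,
# bound to the Flask server behind the Dash `app`
cache = Cache(app.server, config={
    'CACHE_TYPE': 'filesystem',
    'CACHE_DIR': '/tmp/catalog-dash-cache',
    'CACHE_DEFAULT_TIMEOUT': 60 * 60 * 24  # keep the cached dataframes for one day
})

# the callbacks then read and write the dataframes with simple get/set calls:
# cache.set('scene:df_base', df_base)
# df_base = cache.get('scene:df_base')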
Code Example #29
    def __save_dfs(self,
                   download_file_name='download.csv',
                   user_file_name='user.csv',
                   address_file_name='address.csv',
                   user_address_file_name='user_address.csv',
                   location_file_name='location.csv'):
        """Save the dataframes in CSV files"""

        logging.info('**************************************************')
        logging.info('*                   __save_dfs                   *')
        logging.info('**************************************************')

        self.df_location.to_csv(DATA_PATH + location_file_name, index=False)
        self.df_user.to_csv(DATA_PATH + user_file_name, index=False)
        self.df_address.to_csv(DATA_PATH + address_file_name, index=False)
        self.df_user_address.to_csv(DATA_PATH + user_address_file_name,
                                    index=False)
        self.df_download.to_csv(DATA_PATH + download_file_name, index=False)

        logging.info(
            f'`{download_file_name}`, `{user_file_name}`, `{address_file_name}`, '
            f'`{user_address_file_name}` and `{location_file_name}` files '
            'have been saved successfully!\n')
Code Example #30
    def __configure_df_address(self):
        logging.info('**************************************************')
        logging.info('*            __configure_df_address              *')
        logging.info('**************************************************')

        # rename columns
        self.df_address.rename(columns={'addressId': 'address_id'},
                               inplace=True)

        # delete unnecessary columns
        del self.df_address['addressType']
        del self.df_address['CNPJ_CPF']
        del self.df_address['compCNPJ']
        del self.df_address['digitCNPJ']
        del self.df_address['delivery']
        del self.df_address['payment']
        del self.df_address['userId']

        # fix cases
        self.df_address['street'] = self.df_address['street'].str.title()
        self.df_address['number'] = self.df_address['number'].str.strip()
        self.df_address['city'] = self.df_address['city'].str.title()
        self.df_address['state'] = self.df_address['state'].str.upper()
        self.df_address['country'] = self.df_address['country'].str.title()

        # remove unnecessary chars
        self.df_address.replace("%", "", regex=True, inplace=True)
        # remove single quotes before saving to the postgres db
        self.df_address.replace("'", "", regex=True, inplace=True)

        self.__configure_df_address__fix_columns_types()

        # generate INSERT clause for each row
        # self.df_address['insert'] = self.df_address.apply(generate_address_insert_clause, axis=1)

        logging.info(f'df_address: \n{self.df_address.head()}\n')