Python PythonClickhouse Exemples, scripts.databases.pythonClickhouse.PythonClickhouse Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : mytab_interface.py Projet : andre-aion/analytics_demo

    def __init__(self, table, cols, dedup_cols, panel_title=None):
        """Initialise the tab's data handles, storage clients and defaults.

        Args:
            table: Stored on the instance and passed to the streaming
                dataframe wrapper (``SD``).
            cols: Stored on the instance and passed to ``SD``.
            dedup_cols: Stored on the instance and passed to ``SD``.
            panel_title: Optional title stored on the instance.
        """
        self.panel_title = panel_title
        self.table = table
        self.cols = cols
        self.locals = dict()  # stuff local to each tab
        self.streaming_dataframe = SD(table, cols, dedup_cols)
        self.df = self.streaming_dataframe.df
        self.df1 = None
        self.dedup_cols = dedup_cols
        self.params = None
        # This attribute used to be set to dict() first and then overwritten
        # with None before anything could read it; only the effective final
        # value is assigned now.
        self.load_params = None
        self.poolname_dict = self.get_poolname_dict()
        self.key_tab = ''  # for key composition in redis
        self.construction_tables = {}
        self.tier1_miners_list = []
        self.tier2_miners_list = []
        self.pq = PythonParquet()
        self.ch = PythonClickhouse('aion')
        self.redis = PythonRedis()
        self.conn = self.redis.conn
        self.DATEFORMAT = "%Y-%m-%d %H:%M:%S"
        self.ToA_THRESH = {  # Tests of association (TOA)
            'STRONG': .65,
            'MODERATE': .4,
            'WEAK': .25
        }
        self.menus = {'resample_periods': ['D', 'W', 'M', 'Q']}
        # Default to the first entry of the resample menu.
        self.resample_period = self.menus['resample_periods'][0]
        self.pvalue_thresh = 0.1

        self.page_width = 1200

Exemple #2

0

Afficher le fichier

        def __init__(self, table, cols, dedup_cols=None):
            """Initialise the tab via ``Mytab.__init__`` and set its defaults.

            Args:
                table: Forwarded to ``Mytab.__init__`` and stored.
                cols: Forwarded to ``Mytab.__init__`` and stored.
                dedup_cols: Forwarded to ``Mytab.__init__``. ``None`` (the
                    default) is replaced by a fresh empty list.
            """
            # A literal ``[]`` default is created once and shared by every
            # call that omits the argument; build a new list per instance.
            if dedup_cols is None:
                dedup_cols = []
            # panel_title is not a parameter; it is looked up in the
            # enclosing scope.
            Mytab.__init__(self,
                           table,
                           cols,
                           dedup_cols,
                           panel_title=panel_title)
            self.table = table
            self.cols = cols
            self.DATEFORMAT = "%Y-%m-%d %H:%M:%S"
            self.df = None
            self.df1 = None
            self.df_predict = None
            self.day_diff = 1  # for normalizing for classification periods of different lengths
            self.df_grouped = ''

            self.cl = PythonClickhouse('aion')
            self.items = cryptos
            # add all the coins to the dict
            self.github_cols = [
                'watch', 'fork', 'issue', 'release', 'push', 'tw_mentions',
                'tw_positive', 'tw_compound', 'tw_neutral', 'tw_negative',
                'tw_emojis_positive', 'tw_emojis_compound',
                'tw_emojis_negative', 'tw_emojis_count', 'tw_reply_hashtags'
            ]
            self.index_cols = ['close', 'high', 'low', 'market_cap', 'volume']

            self.trigger = 0
            # Banner markup reused for both notification divs.
            txt = """<div style="text-align:center;background:black;width:100%;">
                                                                           <h1 style="color:#fff;">
                                                                           {}</h1></div>""".format(
                'Welcome')
            self.notification_div = {
                'top': Div(text=txt, width=1400, height=20),
                'bottom': Div(text=txt, width=1400, height=10),
            }
            # cluster_dct, groupby_dict and features also come from the
            # enclosing scope.
            self.cluster_dct = cluster_dct
            self.groupby_dict = groupby_dict
            self.features = features
            self.crypto = 'all'

            self.div_style = """ style='width:350px; margin-left:25px;
                                    border:1px solid #ddd;border-radius:3px;background:#efefef50;' 
                                    """

            self.header_style = """ style='color:blue;text-align:center;' """

            self.significant_effect_dict = {}
            self.section_headers = {
                'ts':
                self.section_header_div(
                    'Comparison of clusters across variables:---------------------',
                    width=600)
            }
            self.timestamp_col = None
            self.colors = None

Exemple #3

0

Afficher le fichier

Fichier : cryptocurrency.py Projet : andre-aion/analytics_demo

        def __init__(self, table, cols, dedup_cols=None):
            """Initialise the tab via ``Mytab.__init__`` and set its defaults.

            Args:
                table: Forwarded to ``Mytab.__init__`` and stored.
                cols: Forwarded to ``Mytab.__init__`` and stored.
                dedup_cols: Forwarded to ``Mytab.__init__``. ``None`` (the
                    default) is replaced by a fresh empty list.
            """
            # A literal ``[]`` default is created once and shared by every
            # call that omits the argument; build a new list per instance.
            if dedup_cols is None:
                dedup_cols = []
            Mytab.__init__(self, table, cols, dedup_cols)
            self.table = table
            self.cols = cols
            self.DATEFORMAT = "%Y-%m-%d %H:%M:%S"
            self.df = None
            self.df1 = None
            self.df_predict = None
            self.day_diff = 1  # for normalizing for classification periods of different lengths
            self.df_grouped = ''

            self.cl = PythonClickhouse('aion')
            # add all the coins to the dict
            self.github_cols = ['watch', 'fork', 'issue', 'release', 'push']
            self.index_cols = ['close', 'high', 'low', 'market_cap', 'volume']

            self.trigger = 0

            # groupby_dict is looked up in the enclosing scope.
            self.groupby_dict = groupby_dict
            self.feature_list = list(self.groupby_dict.keys())
            self.kmean_model = {}

            self.div_style = """ style='width:350px; margin-left:25px;
                            border:1px solid #ddd;border-radius:3px;background:#efefef50;' 
                            """

            self.header_style = """ style='color:blue;text-align:center;' """

            self.k = '1'
            # Menu entries are the strings '1' through '11'.
            self.max_clusters_menu = [str(k) for k in range(1, 12)]

            self.launch_cluster_table = False  # launch cluster
            self.cryptos = None
            # ------- DIVS setup begin
            self.page_width = 1200
            # Banner markup reused for both notification divs.
            txt = """<hr/><div style="text-align:center;width:{}px;height:{}px;
                        position:relative;background:black;margin-bottom:200px">
                        <h1 style="color:#fff;margin-bottom:300px">{}</h1>
                  </div>""".format(self.page_width, 50, 'Welcome')
            self.notification_div = {
                'top': Div(text=txt, width=self.page_width, height=20),
                'bottom': Div(text=txt, width=self.page_width, height=10),
            }

            self.section_divider = '-----------------------------------'
            self.section_headers = {
                'Crypto families':
                self.section_header_div(text='Crypto families:{}'.format(
                    self.section_divider),
                                        width=600,
                                        html_header='h2',
                                        margin_top=5,
                                        margin_bottom=-155),
            }

Exemple #4

0

Afficher le fichier

Fichier : economic_indicators.py Projet : andre-aion/analytics_demo

        def __init__(self, table, cols, dedup_cols=None):
            """Initialise the tab via ``Mytab.__init__`` and set its defaults.

            Args:
                table: Forwarded to ``Mytab.__init__`` and stored.
                cols: Forwarded to ``Mytab.__init__`` and stored.
                dedup_cols: Forwarded to ``Mytab.__init__``. ``None`` (the
                    default) is replaced by a fresh empty list.
            """
            # A literal ``[]`` default is created once and shared by every
            # call that omits the argument; build a new list per instance.
            if dedup_cols is None:
                dedup_cols = []
            Mytab.__init__(self, table, cols, dedup_cols)
            self.table = table
            self.cols = cols
            self.DATEFORMAT = "%Y-%m-%d %H:%M:%S"
            self.df = None
            self.df1 = None
            self.df_predict = None
            self.day_diff = 1  # for normalizing for classification periods of different lengths
            self.df_grouped = ''

            self.cl = PythonClickhouse('aion')

            self.trigger = 0

            self.groupby_dict = {}

            self.div_style = """ style='width:350px; margin-left:25px;
                                    border:1px solid #ddd;border-radius:3px;background:#efefef50;' 
                                    """

            self.header_style = """ style='color:blue;text-align:center;' """
            self.countries = []
            self.country = 'Barbados'

            self.relationships_to_check = ['weak', 'moderate', 'strong']

            self.pym = PythonMongo('aion')
            self.menus = {
                'status': ['all', 'open', 'closed'],
                'gender': ['all', 'male', 'female'],
            }
            self.multiline_vars = {'x': '', 'y': ''}
            self.timestamp_col = 'timestamp'

            # ------- DIVS setup begin
            self.page_width = 1200
            # Banner markup reused for both notification divs.
            txt = """<hr/><div style="text-align:center;width:{}px;height:{}px;
                                                position:relative;background:black;margin-bottom:200px">
                                                <h1 style="color:#fff;margin-bottom:300px">{}</h1>
                                          </div>""".format(
                self.page_width, 50, 'Welcome')
            self.notification_div = {
                'top': Div(text=txt, width=self.page_width, height=20),
                'bottom': Div(text=txt, width=self.page_width, height=10),
            }
            self.section_divider = '-----------------------------------'
            self.section_headers = {
                'info': self.section_header_div(text='Country indexes')
            }

Exemple #5

0

Afficher le fichier

        def __init__(self, table, cols, dedup_cols=None):
            """Initialise the tab via ``Mytab.__init__`` and set its defaults.

            Args:
                table: Forwarded to ``Mytab.__init__`` and stored.
                cols: Forwarded to ``Mytab.__init__`` and stored.
                dedup_cols: Forwarded to ``Mytab.__init__``. ``None`` (the
                    default) is replaced by a fresh empty list.
            """
            # A literal ``[]`` default is created once and shared by every
            # call that omits the argument; build a new list per instance.
            if dedup_cols is None:
                dedup_cols = []
            Mytab.__init__(self, table, cols, dedup_cols)
            self.table = table
            self.cols = cols
            self.DATEFORMAT = "%Y-%m-%d %H:%M:%S"
            self.df = None
            self.df1 = None
            self.day_diff = 1  # for normalizing for classification periods of different lengths
            self.df_grouped = ''

            self.cl = PythonClickhouse('aion')

            self.trigger = 0

            self.groupby_dict = {}

            self.div_style = """ style='width:350px; margin-left:25px;
                                    border:1px solid #ddd;border-radius:3px;background:#efefef50;' 
                                    """

            self.header_style = """ style='color:blue;text-align:center;' """
            self.variable = 'delay_end'

            self.relationships_to_check = ['weak', 'moderate', 'strong']

            self.status = 'all'
            self.gender = 'all'
            self.type = 'all'

            self.pym = PythonMongo('aion')
            self.menus = {
                'status': ['all', 'open', 'closed'],
                'gender': ['all', 'male', 'female'],
            }
            self.multiline_vars = {
                'x': 'manager_gender',
                'y': 'remuneration'
            }
            self.timestamp_col = 'analysis_date'

            self.risks = []
            self.risk = ''
            self.matrices = []
            self.matrix = ''
            self.risk_select = Select(title='Select risk', value=self.risk, options=self.risks)
            self.risk_threshold = {
                'acceptable': 8,
                'doubtful': 15
            }

            # ------- DIVS setup begin
            self.page_width = 1200
            # Banner markup reused for both notification divs.
            txt = """<hr/><div style="text-align:center;width:{}px;height:{}px;
                                                position:relative;background:black;margin-bottom:200px">
                                                <h1 style="color:#fff;margin-bottom:300px">{}</h1>
                                          </div>""".format(self.page_width, 50, 'Welcome')
            self.notification_div = {
                'top': Div(text=txt, width=self.page_width, height=20),
                'bottom': Div(text=txt, width=self.page_width, height=10),
            }
            self.section_divider = '-----------------------------------'
            self.section_headers = {
                # NOTE(review): the 'Customers' header displays 'Events:',
                # identical to the 'Events' entry; looks like a copy-paste
                # slip, left unchanged pending confirmation.
                'Customers':self.section_header_div(text='Events:',
                                               width=1000, html_header='h2', margin_top=50, margin_bottom=-155),
                'Events': self.section_header_div(text='Events:',
                                               width=1000, html_header='h2', margin_top=50, margin_bottom=-155),
                'Patrons': self.section_header_div(text='Patrons:',
                                                        width=600, html_header='h2', margin_top=5, margin_bottom=-155),
                # The text has no '{}' placeholder, so the former
                # .format(self.section_divider) call had no effect and was
                # dropped; the rendered text is unchanged.
                'Employees': self.section_header_div(text='Employees:',
                                                  width=600, html_header='h2', margin_top=5, margin_bottom=-155),

            }

Exemple #6

0

Afficher le fichier

Fichier : rentals.py Projet : andre-aion/analytics_demo

        def __init__(self, table, cols, dedup_cols=None):
            """Initialise the tab via ``Mytab.__init__`` and set its defaults.

            Builds the aggregation map, menus, ``Select`` widgets,
            notification banners and section headers used by this tab.

            Args:
                table: Forwarded to ``Mytab.__init__`` and stored.
                cols: Forwarded to ``Mytab.__init__`` and stored.
                dedup_cols: Forwarded to ``Mytab.__init__``. ``None`` (the
                    default) is replaced by a fresh empty list.
            """
            # A literal ``[]`` default is created once and shared by every
            # call that omits the argument; build a new list per instance.
            if dedup_cols is None:
                dedup_cols = []
            Mytab.__init__(self, table, cols, dedup_cols)
            self.table = table
            self.cols = cols
            self.DATEFORMAT = "%Y-%m-%d %H:%M:%S"
            self.df = None
            self.df1 = None
            self.df_predict = None
            self.day_diff = 1  # for normalizing for classification periods of different lengths
            self.df_grouped = ''

            self.cl = PythonClickhouse('aion')

            self.trigger = 0
            # Column name -> aggregation name.
            self.groupby_dict = {
                'category': 'nunique',
                'item': 'nunique',
                'area': 'nunique',
                'visit_duration': 'mean',
                'age': 'mean',
                'gender_coded': 'mean',
                'status_coded': 'mean',
                'rental_employee_gender_coded': 'mean',
                'rental_employee_age': 'mean',
                'rental_tab': 'sum'
            }

            self.feature_list = ['age', 'rental_employee_age', 'rental_tab']
            self.tsa_variable = 'rental_tab'
            self.forecast_days = 40
            self.lag_variable = 'visit_duration'
            self.lag_days = "1,2,3"
            self.lag = 0
            self.lag_menu = [str(x) for x in range(0, 100)]

            self.strong_thresh = .65
            self.mod_thresh = 0.4
            self.weak_thresh = 0.25
            self.corr_df = None
            self.div_style = """ 
                style='width:350px; margin-left:25px;
                border:1px solid #ddd;border-radius:3px;background:#efefef50;' 
            """

            self.header_style = """ style='color:blue;text-align:center;' """

            # Iterating a dict yields its keys, so no .keys()/list() wrapper
            # is needed before sorting.
            self.variables = sorted(self.groupby_dict)
            self.variable = 'rental_tab'

            self.relationships_to_check = ['weak', 'moderate', 'strong']

            self.pym = PythonMongo('aion')
            self.menus = {
                'item': ['all'],
                'category': ['all'],
                'status': ['all', 'guest', 'member'],
                'gender': ['all', 'male', 'female'],
                'variables':
                list(self.groupby_dict),
                'history_periods':
                ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10'],
                'area': ['all', 'bar', 'rentals'],
                'tsa': ['rental_tab', 'visit_duration']
            }
            self.select = {}
            self.select['area'] = Select(title='Select BCC area',
                                         value='all',
                                         options=self.menus['area'])

            self.select['item'] = Select(title='Select item',
                                         value='all',
                                         options=self.menus['item'])

            self.select['status'] = Select(title='Select visitor status',
                                           value='all',
                                           options=self.menus['status'])

            self.select['gender'] = Select(title="Select visitor gender",
                                           value='all',
                                           options=self.menus['gender'])

            self.select['category'] = Select(title="Select category",
                                             value='all',
                                             options=self.menus['category'])

            # NOTE(review): this widget reuses the category title and
            # options; possibly a copy-paste slip, left unchanged pending
            # confirmation.
            self.select['rental_employee_gender'] = Select(
                title="Select category",
                value='all',
                options=self.menus['category'])

            # Every select widget starts with the value 'all'.
            self.select_values = dict.fromkeys(self.select, 'all')

            self.multiline_vars = {'x': 'gender', 'y': 'rental_tab'}
            self.timestamp_col = 'visit_start'
            # ------- DIVS setup begin
            self.page_width = 1250
            # Banner markup reused for both notification divs.
            txt = """<hr/>
                    <div style="text-align:center;width:{}px;height:{}px;
                           position:relative;background:black;margin-bottom:200px">
                           <h1 style="color:#fff;margin-bottom:300px">{}</h1>
                    </div>""".format(self.page_width, 50, 'Welcome')
            self.notification_div = {
                'top': Div(text=txt, width=self.page_width, height=20),
                'bottom': Div(text=txt, width=self.page_width, height=10),
            }
            lag_section_head_txt = 'Lag relationships between {} and...'.format(
                self.variable)

            self.section_divider = '-----------------------------------'
            self.section_headers = {
                'lag':
                self.section_header_div(text=lag_section_head_txt,
                                        width=600,
                                        html_header='h2',
                                        margin_top=5,
                                        margin_bottom=-155),
                'distribution':
                self.section_header_div(text='Pre-transform distribution:',
                                        width=600,
                                        html_header='h2',
                                        margin_top=5,
                                        margin_bottom=-155),
                'relationships':
                self.section_header_div(
                    text='Relationships between variables:{}'.format(
                        self.section_divider),
                    width=600,
                    html_header='h2',
                    margin_top=5,
                    margin_bottom=-155),
                'correlations':
                self.section_header_div(text='Correlations:',
                                        width=600,
                                        html_header='h3',
                                        margin_top=5,
                                        margin_bottom=-155),
                'forecast':
                self.section_header_div(text='Forecasts:{}'.format(
                    self.section_divider),
                                        width=600,
                                        html_header='h2',
                                        margin_top=5,
                                        margin_bottom=-155),
            }

Exemple #7

0

Afficher le fichier

Fichier : risk_assessment.py Projet : andre-aion/analytics_demo

        def __init__(self, table, cols, dedup_cols=None):
            """Initialise the tab via ``Mytab.__init__`` and set its defaults.

            Sets up the severity/likelihood rating tables, menus, the risk
            ``Select`` widget, notification banners and section headers.

            Args:
                table: Forwarded to ``Mytab.__init__`` and stored.
                cols: Forwarded to ``Mytab.__init__`` and stored.
                dedup_cols: Forwarded to ``Mytab.__init__``. ``None`` (the
                    default) is replaced by a fresh empty list.
            """
            # A literal ``[]`` default is created once and shared by every
            # call that omits the argument; build a new list per instance.
            if dedup_cols is None:
                dedup_cols = []
            Mytab.__init__(self, table, cols, dedup_cols)
            self.table = table
            self.cols = cols
            self.DATEFORMAT = "%Y-%m-%d %H:%M:%S"
            self.df = None
            self.df1 = None
            self.df_predict = None
            self.day_diff = 1  # for normalizing for classification periods of different lengths
            self.df_grouped = ''

            self.cl = PythonClickhouse('aion')

            self.trigger = 0

            self.groupby_dict = {}

            self.div_style = """ style='width:350px; margin-left:25px;
                                    border:1px solid #ddd;border-radius:3px;background:#efefef50;' 
                                    """

            self.header_style = """ style='color:blue;text-align:center;' """
            self.variable = 'delay_end'

            self.relationships_to_check = ['weak', 'moderate', 'strong']

            self.status = 'all'
            self.gender = 'all'
            self.type = 'all'
            # Rating label -> numeric score, per rating dimension.
            self.ratings = {
                'severity': {
                    'Insignificant': 1,
                    'Minor': 2,
                    'Moderate': 3,
                    'Critical': 4,
                    'Catastrophic': 5
                },
                'likelihood': {
                    'Unlikely': 1,
                    'Seldom': 2,
                    'Occaisional': 3,
                    'Likely': 4,
                    'Definite': 5
                }
            }

            # Label lists derived from the rating tables above.
            self.variables = {
                'severity': list(self.ratings['severity'].keys()),
                'likelihood': list(self.ratings['likelihood'].keys()),
            }
            self.pym = PythonMongo('aion')
            self.menus = {
                'status': ['all', 'open', 'closed'],
                'gender': ['all', 'male', 'female'],
            }
            self.multiline_vars = {'x': 'manager_gender', 'y': 'remuneration'}
            self.timestamp_col = 'analysis_date'

            self.risks = []
            self.risk = ''
            self.matrices = []
            self.matrix = ''
            self.risk_select = Select(title='Select risk',
                                      value=self.risk,
                                      options=self.risks)
            self.risk_threshold = {'acceptable': 8, 'doubtful': 15}

            # ------- DIVS setup begin
            self.page_width = 1200
            # Banner markup reused for both notification divs.
            txt = """<hr/><div style="text-align:center;width:{}px;height:{}px;
                                                position:relative;background:black;margin-bottom:200px">
                                                <h1 style="color:#fff;margin-bottom:300px">{}</h1>
                                          </div>""".format(
                self.page_width, 50, 'Welcome')
            self.notification_div = {
                'top': Div(text=txt, width=self.page_width, height=20),
                'bottom': Div(text=txt, width=self.page_width, height=10),
            }
            lag_section_head_txt = 'Lag relationships between {} and...'.format(
                self.variable)
            self.section_divider = '-----------------------------------'
            self.section_headers = {
                'lag':
                self.section_header_div(text=lag_section_head_txt,
                                        width=1000,
                                        html_header='h2',
                                        margin_top=50,
                                        margin_bottom=5),
                'distribution':
                self.section_header_div(text='Pre-transform distribution',
                                        width=600,
                                        html_header='h2',
                                        margin_top=5,
                                        margin_bottom=-155),
                'matrix':
                self.section_header_div(text='Risk Matrix:{}'.format(
                    self.section_divider),
                                        width=600,
                                        html_header='h2',
                                        margin_top=5,
                                        margin_bottom=-155),
                'risk_solution':
                self.section_header_div(
                    text='Risk Matrix vs Solution :{}'.format(
                        self.section_divider),
                    width=600,
                    html_header='h2',
                    margin_top=5,
                    margin_bottom=-155),
            }

Exemple #8

0

Afficher le fichier

Fichier : accounts.py Projet : andre-aion/analytics_demo

        def __init__(self, table, cols, dedup_cols):
            """Initialise the tab via ``Mytab.__init__`` and set its defaults.

            Earlier revisions assigned ``interest_var``, ``status``,
            ``day_diff`` and ``div_style`` more than once; each is now
            assigned a single time with the value that previously won.

            Args:
                table: Forwarded to ``Mytab.__init__`` and stored.
                cols: Forwarded to ``Mytab.__init__`` and stored.
                dedup_cols: Forwarded to ``Mytab.__init__``.
            """
            Mytab.__init__(self, table, cols, dedup_cols)
            self.table = table
            self.cols = cols
            self.DATEFORMAT = "%Y-%m-%d %H:%M:%S"
            self.df = None
            self.df1 = {}  # to contain churned and retained splits
            self.df_predict = None
            self.day_diff = 1  # for normalizing for classification periods of different lengths
            self.df_grouped = ''

            self.rf = {}  # random forest
            self.cl = PythonClickhouse('aion')

            self.forecast_days = 30
            self.trigger = -1
            self.status = 'all'
            self.update_type = 'all'
            self.account_type = 'all'
            # Was first set to 'address' and immediately overwritten; only
            # the effective value is kept.
            self.interest_var = 'amount'

            self.pl = {}  # for rf pipeline
            self.header_style = """ style='color:blue;text-align:center;' """

            # list of tier specific addresses for prediction
            self.address_list = []
            self.address_select = Select(title='Filter by address',
                                         value='all',
                                         options=[])
            self.address = 'all'
            self.load_data_flag = False
            self.groupby_dict = {}
            self.addresses = []

            # This is the second of two former div_style assignments; its
            # exact text (including whitespace) is the one that took effect.
            self.div_style = """ style='width:300px; margin-left:25px;
                        border:1px solid #ddd;border-radius:3px;background:#efefef50;' 
                        """
            self.max_loaded_date = None
            self.min_loaded_date = None

            # ------- DIVS setup begin
            self.page_width = 1200
            # Banner markup reused for both notification divs.
            txt = """<hr/><div style="text-align:center;width:{}px;height:{}px;
                            position:relative;background:black;margin-bottom:200px">
                            <h1 style="color:#fff;margin-bottom:300px">{}</h1>
                    </div>""".format(self.page_width, 50, 'Welcome')
            self.notification_div = {
                'top': Div(text=txt, width=self.page_width, height=20),
                'bottom': Div(text=txt, width=self.page_width, height=10),
            }

            self.section_divider = '-----------------------------------'
            self.section_headers = {
                'forecast':
                self.section_header_div(text='Forecasts:{}'.format(
                    self.section_divider),
                                        width=600,
                                        html_header='h2',
                                        margin_top=5,
                                        margin_bottom=-155),
            }

Exemple #9

0

Afficher le fichier

Fichier : cryptocurrency.py Projet : andre-aion/analytics_demo

        def __init__(self, table, cols, dedup_cols=None):
            """Initialise the tab via ``Mytab.__init__`` and set its defaults.

            Args:
                table: Forwarded to ``Mytab.__init__`` and stored.
                cols: Forwarded to ``Mytab.__init__`` and stored.
                dedup_cols: Forwarded to ``Mytab.__init__``. ``None`` (the
                    default) is replaced by a fresh empty list.
            """
            # A literal ``[]`` default is created once and shared by every
            # call that omits the argument; build a new list per instance.
            if dedup_cols is None:
                dedup_cols = []
            Mytab.__init__(self, table, cols, dedup_cols)
            self.table = table
            self.cols = cols
            self.DATEFORMAT = "%Y-%m-%d %H:%M:%S"
            self.df = None
            self.df1 = None
            self.df_predict = None
            self.day_diff = 1  # for normalizing for classification periods of different lengths
            self.df_grouped = ''

            self.cl = PythonClickhouse('aion')
            # cryptos and groupby_dict are looked up in the enclosing scope.
            self.items = cryptos
            # add all the coins to the dict
            self.github_cols = ['watch', 'fork', 'issue', 'release', 'push']
            self.index_cols = ['close', 'high', 'low', 'market_cap', 'volume']

            self.trigger = 0

            self.groupby_dict = groupby_dict
            self.feature_list = list(self.groupby_dict.keys())
            self.variable = 'fork'
            self.crypto = 'all'
            self.lag_variable = 'push'
            self.lag_days = "1,2,3"
            self.lag = 0
            self.lag_menu = [str(x) for x in range(0, 100)]

            self.strong_thresh = .65
            self.mod_thresh = 0.4
            self.weak_thresh = 0.25
            self.corr_df = None
            self.div_style = """ 
                            style='width:350px; margin-left:-600px;
                            border:1px solid #ddd;border-radius:3px;background:#efefef50;' 
                        """

            self.header_style = """ style='color:blue;text-align:center;' """
            # track variable for AI for significant effects
            self.adoption_variables = {
                'user': [],
                'developer': ['watch', 'fork']
            }

            # Assigned before reset_adoption_dict() runs, as in the original
            # statement order.
            self.significant_effect_dict = {}
            self.reset_adoption_dict(self.variable)
            self.relationships_to_check = ['weak', 'moderate', 'strong']
            # ------- DIVS setup begin
            self.page_width = 1250
            # Banner markup reused for both notification divs.
            txt = """<hr/>
                           <div style="text-align:center;width:{}px;height:{}px;
                                  position:relative;background:black;margin-bottom:200px">
                                  <h1 style="color:#fff;margin-bottom:300px">{}</h1>
                           </div>""".format(self.page_width, 50, 'Welcome')
            self.notification_div = {
                'top': Div(text=txt, width=self.page_width, height=20),
                'bottom': Div(text=txt, width=self.page_width, height=10),
            }
            self.lag_section_head_txt = 'Lag relationships:'
            self.section_divider = '-----------------------------------'
            self.section_headers = {
                'lag':
                self.section_header_div(text=self.lag_section_head_txt,
                                        width=600,
                                        html_header='h3',
                                        margin_top=5,
                                        margin_bottom=-155),
                'distribution':
                self.section_header_div(
                    text='Pre transform distribution:{}'.format(
                        self.section_divider),
                    width=600,
                    html_header='h2',
                    margin_top=5,
                    margin_bottom=-155),
                # The text has no '{}' placeholder, so the former
                # .format(self.section_divider) call had no effect and was
                # dropped; the rendered text is unchanged.
                'relationships':
                self.section_header_div(
                    text='Relationships between variables:',
                    width=600,
                    html_header='h2',
                    margin_top=5,
                    margin_bottom=-155),
                # NOTE(review): 'correlations' shows the same text as
                # 'non_linear'; possibly a copy-paste slip, left unchanged
                # pending confirmation.
                'correlations':
                self.section_header_div(
                    text='non linear relationships between variables:',
                    width=600,
                    html_header='h3',
                    margin_top=5,
                    margin_bottom=-155),
                'non_linear':
                self.section_header_div(
                    text='non linear relationships between variables:',
                    width=600,
                    html_header='h3',
                    margin_top=5,
                    margin_bottom=-155),
            }

Exemple #10

0

Afficher le fichier

        def __init__(self, table, cols, dedup_cols):
            """Initialise the churn-prediction tab state.

            Delegates to ``Mytab.__init__`` and then sets up the model
            containers, the target definitions, the inline style strings and
            the Div widgets (notification banners and section headers).

            Args:
                table: forwarded to ``Mytab.__init__`` and stored on the instance.
                cols: forwarded to ``Mytab.__init__`` and stored on the instance.
                dedup_cols: forwarded unchanged to ``Mytab.__init__``.
            """
            Mytab.__init__(self, table, cols, dedup_cols)
            self.table = table
            self.cols = cols
            self.DATEFORMAT = "%Y-%m-%d %H:%M:%S"
            self.df = None
            self.df1 = {}  # to contain churned and retained splits
            self.df_predict = None
            self.day_diff = 1  # for normalizing for classification periods of different lengths
            self.df_grouped = ''

            self.rf = {}  # random forest
            self.cl = PythonClickhouse('aion')
            self.feature_list = hyp_variables  # defined outside this method

            # per model type: outcome name -> accepted labels and label column
            self.targets = {
                'classification': {
                    'churned': {
                        'cols': ['churned', 'active'],
                        'target_col': 'status'
                    }
                },
                'regression': {
                    'aion_fork': {
                        'cols': [1, 0],
                        'target_col': 'aion_fork'
                    }
                }
            }
            self.interest_var = 'address'
            self.trigger = -1
            self.status = 'all'

            self.clf = None
            self.pl = {}  # for rf pipeline
            # Assigned once: the earlier duplicate assignment was overwritten
            # by this literal before anything read it.
            self.div_style = """ style='width:300px; margin-left:25px;
                        border:1px solid #ddd;border-radius:3px;background:#efefef50;' 
                        """
            self.header_style = """ style='color:blue;text-align:center;' """

            # list of tier specific addresses for prediction
            self.address_list = []
            self.prediction_address_selected = ""
            self.load_data_flag = False
            # every feature is aggregated with its mean when grouping
            self.groupby_dict = {col: 'mean' for col in self.feature_list}

            self.metrics_div = Div(text='', width=400, height=300)
            self.accuracy_df = None
            self.inspected_variable = 'amount'

            # ------- DIVS setup begin
            self.page_width = page_width  # defined outside this method
            txt = """<hr/><div style="text-align:center;width:{}px;height:{}px;
                                                                       position:relative;background:black;margin-bottom:200px">
                                                                       <h1 style="color:#fff;margin-bottom:300px">{}</h1>
                                                                 </div>""".format(
                self.page_width, 50, 'Welcome')
            self.notification_div = {
                'top': Div(text=txt, width=self.page_width, height=20),
                'bottom': Div(text=txt, width=self.page_width, height=10),
            }

            self.section_divider = '-----------------------------------'
            self.section_headers = {
                'churn':
                self.section_header_div(
                    text=
                    'Churned accounts: prediction model accuracy, variable ranking:{}'
                    .format('----'),
                    width=int(self.page_width * .5),
                    html_header='h2',
                    margin_top=5,
                    margin_bottom=-155),
                'variable behavior':
                self.section_header_div(text='Variable behavior:{}'.format(
                    self.section_divider),
                                        width=600,
                                        html_header='h2',
                                        margin_top=5,
                                        margin_bottom=-155),
                'predictions':
                self.section_header_div(
                    text='Select date range to make predictions:{}'.format(
                        self.section_divider),
                    width=int(self.page_width * .5),
                    html_header='h2',
                    margin_top=5,
                    margin_bottom=-155),
            }

Exemple #11

0

Afficher le fichier

    class Thistab(Mytab):
        def __init__(self, table, cols, dedup_cols):
            """Initialise the churn-prediction tab state.

            Delegates to ``Mytab.__init__`` and then sets up the model
            containers, the target definitions, the inline style strings and
            the Div widgets (notification banners and section headers).

            Args:
                table: forwarded to ``Mytab.__init__`` and stored on the instance.
                cols: forwarded to ``Mytab.__init__`` and stored on the instance.
                dedup_cols: forwarded unchanged to ``Mytab.__init__``.
            """
            Mytab.__init__(self, table, cols, dedup_cols)
            self.table = table
            self.cols = cols
            self.DATEFORMAT = "%Y-%m-%d %H:%M:%S"
            self.df = None
            self.df1 = {}  # to contain churned and retained splits
            self.df_predict = None
            self.day_diff = 1  # for normalizing for classification periods of different lengths
            self.df_grouped = ''

            self.rf = {}  # random forest
            self.cl = PythonClickhouse('aion')
            self.feature_list = hyp_variables  # defined outside this method

            # per model type: outcome name -> accepted labels and label column
            self.targets = {
                'classification': {
                    'churned': {
                        'cols': ['churned', 'active'],
                        'target_col': 'status'
                    }
                },
                'regression': {
                    'aion_fork': {
                        'cols': [1, 0],
                        'target_col': 'aion_fork'
                    }
                }
            }
            self.interest_var = 'address'
            self.trigger = -1
            self.status = 'all'

            self.clf = None
            self.pl = {}  # for rf pipeline
            # Assigned once: the earlier duplicate assignment was overwritten
            # by this literal before anything read it.
            self.div_style = """ style='width:300px; margin-left:25px;
                        border:1px solid #ddd;border-radius:3px;background:#efefef50;' 
                        """
            self.header_style = """ style='color:blue;text-align:center;' """

            # list of tier specific addresses for prediction
            self.address_list = []
            self.prediction_address_selected = ""
            self.load_data_flag = False
            # every feature is aggregated with its mean when grouping
            self.groupby_dict = {col: 'mean' for col in self.feature_list}

            self.metrics_div = Div(text='', width=400, height=300)
            self.accuracy_df = None
            self.inspected_variable = 'amount'

            # ------- DIVS setup begin
            self.page_width = page_width  # defined outside this method
            txt = """<hr/><div style="text-align:center;width:{}px;height:{}px;
                                                                       position:relative;background:black;margin-bottom:200px">
                                                                       <h1 style="color:#fff;margin-bottom:300px">{}</h1>
                                                                 </div>""".format(
                self.page_width, 50, 'Welcome')
            self.notification_div = {
                'top': Div(text=txt, width=self.page_width, height=20),
                'bottom': Div(text=txt, width=self.page_width, height=10),
            }

            self.section_divider = '-----------------------------------'
            self.section_headers = {
                'churn':
                self.section_header_div(
                    text=
                    'Churned accounts: prediction model accuracy, variable ranking:{}'
                    .format('----'),
                    width=int(self.page_width * .5),
                    html_header='h2',
                    margin_top=5,
                    margin_bottom=-155),
                'variable behavior':
                self.section_header_div(text='Variable behavior:{}'.format(
                    self.section_divider),
                                        width=600,
                                        html_header='h2',
                                        margin_top=5,
                                        margin_bottom=-155),
                'predictions':
                self.section_header_div(
                    text='Select date range to make predictions:{}'.format(
                        self.section_divider),
                    width=int(self.page_width * .5),
                    html_header='h2',
                    margin_top=5,
                    margin_bottom=-155),
            }

            # ----------------------  DIVS ----------------------------

        def section_header_div(self,
                               text,
                               html_header='h2',
                               width=600,
                               margin_top=150,
                               margin_bottom=-150):
            """Build a 15px-high Div holding ``text`` inside a coloured header tag.

            Args:
                text: header content inserted into the markup.
                html_header: tag name used for the header element.
                width: width passed to the Div.
                margin_top: value placed in the ``margin-top`` declaration.
                margin_bottom: value placed after the literal ``-`` in the
                    ``margin-bottom`` declaration.

            Returns:
                The Div wrapping the generated markup.
            """
            # NOTE(review): the template already contains a '-', so a negative
            # margin_bottom (the default, and what the visible callers pass)
            # renders as "--150px" - confirm whether that is intended.
            markup = (
                '<div style="margin-top:{}px;margin-bottom:-{}px;">'
                '<{} style="color:#4221cc;">{}</{}></div>'
            ).format(margin_top, margin_bottom, html_header, text, html_header)
            return Div(text=markup, width=width, height=15)

            # ####################################################
            #              UTILITY DIVS

        def results_div(self, text, width=600, height=300):
            """Return a Div showing ``text`` with the given width and height."""
            return Div(text=text, width=width, height=height)

        def title_div(self, text, width=700):
            """Return a 15px-high Div with ``text`` wrapped in a coloured ``<h2>``."""
            heading = '<h2 style="color:#4221cc;">{}</h2>'.format(text)
            return Div(text=heading, width=width, height=15)

        def reset_checkboxes(self):
            """Clear the stored address selection and set the select widget to "all".

            Any exception is logged with its traceback and not re-raised.
            """
            try:
                self.prediction_address_selected = ""
                select_widget = self.prediction_address_select
                select_widget.value = "all"
            except Exception:
                logger.error('reset checkboxes', exc_info=True)

        ###################################################
        #               I/O
        def load_df(self,
                    start_date="2018-04-25 00:00:00",
                    end_date="2018-12-10 00:00:00"):
            """Load data for a date range via ``self.df_load`` and zero-fill gaps.

            Args:
                start_date: range start; a string is parsed with
                    ``self.DATEFORMAT``, any other value is passed through.
                end_date: range end; handled like ``start_date``.

            Any exception is logged with its traceback and not re-raised.
            """
            try:
                # parse string bounds, leave already-parsed values untouched
                bounds = [
                    datetime.strptime(bound, self.DATEFORMAT)
                    if isinstance(bound, str) else bound
                    for bound in (start_date, end_date)
                ]
                self.df_load(*bounds)
                self.df = self.df.fillna(0)
            except Exception:
                logger.error('load_df', exc_info=True)

        ###################################################
        #               MUNGE DATA
        def make_delta(self):
            """Add a ``<name>_diff`` percentage-change column per key of ``self.targets``.

            ``self.df`` is materialised with ``compute()``, the new columns
            are added (missing changes become 0), and the result is stored
            back on ``self.df`` split into 15 partitions. Nothing happens
            when ``self.df`` is ``None`` or empty. Any exception is logged
            with its traceback and not re-raised.
            """
            try:
                if self.df is None or len(self.df) == 0:
                    return
                df = self.df.compute()
                # NOTE(review): iterating self.targets yields its top-level
                # keys; confirm these are really column names of the frame.
                for col in self.targets:
                    col_new = col + '_diff'
                    df[col_new] = df[col].pct_change().fillna(0)
                    logger.warning('diff col added : %s', col_new)
                # The former mean-fill of self.df was dropped: its result was
                # overwritten by this assignment before it could be used.
                self.df = dd.dataframe.from_pandas(df, npartitions=15)
            except Exception:
                logger.error('make delta', exc_info=True)

        def split_df(self, df, target):
            """Split ``df`` into one sub-frame per label of a classification target.

            The labels and the label column are read from
            ``self.targets['classification'][target]`` (keys ``'cols'`` and
            ``'target_col'``). Each sub-frame is stored in ``self.df1`` under
            its label.

            Args:
                df: frame containing the target's label column.
                target: key into ``self.targets['classification']``.

            Raises:
                KeyError: if ``target`` is not a configured classification target.
            """
            # Fixes: the attribute is ``targets`` (not ``target``), the spec
            # is a dict with separate 'cols' / 'target_col' entries, and the
            # rows are selected rather than storing only the boolean mask.
            spec = self.targets['classification'][target]
            target_col = spec['target_col']
            for val in spec['cols']:
                self.df1[val] = df[df[target_col] == val]
            logger.warning(
                "Finished split into churned and retained dataframes")

        ##################################################
        #               EXPLICATORY GRAPHS
        # PLOTS
        def box_plot(self, variable):
            """Return an hvplot box plot of ``variable`` grouped by ``status``.

            The y-axis runs from 0.9x the minimum to 1.1x the maximum of the
            column; both bounds stay 0 when ``self.df`` is empty or missing.
            When ``self.df`` is ``None`` a frame obtained from ``SD`` is
            plotted instead. Any exception is logged and ``None`` is returned.
            """
            try:
                frame = self.df
                low, high = 0, 0
                if frame is None:
                    # nothing loaded yet: fall back to the frame SD provides
                    frame = SD('filter', [variable, 'status'], []).get_df()
                elif len(frame) > 0:
                    low, high = dd.compute(frame[variable].min(),
                                           frame[variable].max())

                y_range = (.9 * low, 1.1 * high)
                return frame.hvplot.box(variable, by='status', ylim=y_range)
            except Exception:
                logger.error("box plot:", exc_info=True)

        ###################################################
        #               MODELS
        def rf_clf(self):
            """Fit one random-forest pipeline per classification target.

            For every key of ``self.targets['classification']`` the
            normalised ``self.df`` is grouped by address and status, split
            70/30 into train and test sets, and used to fit an
            imputer + random-forest pipeline stored in ``self.pl[target]``.
            The test accuracy (in percent) of each target is collected into
            ``self.accuracy_df``. A confusion matrix and a classification
            report are printed at the end. Any exception is logged with its
            traceback and not re-raised.
            """
            try:
                logger.warning("RANDOM FOREST LAUNCHED")

                # despite its name this list collects accuracy percentages
                error_lst = []
                df_temp = self.df
                df_temp = self.normalize(df_temp,
                                         timestamp_col='block_timestamp')
                # if all addresses used filter for only positive transactions

                for target in self.targets['classification']:
                    # filter out joined
                    df = df_temp.copy()
                    if target == 'churned':
                        df = df[df['status'] != 'joined']

                    #logger.warning("line 205: df columns in %s:",df.columns.tolist())
                    # one row per (address, status), aggregated per groupby_dict
                    df = df.groupby(['address',
                                     'status']).agg(self.groupby_dict)
                    df = df.reset_index()
                    #logger.warning("line 222: df columns in %s:",df.tail(10))

                    # materialise the lazy frame before handing it to sklearn
                    df = df.compute()
                    '''
                    # only retain wanted values
                    col_values = list(self.df[self.targets['classification'][target]['target_col']].unique())
                    for val in col_values:
                        if val in self.targets['classification'][target]['cols']:
                            pass
                        else:
                            df[self.targets['classification'][target]['target_col']] = \
                            df[df[self.targets['classification'][target]['cols']] != val]
                    '''
                    X = df[self.feature_list]
                    y = df[self.targets['classification'][target]
                           ['target_col']]
                    #logger.warning('y=:%s',y.head(100))

                    # 30% of the rows are held out for the accuracy estimate
                    X_train, X_test, y_train, y_test = train_test_split(
                        X, y, test_size=0.3)
                    self.feature_list = X_train.columns.tolist()

                    # zeros are treated as missing and replaced by the median
                    self.pl[target] = Pipeline([
                        ('imp',
                         SimpleImputer(missing_values=0, strategy='median')),
                        ('rf',
                         RandomForestClassifier(n_estimators=100,
                                                random_state=42,
                                                max_depth=4,
                                                class_weight='balanced'))
                    ])
                    self.pl[target].fit(X_train, y_train)

                    y_pred = self.pl[target].predict(X_test)
                    error_lst.append(
                        round(100 * metrics.accuracy_score(y_test, y_pred), 2))

                self.accuracy_df = pd.DataFrame({
                    'Outcome':
                    list(self.targets['classification'].keys()),
                    'Accuracy':
                    error_lst,
                })
                #logger.warning('accuracy_df:%s',self.accuracy_df.head())
                #self.make_tree(target=target)

                # NOTE(review): y_test / y_pred are whatever the last loop
                # iteration left behind, so only the final target is reported.
                print('confusion matrix:\n')
                print(confusion_matrix(y_test, y_pred))
                print('classification report:\n')
                print(classification_report(y_test, y_pred))
                #logger.warning("clf model built:%s",self.pl)

            except Exception:
                logger.error("RF:", exc_info=True)

        def accuracy_table(self):
            """Return an hvplot table of ``self.accuracy_df`` (Outcome / Accuracy).

            Any exception (for example when ``self.accuracy_df`` is still
            ``None``) is logged with its traceback and ``None`` is returned.
            """
            try:
                return self.accuracy_df.hvplot.table(
                    columns=['Outcome', 'Accuracy'],
                    width=250,
                    title='Prediction accuracy')

            except Exception:
                # label corrected: it previously reused the "RF:" tag of rf_clf
                logger.error("accuracy table:", exc_info=True)

        def prediction_information_div(self, width=350, height=450):
            """Return a Div with static help text for the predictions table.

            The markup is styled with ``self.div_style`` and
            ``self.header_style``.

            Args:
                width: width passed to the Div.
                height: height passed to the Div.
            """
            # The third list item previously opened with an empty "<>" tag;
            # it is now a proper "<li>" matching its closing "</li>".
            txt = """
            <div {}>
            <h4 {}>Info </h4>
            <ul style='margin-top:-10px;'>
            <li>
            The table shows the predicted change.</br>
            </li>
            <li>
            For desirable outcomes:
            </br> ... a positive number is good!
            </br> ... the bigger the number the better.
            </br> ... a negative number is bad!
            </br> ... the bigger the negative number the worse it is.
            </li>
            <li>
            For non-desirable outcomes:
            </br>... the inverse is true
            </li>
            <li>
            Use the datepicker(s) to select dates for the period desired
            </li>
            </ul>
            </div>

            """.format(self.div_style, self.header_style)
            return Div(text=txt, width=width, height=height)

        def metrics_div_update(self, data):
            """Write the "likely to churn" summary into ``self.metrics_div``.

            Args:
                data: value shown in front of "% likely to churn".
            """
            box_style = """ 
                   style='width:350px;margin-right:-600px;
                   border:1px solid #ddd;border-radius:3px;background:#efefef50;' 
               """
            template = """<div {}>
            <h4 {}>Prediction Info </h4>
            <ul style='margin-top:-10px;'>
            <li>
            {}% likely to churn
            </li>
            </ul>
            </div>"""
            self.metrics_div.text = template.format(box_style,
                                                    self.header_style, data)

        def stats_information_div(self, width=400, height=300):
            """Return a Div with static help text for the two result tables.

            Args:
                width: width passed to the Div.
                height: height passed to the Div.
            """
            box_style = """ 
                           style='width:350px;margin-left:-600px;
                           border:1px solid #ddd;border-radius:3px;background:#efefef50;' 
                       """
            template = """
            <div {}>
                   <h4 {}>Metadata Info </h4>
                   <ul>
                   <li >
                   <h4 style='margin-bottom:-2px;'>Table left:</h4>
                   - shows the outcome,</br>
                     and the accuracy in %</br>
                     <strong><i>100% is perfection!</i></strong>
                   </li>
                   <li>
                   <h4 style='margin-bottom:-2px;'>Table right:</h4>
                     - shows the desired outcome, the variables(things Aion controls)
                   </br> and their importance to the particular outcome
                   </br> ...which variable(s) have a greater impact on an outcome.
                   </br>- lower = better
                   </br>- generally only the best ranked 3 matter
                   </br>- business advice: manipulate the top ranked variables to attain desirable outcomes
                   </li>
                   </ul>
            </div>"""
            markup = template.format(box_style, self.header_style)
            return Div(text=markup, width=width, height=height)

        def load_prediction_df(self, start_date, end_date):
            """Load the feature columns for a date range into ``self.df_predict``.

            ``date`` instances (which includes ``datetime``) are combined with
            midnight before the query. The data is fetched with
            ``self.cl.load_data`` for ``self.feature_list`` plus ``address``
            and ``block_timestamp``.

            Args:
                start_date: start of the range.
                end_date: end of the range.
            """
            midnight = datetime.min.time()
            if isinstance(start_date, date):
                start_date = datetime.combine(start_date, midnight)
            if isinstance(end_date, date):
                end_date = datetime.combine(end_date, midnight)

            wanted_cols = self.feature_list + ['address', 'block_timestamp']
            loaded = self.cl.load_data(table=self.table,
                                       cols=wanted_cols,
                                       start_date=start_date,
                                       end_date=end_date)
            self.df_predict = loaded
            logger.warning('319:in load prediction: %s',
                           self.df_predict.head(5))

        def update_prediction_addresses_select(self):
            """Refresh the address select options from ``self.df_predict``.

            The options are first reset to ``['all']``; when the prediction
            frame has rows, its unique addresses are appended.
            """
            select = self.prediction_address_select
            select.options = ['all']
            if len(self.df_predict) > 0:
                addresses = self.df_predict['address'].unique().compute()
                select.options = ['all'] + list(addresses)

        # the period for which the user wants a prediction
        # the period for which the user wants a prediction
        def make_account_predictions(self, launch=-1):
            """Predict the likely action per address and return it as a table.

            ``self.df_predict`` is optionally narrowed to the address chosen
            in ``self.prediction_address_select``. For each classification
            target the frame is normalised, grouped and passed to the fitted
            pipeline in ``self.pl``. The predictions are stored in
            ``self.predict_df`` and the churn percentage is pushed to the
            metrics Div. When no rows match, a "not found" message is shown
            instead.

            Args:
                launch: not used inside this method.

            Returns:
                An hvplot table of ``self.predict_df``, or ``None`` when an
                exception was raised (it is logged with its traceback).
            """
            try:
                logger.warning("MAKE PREDICTIONS LAUNCHED")
                df = self.df_predict

                # filter if prediction for certain addresses
                selected = self.prediction_address_select.value
                if selected is not None and len(selected) > 0 \
                        and selected not in ['all', '']:
                    df = df[df.address == selected]

                # empty table, displayed when nothing could be predicted
                self.predict_df = pd.DataFrame({
                    'address': [],
                    'likely action': []
                })
                for target in self.targets['classification']:
                    if len(df) > 0:
                        # Work on a per-target frame so the filtered input is
                        # not normalised/grouped again for the next target.
                        grouped = self.normalize(
                            df, timestamp_col='block_timestamp')
                        grouped = self.group_data(
                            grouped,
                            self.groupby_dict,
                            timestamp_col='block_timestamp')
                        interest_labels = list(grouped['address'].unique())

                        # run model
                        grouped = grouped.fillna(0)
                        X = grouped[self.feature_list]
                        y_pred = self.pl[target].predict(X)
                        logger.warning('y_pred:%s', y_pred)
                        if target == 'churned':
                            y_pred_verbose = [
                                'remain' if x in ["active", 1] else "churn"
                                for x in y_pred
                            ]
                        else:
                            # no wording defined for other targets: show the
                            # raw predicted labels instead of failing
                            y_pred_verbose = [str(x) for x in y_pred]

                        #---- make table for display
                        self.predict_df = pd.DataFrame({
                            'address':
                            interest_labels,
                            'likely action':
                            y_pred_verbose
                        })

                        #------ label pools
                        self.predict_df['address'] = self.predict_df[
                            'address'].map(self.poolname_verbose_trun)

                        churn_df = self.predict_df[
                            self.predict_df['likely action'] == 'churn']
                        perc_to_churn = round(
                            100 * len(churn_df) / len(self.predict_df), 1)
                        self.metrics_div_update(data=perc_to_churn)
                    else:

                        text = """<div {}>
                            <br/> <h3>Sorry, address not found</h3>
                            </div>""".format(self.header_style)
                        self.metrics_div.text = text
                    logger.warning("end of %s predictions", target)
                return self.predict_df.hvplot.table(
                    columns=['address', 'likely action'],
                    width=500,
                    title='Account predictions')
            except Exception:
                logger.error("prediction:", exc_info=True)

        def make_tree(self, target='churned'):
            """Export one tree of the fitted forest for ``target`` as a PNG.

            The models are fitted first via ``rf_clf`` when ``self.pl`` is
            empty. The estimator at index 5 of the ``'rf'`` pipeline step is
            written to ``small_tree.dot`` and then rendered to a PNG whose
            path comes from ``self.make_filepath``. Any exception is logged
            with its traceback and not re-raised.

            Args:
                target: key of the pipeline in ``self.pl`` to export.
            """
            try:
                if not self.pl:
                    self.rf_clf()

                forest = self.pl[target].named_steps['rf']
                single_tree = forest.estimators_[5]

                # write the tree as graphviz source first
                dot_file = 'small_tree.dot'
                export_graphviz(single_tree,
                                out_file=dot_file,
                                feature_names=self.feature_list,
                                rounded=True,
                                precision=1)

                # exactly one graph is expected in the dot file
                (graph, ) = pydot.graph_from_dot_file(dot_file)
                png_path = self.make_filepath(
                    '/home/andre/Downloads/small_tree.png')
                graph.write_png(png_path)
                logger.warning("TREE SAVED")
            except Exception:
                logger.error("make tree:", exc_info=True)

        def make_feature_importances(self):
            """Return an hvplot table ranking features by importance per outcome.

            The models are fitted first via ``rf_clf`` when ``self.pl`` is
            empty. For each classification target the importances of the
            ``'rf'`` pipeline step are paired with ``self.feature_list``,
            rounded to 4 decimals and sorted ascending. Ranks count down, so
            the last (largest) importance receives rank 1. Any exception is
            logged with its traceback and ``None`` is returned.
            """
            try:
                if not self.pl:
                    self.rf_clf()

                outcomes, features, scores, ranks = [], [], [], []
                for target in self.targets['classification']:
                    logger.warning('make feature importances for :%s', target)
                    forest = self.pl[target].named_steps['rf']
                    importances = list(forest.feature_importances_)

                    # (feature, rounded importance) pairs, least important first
                    pairs = sorted(
                        ((feature, round(importance, 4))
                         for feature, importance in zip(
                             self.feature_list, importances)),
                        key=itemgetter(1))

                    n_importances = len(importances)
                    outcomes += [target] * n_importances
                    features += [name for name, _ in pairs]
                    scores += [score for _, score in pairs]
                    # ranks run n_importances..1 alongside the ascending order
                    ranks += list(range(n_importances, 0, -1))

                df = pd.DataFrame.from_dict({
                    'outcome': outcomes,
                    'feature': features,
                    'importance': scores,
                    'rank_within_outcome': ranks
                })
                logger.warning('MAKE FEATURE IMPORTANCES FINISHED')
                return df.hvplot.table(
                    columns=[
                        'outcome', 'feature', 'importance',
                        'rank_within_outcome'
                    ],
                    width=600,
                    title="Variables ranked by importance (for each output)")

            except Exception:
                logger.error("Feature importances:", exc_info=True)

Exemple #12

0

Afficher le fichier

Fichier : projects.py Projet : andre-aion/analytics_demo

        def __init__(self, table, cols, dedup_cols=None):
            """Initialise the project-analysis tab state.

            Delegates to ``Mytab.__init__`` and then sets up the aggregation
            rules, lag and correlation settings, filter menus, style strings
            and the Div widgets (notification banners and section headers).

            Args:
                table: forwarded to ``Mytab.__init__`` and stored on the instance.
                cols: forwarded to ``Mytab.__init__`` and stored on the instance.
                dedup_cols: forwarded to ``Mytab.__init__``; a new empty list
                    is used when omitted.
            """
            # A literal ``[]`` default would be a single list object shared
            # by every call that omits the argument.
            if dedup_cols is None:
                dedup_cols = []
            Mytab.__init__(self, table, cols, dedup_cols)
            self.table = table
            self.cols = cols
            self.DATEFORMAT = "%Y-%m-%d %H:%M:%S"
            self.df = None
            self.df1 = None
            self.df_predict = None
            self.day_diff = 1  # for normalizing for classification periods of different lengths
            self.df_grouped = ''

            self.cl = PythonClickhouse('aion')

            self.trigger = 0
            # column -> aggregation applied when grouping
            self.groupby_dict = {
                'project_duration': 'sum',
                'project_start_delay': 'mean',
                'project_end_delay': 'mean',
                'project_owner_age': 'mean',
                'project_owner_gender': 'mean',
                'milestone_duration': 'sum',
                'milestone_start_delay': 'mean',
                'milestone_end_delay': 'mean',
                'milestone_owner_age': 'mean',
                'milestone_owner_gender': 'mean',
                'task_duration': 'sum',
                'task_start_delay': 'sum',
                'task_end_delay': 'mean',
                'task_owner_age': 'mean',
                'task_owner_gender': 'mean'
            }
            self.feature_list = list(self.groupby_dict)
            self.lag_variable = 'task_duration'
            self.lag_days = "1,2,3"
            self.lag = 0
            self.lag_menu = [str(x) for x in range(0, 100)]

            self.strong_thresh = .65
            self.mod_thresh = 0.4
            self.weak_thresh = 0.25
            self.corr_df = None
            self.div_style = """ 
                style='width:350px; margin-left:25px;
                border:1px solid #ddd;border-radius:3px;background:#efefef50;' 
            """

            self.header_style = """ style='color:blue;text-align:center;' """

            self.variables = sorted(self.groupby_dict)
            self.variable = self.variables[0]

            self.relationships_to_check = ['weak', 'moderate', 'strong']

            self.status = 'all'
            self.pm_gender = 'all'
            self.m_gender = 'all'
            self.t_gender = 'all'
            self.type = 'all'

            self.pym = PythonMongo('aion')
            self.menus = {
                'status': ['all', 'open', 'closed'],
                'type': [
                    'all', 'research', 'reconciliation', 'audit', 'innovation',
                    'construction', 'manufacturing', 'conference'
                ],
                'gender': ['all', 'male', 'female'],
                'variables':
                list(self.groupby_dict),
                'history_periods':
                ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10'],
            }
            self.multiline_vars = {'x': 'manager_gender', 'y': 'remuneration'}
            self.timestamp_col = 'project_startdate_actual'
            # ------- DIVS setup begin
            self.page_width = 1250
            txt = """<hr/>
                    <div style="text-align:center;width:{}px;height:{}px;
                           position:relative;background:black;margin-bottom:200px">
                           <h1 style="color:#fff;margin-bottom:300px">{}</h1>
                    </div>""".format(self.page_width, 50, 'Welcome')
            self.notification_div = {
                'top': Div(text=txt, width=self.page_width, height=20),
                'bottom': Div(text=txt, width=self.page_width, height=10),
            }
            lag_section_head_txt = 'Lag relationships between {} and...'.format(
                self.variable)

            self.section_divider = '-----------------------------------'
            self.section_headers = {
                'lag':
                self.section_header_div(text=lag_section_head_txt,
                                        width=600,
                                        html_header='h2',
                                        margin_top=5,
                                        margin_bottom=-155),
                'distribution':
                self.section_header_div(text='Pre-transform distribution:',
                                        width=600,
                                        html_header='h2',
                                        margin_top=5,
                                        margin_bottom=-155),
                'relationships':
                self.section_header_div(
                    text='Relationships between variables:{}'.format(
                        self.section_divider),
                    width=600,
                    html_header='h2',
                    margin_top=5,
                    margin_bottom=-155),
                'correlations':
                self.section_header_div(text='Correlations:',
                                        width=600,
                                        html_header='h3',
                                        margin_top=5,
                                        margin_bottom=-155),
            }

Exemple #13

0

Afficher le fichier

Fichier : KPI_interface.py Projet : andre-aion/analytics_demo

    def __init__(self, table, name, cols):
        """Initialise the KPI state: data handles, styles, periods and payroll.

        Args:
            table: stored on the instance as ``self.table``.
            name: stored as ``self.name`` and appended to the redis key
                prefix ``'adoption_features:'``.
            cols: stored on the instance as ``self.cols``.
        """
        self.df = None
        self.ch = PythonClickhouse('aion')
        self.redis = PythonRedis()
        self.table = table
        self.cols = cols
        self.div_style = """ style='width:350px; margin-left:25px;
                                border:1px solid #ddd;border-radius:3px;background:#efefef50;' 
                                """

        self.header_style = """ style='color:blue;text-align:center;' """
        self.welcome_txt = """<div style="text-align:center;background:black;width:100%;">
                                         <h1 style="color:#fff;">
                                         {}</h1></div>""".format('Welcome')
        # The CSS comes from the KPI_card_css name defined outside this
        # method; the unused local file path that was built here was removed.
        self.KPI_card_css = KPI_card_css
        self.DATEFORMAT = '%Y-%m-%d %H:%M:%S'
        self.DATEFORMAT_PTD = '%Y-%m-%d'

        self.initial_date = datetime.strptime("2018-04-25 00:00:00",
                                              self.DATEFORMAT)
        self.account_type = 'all'
        self.trigger = -1
        self.periods_to_plot = {1: ['week', 'month'], 2: ['quarter']}
        self.pop_history_periods = 3  # number of periods for period over period
        self.pop_start_date = None
        self.pop_end_date = None
        self.timestamp_col = ''
        self.checkboxgroup = {}
        self.sig_effect_dict = {}
        self.name = name
        self.redis_stat_sig_key = 'adoption_features:' + self.name
        self.card_grid_row = {'year': 0, 'quarter': 1, 'month': 2, 'week': 3}
        # payroll per period, built from a 4-week month and a 3-month quarter
        weekly_pay = 1200
        num_engineers = 40
        self.payroll = {
            'week': weekly_pay * num_engineers,
            'month': weekly_pay * num_engineers * 4,
            'quarter': weekly_pay * num_engineers * 4 * 3,
            'year': weekly_pay * num_engineers * 4 * 3 * 4
        }
        # self.menus is not set in this method; it must already be available
        # on the instance or class.
        self.resample_period = self.menus['resample_period'][0]

        self.groupby_dict = {
            'tw_mentions': 'sum',
            'tw_positive': 'mean',
            'tw_compound': 'mean',
            'tw_neutral': 'mean',
            'tw_negative': 'mean',
            'tw_emojis_positive': 'mean',
            'tw_emojis_compound': 'mean',
            'tw_emojis_negative': 'mean',
            'tw_emojis_count': 'sum',
            'tw_replies_from_followers': 'sum',
            'tw_replies_from_following': 'sum',
            'tw_reply_hashtags': 'sum'
        }

        self.variable = 'item'
        self.grouby_var = ''  # attribute name kept as-is for existing users
        self.page_width = 1200

Exemple #14

0

Afficher le fichier

Fichier : KPI_interface.py Projet : andre-aion/analytics_demo

class KPI:
    """Shared helpers for KPI dashboards.

    The methods of this class load data through the ClickHouse handle,
    compute period-to-date and period-over-period frames, and build the
    HTML snippets (cards, section headers, notifications).
    """

    # Option lists shared by every instance (class attribute).
    # ``__init__`` reads the first 'resample_period' entry as its default.
    # NOTE: ``load_cryptos()`` is evaluated once, when this class body runs.
    menus = {
        'account_type':
        ['all', 'contract', 'miner', 'native_user', 'token_user'],
        'update_type': [
            'all', 'contract_deployment', 'internal_transfer', 'mined_block',
            'token_transfer', 'transaction'
        ],
        'history_periods': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10'],
        'developer_adoption_DVs': ['aion_fork', 'aion_watch'],
        'resample_period': ['W', 'M', 'Q'],
        'social_media': ['twitter', 'facebook'],
        # Same names as the keys of ``groupby_dict`` set in ``__init__``.
        'social_media_variables': [
            'tw_mentions', 'tw_positive', 'tw_compound', 'tw_neutral',
            'tw_negative', 'tw_emojis_positive', 'tw_emojis_compound',
            'tw_emojis_negative', 'tw_emojis_count',
            'tw_replies_from_followers', 'tw_replies_from_following',
            'tw_reply_hashtags'
        ],
        'cryptos': ['all'] + load_cryptos(),
        'bcc': {
            'rental': ['area', 'category', 'item', 'status', 'gender']
        }
    }

    def __init__(self, table, name, cols):
        """Create the data-source handles and set the default KPI settings.

        Args:
            table: table name handed to the ClickHouse loader by ``load_df``.
            name: suffix of the redis key holding the statistically
                significant features (``'adoption_features:' + name``).
            cols: column names, stored as ``self.cols``.
        """
        self.df = None
        self.ch = PythonClickhouse('aion')
        self.redis = PythonRedis()
        self.table = table
        self.cols = cols

        # --- HTML fragments -------------------------------------------------
        self.div_style = """ style='width:350px; margin-left:25px;
                                border:1px solid #ddd;border-radius:3px;background:#efefef50;' 
                                """

        self.header_style = """ style='color:blue;text-align:center;' """
        self.welcome_txt = """<div style="text-align:center;background:black;width:100%;">
                                         <h1 style="color:#fff;">
                                         {}</h1></div>""".format('Welcome')
        # Card designs come from the module-level ``KPI_card_css`` mapping.
        self.KPI_card_css = KPI_card_css

        # --- date handling --------------------------------------------------
        self.DATEFORMAT = '%Y-%m-%d %H:%M:%S'
        self.DATEFORMAT_PTD = '%Y-%m-%d'
        # ``period_over_period`` never loads history from before this date.
        self.initial_date = datetime.strptime("2018-04-25 00:00:00",
                                              self.DATEFORMAT)

        # --- widget / plotting state ----------------------------------------
        self.account_type = 'all'
        self.trigger = -1
        self.periods_to_plot = {1: ['week', 'month'], 2: ['quarter']}
        self.pop_history_periods = 3  # number of periods for period over period
        self.pop_start_date = None
        self.pop_end_date = None
        self.timestamp_col = ''
        self.checkboxgroup = {}
        self.sig_effect_dict = {}
        self.name = name
        self.redis_stat_sig_key = 'adoption_features:' + self.name
        self.card_grid_row = {'year': 0, 'quarter': 1, 'month': 2, 'week': 3}

        # --- payroll estimate -----------------------------------------------
        # A month is counted as 4 weeks and a quarter as 3 such months,
        # so the 'year' figure covers 48 weeks.
        weekly_pay = 1200
        num_engineers = 40
        weekly_payroll = weekly_pay * num_engineers
        self.payroll = {
            'week': weekly_payroll,
            'month': weekly_payroll * 4,
            'quarter': weekly_payroll * 4 * 3,
            'year': weekly_payroll * 4 * 3 * 4
        }
        self.resample_period = self.menus['resample_period'][0]

        # Aggregation applied to each social-media column.
        self.groupby_dict = {
            'tw_mentions': 'sum',
            'tw_positive': 'mean',
            'tw_compound': 'mean',
            'tw_neutral': 'mean',
            'tw_negative': 'mean',
            'tw_emojis_positive': 'mean',
            'tw_emojis_compound': 'mean',
            'tw_emojis_negative': 'mean',
            'tw_emojis_count': 'sum',
            'tw_replies_from_followers': 'sum',
            'tw_replies_from_following': 'sum',
            'tw_reply_hashtags': 'sum'
        }

        self.variable = 'item'
        # NOTE(review): attribute name is spelled 'grouby_var'; kept as-is
        # because code outside this block may read it under that name.
        self.grouby_var = ''
        self.page_width = 1200

        # make block timestamp the index
    def load_df(self,
                start_date,
                end_date,
                cols,
                timestamp_col='timestamp_of_first_event',
                supplemental_where=None):
        """Load ``self.table`` from ClickHouse for a date window.

        Both bounds are reduced to midnight and the end bound is then pushed
        forward by one day before the query is issued. Unless the table is
        ``'external_daily'`` an ``'amount'`` column is added to the query.

        Args:
            start_date: ``date``/``datetime`` lower bound.
            end_date: ``date``/``datetime`` upper bound.
            cols: list of columns to return; an empty list returns every
                column that was loaded.
            timestamp_col: column the loader filters on.
            supplemental_where: passed through to the loader unchanged.

        Returns:
            The loaded frame restricted to ``cols``, or ``None`` when an
            error was logged.
        """
        try:
            midnight = datetime.min.time()
            # datetime is a subclass of date, so datetimes are truncated too.
            if isinstance(end_date, date):
                end_date = datetime.combine(end_date, midnight)
            if isinstance(start_date, date):
                start_date = datetime.combine(start_date, midnight)
            end_date = end_date + timedelta(days=1)

            # Query on a copy so the caller's list is never modified.
            query_cols = cols.copy()
            needs_amount = (self.table != 'external_daily'
                            and 'amount' not in query_cols)
            if needs_amount:
                query_cols.append('amount')

            loaded = self.ch.load_data(self.table, query_cols, start_date,
                                       end_date, timestamp_col,
                                       supplemental_where)
            return loaded[cols] if len(cols) > 0 else loaded
        except Exception:
            logger.error('load df', exc_info=True)

    def load_df_pym(self, req_startdate, req_enddate, table, cols,
                    timestamp_col):
        """Return rows for a date window, from ``self.df`` when it covers it.

        If the frame held in ``self.df`` spans the whole requested window it
        is filtered (both bounds inclusive) and returned; otherwise the
        request is forwarded to ``self.pym.load_df``.

        Returns:
            The filtered or freshly loaded frame, or ``None`` when an error
            was logged.
        """
        # NOTE(review): ``self.pym`` is not assigned in ``__init__`` here;
        # presumably a subclass sets it - confirm.
        try:
            cached = self.df
            if cached is not None:
                earliest = cached[timestamp_col].min()
                latest = cached[timestamp_col].max()
                covers_request = (earliest <= req_startdate
                                  and latest >= req_enddate)
                if covers_request:
                    in_window = ((cached[timestamp_col] >= req_startdate)
                                 & (cached[timestamp_col] <= req_enddate))
                    return cached[in_window]

            return self.pym.load_df(req_startdate,
                                    req_enddate,
                                    table=table,
                                    cols=cols,
                                    timestamp_col=timestamp_col)
        except Exception:
            logger.error('load_df', exc_info=True)

    def update_cards(self, dct):
        """Render one "<period> to date" card per entry, laid out in a row.

        Args:
            dct: mapping of period name (e.g. ``'week'``) to the value shown
                on that period's card.

        The finished HTML is written to ``self.KPI_card_div.text``. Errors
        are logged and swallowed, leaving the div untouched.
        """
        try:
            # Choose designs from the mapping that ``card`` itself indexes,
            # so a randomly chosen design always resolves.
            designs = list(self.KPI_card_css.keys())
            cards = ''.join(
                self.card(title=period + ' to date',
                          data=data,
                          card_design=random.choice(designs))
                for period, data in dct.items())

            text = """<div style="margin-top:100px;display:flex; flex-direction:row;">
                                                {}
                                                </div>""".format(cards)

            self.KPI_card_div.text = text

        except Exception:
            logger.error('update cards', exc_info=True)

    def reset_checkboxes(self, value='all', checkboxgroup=''):
        """Set the ``value`` of one registered checkbox group.

        Args:
            value: new value for the widget.
            checkboxgroup: key into ``self.checkboxgroup``.

        A missing key (or any other failure) is logged, not raised.
        """
        try:
            group = self.checkboxgroup[checkboxgroup]
            group.value = value
        except Exception:
            logger.error('reset checkboxes', exc_info=True)

    def first_date_in_quarter(self, timestamp):
        """Return midnight on the first day of ``timestamp``'s quarter.

        Args:
            timestamp: any object exposing ``year`` and ``month``.

        Returns:
            A ``datetime`` for the quarter start, or ``None`` when an error
            was logged.
        """
        try:
            # Quarters start in months 1, 4, 7 and 10; integer arithmetic
            # avoids the float round-trip.
            first_month = 3 * ((timestamp.month - 1) // 3) + 1
            return datetime(timestamp.year, first_month, 1)

        except Exception:
            logger.error('first date in quarter', exc_info=True)

    def first_date_in_period(self, timestamp, period):
        try:
            if period == 'week':
                start = timestamp - timedelta(days=timestamp.weekday())
            elif period == 'month':
                start = datetime(timestamp.year, timestamp.month, 1, 0, 0, 0)
            elif period == 'year':
                start = datetime(timestamp.year, 1, 1, 0, 0, 0)
            elif period == 'quarter':
                start = self.first_date_in_quarter(timestamp)
            return start
        except Exception:
            logger.error('period to date', exc_info=True)

    def period_to_date(self,
                       df,
                       timestamp=None,
                       timestamp_filter_col=None,
                       cols=[],
                       period='week'):
        try:
            if timestamp is None:
                timestamp = datetime.now()
                timestamp = datetime(timestamp.year, timestamp.month,
                                     timestamp.day, timestamp.hour, 0, 0)

            start = self.first_date_in_period(timestamp, period)
            # filter
            if timestamp_filter_col is None:
                timestamp_filter_col = self.timestamp_col

            #logger.warning('df:%s',df[timestamp_filter_col])

            df = df[(df[timestamp_filter_col] >= start)
                    & (df[timestamp_filter_col] <= timestamp)]
            if len(cols) > 0:
                df = df[cols]
            return df
        except Exception:
            logger.error('period to date', exc_info=True)

    def label_qtr_pop(y):
        try:
            curr_quarter = int((y.month - 1) / 3 + 1)
            start = datetime(y.year, 3 * curr_quarter - 2, 1)
            return abs((start - y).days)
        except Exception:
            logger.error('df label quarter', exc_info=True)

    def shift_period_range(self, period, start, end):
        """Move a ``(start, end)`` window back by exactly one period.

        Args:
            period: ``'week'``, ``'month'``, ``'quarter'`` or ``'year'``;
                any other value leaves the window where it is.
            start: window start.
            end: window end.

        Returns:
            The shifted ``(start, end)`` tuple, or ``None`` when an error
            was logged.
        """
        try:
            if period == 'week':
                step = timedelta(days=7)
            elif period == 'month':
                step = relativedelta(months=1)
            elif period == 'year':
                step = relativedelta(years=1)
            elif period == 'quarter':
                step = relativedelta(months=3)
            else:
                # Unknown period: hand the window back untouched.
                return start, end

            shifted_start = start - step
            shifted_end = end - step
            return shifted_start, shifted_end
        except Exception:
            logger.error('shift period range', exc_info=True)

    # label dates for period over period (pop)
    def label_dates_pop(self, df, period, timestamp_col):
        """Add a ``dayset`` column giving each row's position in its period.

        ``dayset`` is the day of week for ``'week'``, day of month for
        ``'month'``, day of year for ``'year'`` and the number of whole days
        since the quarter start for ``'quarter'``. An empty frame or an
        unknown period is returned without a ``dayset`` column. The input
        frame is never modified; a labelled copy is returned.

        Returns:
            The labelled frame, or ``None`` when an error was logged.
        """

        def days_into_quarter(stamp):
            try:
                first_month = 3 * ((stamp.month - 1) // 3) + 1
                quarter_start = datetime(stamp.year, first_month, 1)
                # timestamp minus start (not the reverse): a negative
                # timedelta floors ``.days`` and shifted every non-midnight
                # timestamp by one day.
                return (stamp - quarter_start).days
            except Exception:
                logger.error('df label quarter', exc_info=True)

        try:
            logger.debug('label dates pop input:%s', df.head(10))
            if len(df) > 0:
                stamps = df[timestamp_col]
                if period == 'week':
                    df = df.assign(dayset=stamps.dt.dayofweek)
                elif period == 'month':
                    df = df.assign(dayset=stamps.dt.day)
                elif period == 'year':
                    df = df.assign(dayset=stamps.dt.dayofyear)
                elif period == 'quarter':
                    # assign() returns a copy, matching the other branches
                    # instead of writing into the caller's frame.
                    df = df.assign(dayset=stamps.map(days_into_quarter))

            return df
        except Exception:
            logger.error('label data ', exc_info=True)

    def pop_include_zeros(self, df_period, plotcols, period):
        """Add an all-zero column for every expected period that has no data.

        The expected columns are the current period
        (``'0 <period>(s) prev(current)'``) plus ``'<i> <period>(s) prev'``
        for ``i`` in ``1 .. pop_history_periods - 1``, the same labels that
        ``period_over_period`` assigns.

        Args:
            df_period: frame that receives the missing columns (modified in
                place).
            plotcols: list of column names to plot (extended in place).
            period: period name used inside the labels.

        Returns:
            ``(df_period, sorted(plotcols))``, or ``None`` when an error was
            logged.
        """
        try:
            zeros = [0] * len(df_period)

            current_title = '0 {}(s) prev(current)'.format(period)
            if current_title not in plotcols:
                df_period[current_title] = zeros
                plotcols.append(current_title)
                logger.debug('pop include zeros, cols to plot:%s', plotcols)

            # The setting may arrive as a string from a select widget.
            if isinstance(self.pop_history_periods, str):
                self.pop_history_periods = int(self.pop_history_periods)

            # Build each history label from the period itself. Deriving it
            # from plotcols[0] produced '1 week(s) prev(current)' whenever
            # the current-period title happened to be first in the list.
            for i in range(1, self.pop_history_periods):
                history_title = '{} {}(s) prev'.format(i, period)
                if history_title not in plotcols:
                    df_period[history_title] = zeros
                    plotcols.append(history_title)

            logger.debug('plotcols at end of pop include zeros:%s', plotcols)

            return df_period, sorted(plotcols)
        except Exception:
            logger.error('pop include zeros', exc_info=True)

    def period_over_period(self,
                           df,
                           start_date,
                           end_date,
                           period,
                           history_periods=2,
                           timestamp_col='timestamp_of_first_event'):
        try:
            # filter cols if necessary
            string = '0 {}(s) prev(current)'.format(period)

            # filter out the dates greater than today
            df_current = df.assign(period=string)
            # label the days being compared with the same label
            if len(df_current) > 0:
                df_current = self.label_dates_pop(df_current, period,
                                                  timestamp_col)

            # zero out time information
            start = datetime(start_date.year, start_date.month, start_date.day,
                             0, 0, 0)
            end = datetime(end_date.year, end_date.month, end_date.day, 0, 0,
                           0)

            cols = list(df.columns)
            logger.warning(' Line 293 %s:df %s', period, df.head(10))
            logger.warning(' Line 293 %s:df cols %s', period, cols)

            counter = 1
            if isinstance(history_periods, str):
                history_periods = int(history_periods)
            # make dataframes for request no. of periods
            start, end = self.shift_period_range(period, start, end)
            while counter < history_periods and start >= self.initial_date:
                # load data
                if period == 'quarter':
                    logger.warning('start:end %s:%s', start, end)
                if 'bcc' in self.table:
                    df_temp = self.load_df_pym(start, end, cols, timestamp_col)
                else:
                    df_temp = self.load_df(start, end, cols, timestamp_col)
                if df_temp is not None:
                    if len(df_temp) > 1:
                        string = '{} {}(s) prev'.format(counter, period)
                        # label period
                        df_temp = df_temp.assign(period=string)
                        # relabel days to get matching day of week,doy, dom, for different periods
                        df_temp = self.label_dates_pop(df_temp, period,
                                                       timestamp_col)
                        #logger.warning('df temp loaded for %s previous: %s',counter,len(df_temp))

                        df_current = concat_dfs(df_current, df_temp)
                        del df_temp
                        gc.collect()

                # shift the loading window
                counter += 1
                start, end = self.shift_period_range(period, start, end)
                if period == 'week':
                    logger.warning('LINE 327 df_current:%s',
                                   df_current.head(10))

            return df_current
        except Exception:
            logger.error('period over period', exc_info=True)

    def pop_week(self, launch=-1):
        """Build the week-over-week graph.

        Delegates to ``self.graph_period_over_period('week')``. ``launch``
        is accepted but not used here. Errors are logged and ``None`` is
        returned.
        """
        period = 'week'
        try:
            graph = self.graph_period_over_period(period)
        except Exception:
            logger.error('pop week', exc_info=True)
            return None
        return graph

    def pop_month(self, launch=-1):
        """Build the month-over-month graph.

        Delegates to ``self.graph_period_over_period('month')``. ``launch``
        is accepted but not used here. Errors are logged and ``None`` is
        returned.
        """
        period = 'month'
        try:
            graph = self.graph_period_over_period(period)
        except Exception:
            logger.error('pop month', exc_info=True)
            return None
        return graph

    def pop_quarter(self, launch=-1):
        """Build the quarter-over-quarter graph.

        Delegates to ``self.graph_period_over_period('quarter')``.
        ``launch`` is accepted but not used here. Errors are logged and
        ``None`` is returned.
        """
        period = 'quarter'
        try:
            graph = self.graph_period_over_period(period)
        except Exception:
            logger.error('pop quarter', exc_info=True)
            return None
        return graph

    def pop_year(self, launch=-1):
        """Build the year-over-year graph.

        Delegates to ``self.graph_period_over_period('year')``. ``launch``
        is accepted but not used here. Errors are logged and ``None`` is
        returned.
        """
        period = 'year'
        try:
            graph = self.graph_period_over_period(period)
        except Exception:
            logger.error('pop year', exc_info=True)
            return None
        return graph

    """
     To enable comparison across periods, dates must have a label relative to the period start.
     Place the dates in columns to be able to plot multi-line/bar graphs.
     
    """

    def split_period_into_columns(self, df, col_to_split, value_to_copy):
        """Spread one value column over a column per distinct label.

        For every distinct value ``item`` of ``df[col_to_split]`` a column
        named ``item`` is added; it holds ``df[value_to_copy]`` on the rows
        whose label equals ``item`` and ``0`` on all other rows.

        Args:
            df: frame to extend (modified in place and also returned).
            col_to_split: column holding the labels.
            value_to_copy: column holding the values to distribute.

        Returns:
            ``df`` with the new columns, or ``None`` when an error was
            logged.
        """
        try:
            for item in df[col_to_split].unique():
                # Vectorised mask instead of a row-by-row apply: same values,
                # one pass over the column per label.
                df[item] = df[value_to_copy].where(df[col_to_split] == item,
                                                   0)
            return df
        except Exception:
            logger.error('split period into column', exc_info=True)

    # -----------------------  UPDATERS  ------------------------------------------
    def card(self, title, data, width=200, height=200, card_design='folders'):
        """Return the HTML for a single KPI card.

        Args:
            title: heading shown on the card.
            data: value shown under the heading.
            width: card width in pixels.
            height: card height in pixels.
            card_design: key into ``self.KPI_card_css`` selecting the inline
                CSS placed in the card's ``style`` attribute.

        Returns:
            The HTML string, or ``None`` when an error was logged (for
            example an unknown ``card_design``).
        """
        template = """
            <div style="flex: 1 1 0px;border: 1px solid black;{};width:{}px;
                        height:{}px;border-right=10px;">
                <h3>
                    {}
                </h3>
                </br>
                {}
            </div>"""
        try:
            design_css = self.KPI_card_css[card_design]
            return template.format(design_css, width, height, title, data)
        except Exception:
            logger.error('card', exc_info=True)

    def notification_updater(self, text):
        """Show ``text`` as a banner in every registered notification div.

        The banner is ``self.page_width`` pixels wide and 50 pixels high and
        is written to the ``text`` attribute of each value in
        ``self.notification_div``.
        """
        banner_height = 50
        banner = """<hr/><div style="text-align:center;width:{}px;height:{}px;
                              position:relative;background:black;">
                              <h1 style="color:#fff;margin-bottom:300px">{}</h1>
                        </div>""".format(self.page_width, banner_height, text)
        for div in self.notification_div.values():
            div.text = banner

    """
        update the section labels on the page

    """

    def section_header_updater(self, section, label='all'):
        """Rewrite the heading of one page section.

        Args:
            section: ``'cards'`` (period-to-date cards) or ``'pop'``
                (period-over-period graphs); also the key into
                ``self.section_headers``.
            label: variable currently displayed. For ``'cards'`` it selects
                the descriptive suffix; for ``'pop'`` it is shown pluralised
                unless it is ``'all'``, ``''`` or ``'remuneration'``.

        Raises:
            ValueError: if ``section`` is neither ``'cards'`` nor ``'pop'``.
        """
        # Suffix per card variable, keyed by the singular label. The label
        # is matched BEFORE pluralising; pluralising first meant none of
        # these except 'remuneration' could ever match.
        card_suffixes = {
            'remuneration': '$ spent',
            'project': '# of projects',
            'delay_start': 'Mean delay in start projects(hours)',
            'delay_end': 'Mean project overrun(hours)',
            'project_duration': 'Mean project duration (days)',
            'task_duration': 'Total project person hours)',
        }

        if section == 'cards':
            text = "Period to date:" + card_suffixes.get(label, '')
        elif section == 'pop':
            if label not in ['all', '', 'remuneration']:
                label = label + 's'
            text = "Period over period:{}".format(label)
        else:
            raise ValueError('unknown section: {!r}'.format(section))

        txt = """<h2 style="color:#4221cc;">{}-----------------------------------------------------------------</h2>"""\
            .format(text)
        self.section_headers[section].text = txt

    # -------------------- CALCULATE KPIs DEVELOPED FROM VARIABLES WITH STATISTICALLY SIGNIFICANT EFFECT
    def card_text(self, title, data, card_design='folders'):
        """Return a minimal HTML card: a div with a heading and a value.

        Args:
            title: heading text.
            data: value shown after the heading.
            card_design: key into ``self.KPI_card_css``; the looked-up text
                is placed directly inside the opening ``<div ...>`` tag.

        Returns:
            The HTML string, or ``None`` when an error was logged.
        """
        template = """
            <div {}>
            <h3>{}</h3></br>{}
            </div>
            """
        try:
            div_attributes = self.KPI_card_css[card_design]
            return template.format(div_attributes, title, data)
        except Exception:
            logger.error('card text', exc_info=True)

    def match_sigvars_to_coin_vars(self, df, interest_var):
        """Find the columns of ``df`` named after a significant feature.

        The feature list is read from redis. The key is
        ``self.redis_stat_sig_key + '-' + interest_var``, except that when
        ``interest_var`` contains one of the generic variable names (for
        example ``'fork'`` in ``'aion_fork'``) the key ends in that generic
        name instead. ``self.sig_effect_dict`` is reset to ``{}`` as a side
        effect.

        Args:
            df: frame whose column names are searched.
            interest_var: variable of interest.

        Returns:
            ``{column: 'sum'}`` for every column whose name contains one of
            the stored features (possibly empty), or ``None`` when an error
            was logged.
        """
        try:
            key = self.redis_stat_sig_key + '-' + interest_var
            key_parts = key.split('-')
            generic_names = (
                'release', 'watch', 'push', 'issue', 'fork', 'open', 'high',
                'low', 'close', 'volume', 'market_cap'
            )
            # Strip the crypto name: use the first generic name found in the
            # last key segment, if any.
            generic = next(
                (name for name in generic_names if name in key_parts[-1]),
                None)
            if generic is not None:
                key = key_parts[-2] + '-' + generic

            stored = self.redis.simple_load(key)
            self.sig_effect_dict = {}

            if stored is None or 'features' not in stored.keys():
                return {}
            features = stored['features']
            if len(features) == 0:
                return {}

            return {
                col: 'sum'
                for col in df.columns
                if any(feature in col for feature in features)
            }
        except Exception:
            logger.error('match sig vars to coin vars', exc_info=True)

    def calc_sig_effect_card_data(self, df, interest_var, period):
        """Build one ratio card per statistically significant feature.

        For every significant column the card shows
        ``sum(interest_var) / sum(column)`` rounded to 3 places: ``0`` when
        the numerator sum is zero and ``'*'`` when only the denominator sum
        is zero. Each title/estimate pair is also stored in
        ``self.sig_effect_dict``.

        Args:
            df: frame holding ``interest_var`` and the feature columns.
            interest_var: variable of interest, e.g. ``'aion_fork'``.
            period: not used by this method.

        Returns:
            The concatenated card HTML, or ``None`` when there are no
            significant features or an error was logged.
        """
        try:

            significant_features = self.match_sigvars_to_coin_vars(
                df, interest_var=interest_var)
            if len(significant_features) > 0:
                cols = [interest_var] + list(significant_features.keys())
                tmp_df = df[cols]
                numer = tmp_df[interest_var].sum()

                variable_of_interest_tmp = interest_var.split('_')
                # 'watch' + 'e' so that the "{}s" title reads "watches"
                if variable_of_interest_tmp[-1] in ['watch']:
                    variable_of_interest_tmp[-1] += 'e'

                designs = list(self.KPI_card_css.keys())
                cards = []
                for var in significant_features.keys():
                    point_estimate = 0
                    var_tmp = var.split(
                        '_')  # slice out the 'fork' from 'aion_fork'
                    if numer != 0:
                        denom = tmp_df[var].sum()
                        point_estimate = '*'
                        if denom != 0:
                            point_estimate = round(numer / denom, 3)

                    self.sig_effect_dict[var] = {
                        'title':
                        "{}s per {}".format(variable_of_interest_tmp[-1],
                                            var_tmp[-1]),
                        'point_estimate':
                        point_estimate
                    }

                    cards.append(
                        self.card(
                            title=self.sig_effect_dict[var]['title'],
                            data=self.sig_effect_dict[var]['point_estimate'],
                            card_design=random.choice(designs)))

                # Return only after every feature has its card; returning
                # from inside the loop emitted the first card alone.
                return ''.join(cards)

        except Exception:
            logger.error('make sig effect columns', exc_info=True)

    def update_significant_DV_cards(self, dct):
        """Show the pre-rendered card HTML of every period in one column.

        Args:
            dct: mapping of period name to an HTML string; the values are
                concatenated in the mapping's iteration order.

        The result is written to ``self.KPI_card_div.text``. Errors are
        logged and swallowed.
        """
        try:
            cards_html = ''.join(dct.values())
            wrapper = """<div style="margin-top:100px;display:flex; flex-direction:column;">
                                            {}
                       </div>"""
            self.KPI_card_div.text = wrapper.format(cards_html)

        except Exception:
            logger.error('update cards', exc_info=True)

    def payroll_to_date(self, period, today=None):
        """Estimate the payroll spent so far in the current period.

        The estimate is ``self.payroll['week']`` multiplied by a week count:
        the ISO week number for ``'year'``, the elapsed fraction of the week
        (``weekday / 7``) for ``'week'``, and the number of weeks started so
        far for ``'month'`` and ``'quarter'`` (never less than one).

        Args:
            period: ``'week'``, ``'month'``, ``'quarter'`` or ``'year'``.
            today: reference ``datetime``; defaults to the current time.

        Returns:
            The amount rounded to 2 decimals, or ``None`` for an unknown
            period or when an error was logged.
        """
        try:
            # Read the clock once so all parts of the calculation agree.
            if today is None:
                today = datetime.today()
            weekly = self.payroll['week']

            if period == 'year':
                weeks = today.isocalendar()[1]
            elif period == 'week':
                weeks = today.weekday() / 7
            elif period == 'month':
                weeks = floor(today.day / 7) + 1  # no zero week allowed
            elif period == 'quarter':
                start = self.first_date_in_quarter(today)
                weeks = floor((abs(today - start).days + 1) / 7) + 1
            else:
                logger.error('payroll to date: unknown period %s', period)
                return None

            return round(weekly * weeks, 2)
        except Exception:
            logger.error('payroll to date', exc_info=True)

    """
        Group the data by resample period and make ratios between
        the significant variables and the variable of interest.
    """

    def make_significant_ratios_df(self, df, resample_period, interest_var,
                                   timestamp_col):
        """Resample the data and express the interest variable per feature.

        The significant feature columns and ``interest_var`` are summed per
        ``resample_period``; for each feature a column named after the last
        ``_``-separated part of the feature name is added holding
        ``interest_var / feature``. The summed source columns are then
        dropped and missing values replaced by 0.

        Args:
            df: frame with ``timestamp_col``, ``interest_var`` and the
                feature columns. It must expose ``map_partitions`` after
                resampling - presumably a dask dataframe; confirm.
            resample_period: rule passed to ``resample``.
            interest_var: numerator of every ratio.
            timestamp_col: column that becomes the index for resampling.

        Returns:
            The frame of ratios, or ``None`` when an error was logged.
        """
        try:

            def add_ratio(part, denominator_col, ratio_col):
                # interest_var / denominator, stored under ratio_col
                part = part.assign(
                    result=part[interest_var] / part[denominator_col])
                return part.rename(columns={'result': ratio_col})

            aggregations = self.match_sigvars_to_coin_vars(df, interest_var)
            # include interest var in aggregations
            aggregations[interest_var] = 'sum'
            source_cols = list(aggregations.keys())
            # short output names: the last '_' segment of each column name
            ratio_names = [name.split('_')[-1] for name in source_cols]

            df = df.set_index(timestamp_col)
            df = df.resample(resample_period).agg(aggregations)

            for source_col, ratio_name in zip(source_cols, ratio_names):
                if source_col == interest_var:
                    continue  # no ratio of the variable with itself
                df = df.map_partitions(add_ratio, source_col, ratio_name)

            df = df.drop(source_cols, axis=1)
            df = df.fillna(0)
            return df
        except Exception:
            logger.error('significant ratios', exc_info=True)

Exemple #15

0

Afficher le fichier

Fichier : mytab_interface.py Projet : andre-aion/analytics_demo

class Mytab:
    def __init__(self, table, cols, dedup_cols, panel_title=None):
        """Initialise the state shared by every tab.

        Args:
            table: table name; stored and passed to the streaming dataframe.
            cols: column list; stored and passed to the streaming dataframe.
            dedup_cols: de-duplication columns; stored and passed to the
                streaming dataframe.
            panel_title: optional title, stored as given.
        """
        # constructor arguments
        self.panel_title = panel_title
        self.table = table
        self.cols = cols
        self.dedup_cols = dedup_cols

        # plain per-tab state
        self.locals = {}  # stuff local to each tab
        self.df1 = None
        self.params = None
        self.load_params = None
        self.key_tab = ''  # for key composition in redis
        self.construction_tables = {}
        self.tier1_miners_list = []
        self.tier2_miners_list = []

        # constants and display settings
        self.DATEFORMAT = "%Y-%m-%d %H:%M:%S"
        self.ToA_THRESH = {  # Tests of association (TOA)
            'STRONG': .65,
            'MODERATE': .4,
            'WEAK': .25
        }
        self.menus = {'resample_periods': ['D', 'W', 'M', 'Q']}
        self.resample_period = self.menus['resample_periods'][0]
        self.pvalue_thresh = 0.1
        self.page_width = 1200

        # data frame holder; self.df starts as the streaming frame's df
        self.streaming_dataframe = SD(table, cols, dedup_cols)
        self.df = self.streaming_dataframe.df

        # address -> pool name lookup
        self.poolname_dict = self.get_poolname_dict()

        # storage clients
        self.pq = PythonParquet()
        self.ch = PythonClickhouse('aion')
        self.redis = PythonRedis()
        self.conn = self.redis.conn

    # designed to work with premade warehouse table
    def df_load(self,
                req_start_date,
                req_end_date,
                timestamp_col='block_timestamp',
                cols=[],
                supplemental_where=None):
        params = {'start': False, 'end': False}

        try:
            if self.df is not None:
                if len(self.df) > 0:
                    # if in memory simply filter
                    params['min_date'], params['max_date'] = \
                        dd.compute(self.df[timestamp_col].min(), self.df[timestamp_col].max())

                    for key in params.keys():
                        if isinstance(params[key], date):
                            params[key] = datetime.combine(
                                params[key], datetime.min.time())

                    req_start_date = pd.to_datetime(req_start_date)
                    req_end_date = pd.to_datetime(req_end_date)

                    # check start
                    #logger.warning('start_date from compute:%s', params['min_date'])
                    #logger.warning('start from slider:%s', req_start_date)

                    # set flag to true if data is in memory
                    if req_start_date >= params['min_date']:
                        params['start'] = True
                    if req_end_date <= params['max_date']:
                        params['end'] = True

            # entire frame in memory
            key_params = [self.table, self.key_tab]

            if params['start'] and params['end']:
                self.filter_df(req_start_date, req_end_date)
                logger.warning("DF LOADED FROM MEMORY:%s", self.table)
            # no part in memory, so construct/load from clickhouse
            mintime = time(00, 00, 00)
            if isinstance(req_end_date, date):
                req_end_date = datetime.combine(req_end_date, mintime)
            if isinstance(req_start_date, date):
                req_start_date = datetime.combine(req_start_date, mintime)
            req_end_date = req_end_date + timedelta(
                days=1)  #move end_date to midnite

            self.df = self.ch.load_data(self.table,
                                        cols,
                                        req_start_date,
                                        req_end_date,
                                        timestamp_col=timestamp_col,
                                        supplemental_where=supplemental_where)
            self.filter_df(req_start_date, req_end_date)
            #logger.warning("%s LOADED: %s:%s",self.table,req_start_date,req_end_date)

        except Exception:
            logger.error("df_load:", exc_info=True)

    def str_to_date(self, x):
        if isinstance(x, str):
            logger.warning("STR TO DATETIME CONVERSION:%s", x)
            return datetime.strptime(x, "%Y-%m-%d %H:%M:%S")
        return x

    def filter_df(self, start_date, end_date):
        self.df1 = self.df

    def divide_by_day_diff(self, x):
        y = x / self.day_diff
        logger.warning('Normalization:before:%s,after:%s', x, y)
        return y

    def normalize(self, df, timestamp_col):
        """Divide selected columns of ``df`` by ``self.day_diff``.

        The span in whole days between the smallest and largest
        ``timestamp_col`` value is computed; columns are only touched when
        that span is greater than zero.

        Returns ``df``, or ``None`` when an exception is raised (logged).
        """
        try:
            logger.warning('timestamp col in normalize:%s', timestamp_col)
            min_date, max_date = dd.compute(df[timestamp_col].min(),
                                            df[timestamp_col].max())
            day_diff = abs((max_date - min_date).days)
            if day_diff > 0:
                # NOTE(review): the type test is on the column *label*, not
                # the column's dtype, so string-named columns are never
                # selected -- confirm this is intended.
                selected = [
                    label for label in df.columns
                    if isinstance(label, (int, float))
                ]
                for col in selected:
                    logger.warning("NORMALIZATION ONGOING FOR %s", col)
                    # NOTE(review): divide_by_day_diff divides by
                    # self.day_diff, not the day_diff computed above.
                    df[col] = df[col].map(self.divide_by_day_diff)
            logger.warning("NORMALIZATION ended for day-diff:%s days",
                           day_diff)
            return df
        except Exception:
            logger.error('normalize:', exc_info=True)

    def get_poolname_dict(self):
        """Build an ``address -> poolname`` dict from ``poolinfo.csv``.

        The CSV path is resolved relative to this module's directory and
        read with pandas; its ``address`` and ``poolname`` columns are
        paired row by row.
        """
        csv_path = join(dirname(__file__), '../../../data/poolinfo.csv')
        pools = pd.read_csv(csv_path)
        addresses = pools['address'].tolist()
        names = pools['poolname'].tolist()
        return dict(zip(addresses, names))

    def poolname_verbose(self, x):
        # add verbose poolname
        if x in self.poolname_dict.keys():
            return self.poolname_dict[x]
        return x

    def poolname_verbose_trun(self, x):
        if x in self.poolname_dict.keys():
            return self.poolname_dict[x]
        else:
            if len(x) > 10:
                return x[0:10]
        return x

    def notification_updater(self, text):
        txt = """<hr/><div style="text-align:center;width:{}px;height:{}px;
                              position:relative;background:black;">
                              <h1 style="color:#fff;margin-bottom:300px">{}</h1>
                        </div>""".format(self.page_width, 50, text)
        for key in self.notification_div.keys():
            self.notification_div[key].text = txt

    def group_data(self,
                   df,
                   groupby_dict=None,
                   timestamp_col='block_timestamp'):
        """Normalize ``df``, then aggregate it per ``self.interest_var``.

        Args:
            df: frame to aggregate. ``.compute()`` is called on the grouped
                result, so presumably a dask DataFrame -- TODO confirm.
            groupby_dict: column -> aggregation mapping passed to ``agg``;
                ``None`` (the default) is replaced by a new empty dict.
            timestamp_col: timestamp column handed to ``normalize``.

        Returns:
            The computed frame with the index reset, any column named
            ``'index'`` dropped, and missing values filled with 0.
        """
        # A fresh dict per call instead of one shared mutable default.
        groupby_dict = {} if groupby_dict is None else groupby_dict

        # normalize columns by number of days under consideration
        df = self.normalize(df, timestamp_col)
        # NOTE(review): interest_var is not set in this class's __init__;
        # presumably a subclass provides it -- confirm.
        df = df.groupby([self.interest_var]).agg(groupby_dict).compute()

        df = df.reset_index()
        if 'index' in df.columns.tolist():
            df = df.drop('index', axis=1)
        return df.fillna(0)

    def divide_by_day_diff(self, x):
        y = x / self.day_diff
        logger.warning('Normalization:before:%s,after:%s', x, y)
        return y

    def make_filepath(self, path):
        return join(dirname(__file__), path)

    # ######################################################

    def is_in_memory(self, table, req_start_date, req_end_date):
        str_req_start_date = datetime.strftime(req_start_date, '%Y-%m-%d')
        str_req_end_date = datetime.strftime(req_end_date, '%Y-%m-%d')
        logger.warning('set_load_params-req_start_date:%s', str_req_start_date)
        logger.warning('set_load_params-req_end_date:%s', str_req_end_date)
        params = dict()
        params['in_memory'] = False
        params['key'] = None

        try:
            # get keys
            str_to_match = '*' + table + ':*'
            matches = self.conn.scan_iter(match=str_to_match)

            if matches:
                for redis_key in matches:
                    redis_key_encoded = redis_key
                    redis_key = str(redis_key, 'utf-8')
                    logger.warning('redis_key:%s', redis_key)

                    redis_key_list = redis_key.split(':')
                    logger.warning('matching keys :%s', redis_key_list)
                    # convert start date in key to datetime
                    key_start_date = datetime.strptime(redis_key_list[-2],
                                                       '%Y-%m-%d')
                    key_end_date = datetime.strptime(redis_key_list[-1],
                                                     '%Y-%m-%d')

                    # check to see if there is all data to be retrieved from reddis
                    logger.warning('req_start_date:%s', req_start_date)
                    logger.warning('key_start_date:%s', key_start_date)

                    # matches to delete: make a list
                    if req_start_date >= key_start_date and req_end_date <= key_end_date:
                        """
                        required_df      || ---------------- ||
                        redis_df  | -------------------------------|
                        """
                        params['in_memory'] = True
                        params['key'] = redis_key_encoded
                        break
            return params
        except Exception:
            logger.error("is_in_memory", exc_info=True)
            params = dict()
            params['in_memory'] = False
            params['key'] = None
            return params

    # ///////////////////////// TESTS OF ASSOCIATION ///////////////////////////
    # perform correlation, and label according to r,pvalue
    def corr_label(self, a, b):
        try:
            slope, intercept, rvalue, pvalue, std_err = linregress(a, b)
            logger.warning(
                'slope:%s,intercept:%s,rvalue:%s,pvalue:%s,std_err:%s', slope,
                intercept, rvalue, pvalue, std_err)
            if pvalue < self.pvalue_thresh:
                if abs(rvalue) <= self.ToA_THRESH['WEAK']:
                    txt = 'none'
                else:
                    strength = 'weak'
                    if rvalue > 0:
                        direction = 'positive'
                    if rvalue < 0:
                        direction = 'negative'
                    if abs(rvalue) > self.ToA_THRESH['STRONG']:
                        strength = 'strong'
                    elif abs(rvalue) > self.ToA_THRESH['MODERATE']:
                        strength = 'moderate'

                    txt = "{} {}".format(strength, direction)
            else:
                txt = 'Not significant'

            return slope, intercept, rvalue, pvalue, txt
        except Exception:
            logger.error('corr label', exc_info=True)

    def mann_whitneyu_label(self, a, b):
        try:
            stat, pvalue = mannwhitneyu(a, b, alternative='two-sided')
            logger.warning('stat:%s,pvalue:%s', stat, pvalue)
            if pvalue < self.pvalue_thresh:
                txt = 'No'
            else:
                txt = 'Yes'

            return stat, pvalue, txt
        except Exception:
            logger.error('non parametric label', exc_info=True)

    def mutual_information_label(self, a, b):
        try:
            stat = linregress(a, b)
            logger.warning('stat:%s', stat)
            if abs(stat) > self.ToA_THRESH['STRONG']:
                strength = 'strong'
            elif abs(stat) > self.ToA_THRESH['MODERATE']:
                strength = 'moderate'
            elif abs(stat) > self.ToA_THRESH['WEAK']:
                strength = 'weak'
            else:
                strength = 'None'

            txt = strength
            return stat, txt
        except Exception:
            logger.error('corr label', exc_info=True)