def construct_from_redis(key_lst,
                         item_type='list',
                         df=None,
                         table=None,
                         df_cols=None,
                         dedup_cols=None):
    """Rebuild cached data from a list of redis keys.

    :param key_lst: redis keys to load; returns None when empty
    :param item_type: 'list' -> load and return every keyed item;
        anything else -> treat keys as warehouse slices and reload the
        covering date range from the database
    :param df, table, df_cols, dedup_cols: kept for interface
        compatibility with existing callers (df is no longer read)
    :return: list of loaded items, a dataframe (tab.df1), or None
    """
    try:
        redis = PythonRedis()
        if not key_lst:
            return None

        if item_type == 'list':
            # BUG FIX: the original returned inside the loop, so only the
            # first key was ever loaded; accumulate all keys first.
            items = []
            for key in key_lst:
                items.append(redis.load([], '', '', key, item_type))
            return items

        # Warehouse path: each key ends with ':<start>:<end>' dates.
        # Collect every start/end so we can load one covering window.
        start_list = []
        end_list = []
        for key in key_lst:
            logger.warning('key for churned load:%s', key)
            parts = key.split(':')
            if parts[-1] != '':
                start_list.append(
                    datetime.strptime(parts[-2] + ' 00:00:00',
                                      '%Y-%m-%d %H:%M:%S'))
                end_list.append(
                    datetime.strptime(parts[-1] + ' 00:00:00',
                                      '%Y-%m-%d %H:%M:%S'))

        tab = Mytab('block_tx_warehouse',
                    cols['block_tx_warehouse']['models'], [])
        if start_list:
            # load the minimum start to maximum end from the database
            tab.key_tab = 'models'
            tab.df_load(min(start_list), max(end_list))
            logger.warning('TRACKER:%s', tab.df.tail(10))
        # tab.df1 is populated by df_load; None when nothing was loaded
        return tab.df1
    except Exception:
        logger.error("construct from redis/clickhouse", exc_info=True)
    def __init__(self, table, cols, dedup_cols, panel_title=None):
        """Initialise per-tab state: dataframes, DB/redis clients and
        display defaults.

        :param table: name of the backing warehouse table
        :param cols: columns managed by this tab
        :param dedup_cols: columns used for de-duplication by the streaming df
        :param panel_title: optional title shown on the bokeh panel
        """
        self.panel_title = panel_title
        self.table = table
        self.cols = cols
        self.locals = dict()  # stuff local to each tab
        self.streaming_dataframe = SD(table, cols, dedup_cols)
        self.df = self.streaming_dataframe.df
        self.df1 = None
        self.dedup_cols = dedup_cols
        self.params = None
        # BUG FIX: load_params was first set to dict() and then immediately
        # clobbered with None a few lines later; keep only the final value.
        self.load_params = None
        self.poolname_dict = self.get_poolname_dict()
        self.key_tab = ''  # for key composition in redis
        self.construction_tables = {}
        self.tier1_miners_list = []
        self.tier2_miners_list = []
        self.pq = PythonParquet()
        self.ch = PythonClickhouse('aion')
        self.redis = PythonRedis()
        self.conn = self.redis.conn
        self.DATEFORMAT = "%Y-%m-%d %H:%M:%S"
        self.ToA_THRESH = {  # Tests of association (TOA)
            'STRONG': .65,
            'MODERATE': .4,
            'WEAK': .25
        }
        self.menus = {'resample_periods': ['D', 'W', 'M', 'Q']}
        self.resample_period = self.menus['resample_periods'][0]
        self.pvalue_thresh = 0.1

        self.page_width = 1200
# Example #3
from scripts.databases.pythonRedis import PythonRedis
from scripts.utils.mylogger import mylogger

logger = mylogger(__file__)  # module-level logger bound to this file
redis = PythonRedis()  # shared redis client for this module


# Example #4
def crypto_clusters_eda_tab(cryptos, panel_title):
    """Build the bokeh EDA tab that compares cryptocurrency clusters.

    :param cryptos: list of crypto identifiers to analyse
    :param panel_title: title for the returned bokeh Panel
    """
    global groupby_dict
    global features
    global cluster_dct
    #global source

    # load the precomputed cluster assignments cached in redis
    redis = PythonRedis()
    cluster_dct = redis.simple_load('clusters:cryptocurrencies')
    if cluster_dct is not None:
        # every clustering feature is aggregated by sum when resampling
        groupby_dict = {}
        for var in cluster_dct['features']:
            groupby_dict[var] = 'sum'

        features = cluster_dct['features']
        # one ColumnDataSource per feature; filled later by graph_ts().
        # NOTE: 'source' is deliberately a function-scope (closure) variable
        # (the 'global source' above is commented out); if cluster_dct is
        # None it is never bound and later references will raise NameError.
        source = {}
        for feature in features:
            source[feature] = ColumnDataSource(
                data=dict(xs=[], ys=[], labels=[], colors=[]))
    class Thistab(Mytab):
        # Tab-specific specialisation of Mytab for the cluster-comparison
        # panel.  Captures cryptos/cluster_dct/groupby_dict/features from
        # the enclosing crypto_clusters_eda_tab scope.
        def __init__(self, table, cols, dedup_cols=[]):
            # NOTE(review): mutable default argument (dedup_cols=[]); it is
            # only passed through here, but worth fixing at source.
            Mytab.__init__(self,
                           table,
                           cols,
                           dedup_cols,
                           panel_title=panel_title)
            self.table = table
            self.cols = cols
            self.DATEFORMAT = "%Y-%m-%d %H:%M:%S"
            self.df = None
            self.df1 = None
            self.df_predict = None
            self.day_diff = 1  # for normalizing for classification periods of different lengths
            self.df_grouped = ''

            self.cl = PythonClickhouse('aion')
            self.items = cryptos
            # add all the coins to the dict
            self.github_cols = [
                'watch', 'fork', 'issue', 'release', 'push', 'tw_mentions',
                'tw_positive', 'tw_compound', 'tw_neutral', 'tw_negative',
                'tw_emojis_positive', 'tw_emojis_compound',
                'tw_emojis_negative', 'tw_emojis_count', 'tw_reply_hashtags'
            ]
            self.index_cols = ['close', 'high', 'low', 'market_cap', 'volume']

            self.trigger = 0
            # banner HTML shared by the top and bottom notification Divs
            txt = """<div style="text-align:center;background:black;width:100%;">
                                                                           <h1 style="color:#fff;">
                                                                           {}</h1></div>""".format(
                'Welcome')
            self.notification_div = {
                'top': Div(text=txt, width=1400, height=20),
                'bottom': Div(text=txt, width=1400, height=10),
            }
            # cluster metadata captured from the enclosing function scope
            self.cluster_dct = cluster_dct
            self.groupby_dict = groupby_dict
            self.features = features
            self.crypto = 'all'

            self.div_style = """ style='width:350px; margin-left:25px;
                                    border:1px solid #ddd;border-radius:3px;background:#efefef50;' 
                                    """

            self.header_style = """ style='color:blue;text-align:center;' """

            self.significant_effect_dict = {}
            # NOTE(review): df1 is assigned None twice in this constructor
            self.df1 = None
            self.section_headers = {
                'ts':
                self.section_header_div(
                    'Comparison of clusters across variables:---------------------',
                    width=600)
            }
            self.timestamp_col = None
            self.colors = None

        # ----------------------  DIVS ----------------------------
        def section_header_div(self, text, html_header='h2', width=1400):
            """Wrap *text* in a coloured header tag and return it as a Div."""
            markup = f'<{html_header} style="color:#4221cc;">{text}</{html_header}>'
            return Div(text=markup, width=width, height=15)

        def information_div(self, width=400, height=300):
            """Return a Div holding the (currently placeholder) notes on how
            to interpret the cluster relationships."""
            markup = """
               <div {}>
               <h4 {}>How to interpret relationships </h4>
               <ul style='margin-top:-10px;'>
                   <li>
                   </li>
                   <li>
                   </li>
                   <li>
                   </li>
                   <li>
                   </li>
                    <li>
                   </li>
                    <li>
                   </li>
               </ul>
               </div>

               """.format(self.div_style, self.header_style)
            return Div(text=markup, width=width, height=height)

        # ////////////////////////// UPDATERS ///////////////////////
        def section_head_updater(self, section, txt):
            """Replace the text of the header Div registered under *section*.

            :param section: key into self.section_headers (e.g. 'ts')
            :param txt: new header text
            """
            try:
                # BUG FIX: originally indexed self.section_header_div (a
                # method), which always raised and logged; the header Divs
                # are stored in the self.section_headers dict.
                self.section_headers[section].text = txt
            except Exception:
                logger.error('', exc_info=True)

        def notification_updater(self, text):
            """Push *text* into every notification Div (top and bottom)."""
            txt = """<div style="text-align:center;background:black;width:100%;">
                    <h4 style="color:#fff;">
                    {}</h4></div>""".format(text)
            for banner in self.notification_div.values():
                banner.text = txt

        # /////////////////////////// LOAD CLUSTERS  //////////////////////
        def prep_data(self, df, timestamp_col):
            """Label each crypto with its cluster, resample per cluster, and
            store the result in self.df1.

            :param df: frame with self.features + 'crypto' + 'timestamp'
                columns; presumably a dask frame (it is .compute()'d below)
                — TODO confirm
            :param timestamp_col: name of the timestamp column
            """
            def label_cluster(x):
                # map a crypto name to its cluster key; unknown names pass
                # through unchanged
                for key, values in self.cluster_dct.items():
                    if key not in ['timestamp', 'variables']:
                        if x in values:
                            return key
                return x

            try:
                cols = self.features + ['crypto', 'timestamp']
                df = df[cols]
                # groupby and resample
                df['crypto'] = df['crypto'].map(lambda x: label_cluster(x))
                df = df.rename(columns={'crypto': 'cluster'})
                df = df.compute()
                df[timestamp_col] = pd.to_datetime(df[timestamp_col],
                                                   errors='coerce')
                df.set_index(timestamp_col, inplace=True)
                # aggregate each cluster per resample period using the
                # per-feature aggregations in self.groupby_dict
                df = df.groupby('cluster').resample(self.resample_period).agg(
                    self.groupby_dict)
                df.reset_index(inplace=True)
                df.set_index(timestamp_col, inplace=True)
                self.timestamp_col = timestamp_col
                self.df1 = df

            except Exception:
                logger.error('prep data', exc_info=True)

        def graph_ts(self):
            """Push the prepped cluster time series (self.df1) into the
            per-feature ColumnDataSources used by the multi-line plots."""
            try:
                #global source
                if self.df1 is not None:
                    df = self.df1.copy()
                    clusters = df['cluster'].unique()
                    # one colour per cluster, taken from the dashboard palette
                    self.colors = [''] * len(clusters)
                    for idx, feature in enumerate(clusters):
                        self.colors[idx] = dashboard_config['colors'][idx]
                    if self.features is not None:
                        for idx, feature in enumerate(self.features):
                            df1 = df[['cluster', feature]]
                            # pivot into columns for cluster
                            df1 = df1.pivot(columns='cluster')
                            data = dict(x=[df1.index.values] * len(clusters),
                                        y=[df1[name].values for name in df1],
                                        labels=clusters,
                                        colors=self.colors)
                            # 'source' is the dict built in the enclosing
                            # crypto_clusters_eda_tab scope
                            source[feature].data = data
            except Exception:
                logger.error('graph ts', exc_info=True)

        def graph_chartify(self, timestamp_col):
            """Build a chartify line chart from self.df1.

            NOTE(review): the 'return ch' sits inside the for loop, so only
            a chart for the FIRST feature is ever returned; confirm whether
            one chart per feature was intended (the caller is commented out
            below).
            """
            try:
                # global source
                if self.df1 is not None:
                    df = self.df1.copy()
                    df = df.reset_index()

                    for feature in self.features:
                        ch = chartify.Chart(blank_labels=True,
                                            x_axis_type='datetime')
                        ch.set_title("CHARTIFY")
                        ch.plot.line(
                            # Data must be sorted by x column
                            data_frame=df.sort_values(timestamp_col),
                            x_column=timestamp_col,
                            y_column=feature,
                            color_column='cluster')
                        return ch

            except Exception:
                logger.error('graph chartify', exc_info=True)

    def update():
        """Button callback: reload the picked date range and redraw."""
        thistab.notification_updater(
            "Calculations underway. Please be patient")
        thistab.df_load(datepicker_start.value,
                        datepicker_end.value,
                        timestamp_col='timestamp')
        thistab.prep_data(thistab.df, 'timestamp')
        thistab.graph_ts()
        thistab.notification_updater("Ready!")

    def update_resample(attrname, old, new):
        """Select-widget callback: re-resample the cached frame and redraw."""
        thistab.notification_updater(
            "Calculations underway. Please be patient")
        thistab.resample_period = resample_select.value
        thistab.prep_data(thistab.df, 'timestamp')
        thistab.graph_ts()
        thistab.notification_updater("ready")

    try:
        table = 'external_daily'
        thistab = Thistab(table, [], [])

        # setup dates
        first_date_range = datetime.strptime("2018-04-25 00:00:00",
                                             "%Y-%m-%d %H:%M:%S")
        # NOTE(review): last_date_range is a date while first_date_range is a
        # datetime — confirm DatePicker accepts the mix
        last_date_range = datetime.now().date()
        last_date = dashboard_config['dates']['last_date'] - timedelta(days=2)
        first_date = dashboard_config['dates']['current_year_start']
        # initial function call
        thistab.df_load(first_date,
                        last_date,
                        timestamp_col='timestamp',
                        cols=[])
        thistab.prep_data(thistab.df, timestamp_col='timestamp')

        # MANAGE STREAMS ---------------------------------------------------------

        # CREATE WIDGETS ----------------------------------------------------------------
        datepicker_start = DatePicker(title="Start",
                                      min_date=first_date_range,
                                      max_date=last_date_range,
                                      value=first_date)

        datepicker_end = DatePicker(title="End",
                                    min_date=first_date_range,
                                    max_date=last_date_range,
                                    value=last_date)

        load_dates_button = Button(
            label="Select dates/periods, then click me!",
            width=20,
            height=8,
            button_type="success")

        resample_select = Select(title='Select summary period',
                                 value=thistab.resample_period,
                                 options=thistab.menus['resample_periods'])

        # -------------------------------- PLOTS ---------------------------
        thistab.graph_ts()
        # one multi-line figure per clustering feature, fed by the
        # closure-level 'source' dict (requires cluster_dct to be loaded)
        p = {}
        for feature in features:
            p[feature] = figure(x_axis_type="datetime",
                                plot_width=1400,
                                plot_height=400,
                                title=feature)

            p[feature].multi_line(
                xs='x',
                ys='y',
                legend='labels',
                line_color='colors',
                line_width=5,
                hover_line_color='colors',
                hover_line_alpha=1.0,
                source=source[feature],
            )
            p[feature].add_tools(
                HoverTool(show_arrow=False,
                          line_policy='next',
                          tooltips=[
                              ('freq', '$y'),
                          ]))

        # ch = thistab.graph_chartify(timestamp_col='timestamp')
        # -------------------------------- CALLBACKS ------------------------

        load_dates_button.on_click(update)  # lags array
        resample_select.on_change('value', update_resample)

        # -----------------------------------LAYOUT ----------------------------
        # COMPOSE LAYOUT
        # put the controls in a single element
        controls_left = WidgetBox(datepicker_start, load_dates_button)

        controls_right = WidgetBox(datepicker_end)

        grid_data = [
            #[ch.figure],
            [thistab.notification_div['top']],
            [controls_left, controls_right],
            [thistab.section_headers['ts'], resample_select],
        ]
        for feature in features:
            grid_data.append([p[feature]])
            logger.warning('p:%s', p[feature])

        grid_data.append([thistab.notification_div['bottom']])

        grid = gridplot(grid_data)

        # Make a tab with the layout
        tab = Panel(child=grid, title=thistab.panel_title)
        return tab

    except Exception:
        logger.error('rendering err:', exc_info=True)
        # NOTE(review): if the Thistab(...) constructor itself raised,
        # 'thistab' is unbound here and this line raises NameError
        return tab_error_flag(thistab.panel_title)
    def __init__(self, table, name, cols):
        # Initialise shared KPI-tab state (same body as KPI.__init__ below;
        # presumably this copy belongs to a sibling/derived class whose
        # header is outside this view — confirm).  Requires self.menus to be
        # defined on the class (see the resample_period line).
        self.df = None
        self.ch = PythonClickhouse('aion')
        self.redis = PythonRedis()
        self.table = table
        self.cols = cols
        self.div_style = """ style='width:350px; margin-left:25px;
                                border:1px solid #ddd;border-radius:3px;background:#efefef50;' 
                                """

        self.header_style = """ style='color:blue;text-align:center;' """
        self.welcome_txt = """<div style="text-align:center;background:black;width:100%;">
                                         <h1 style="color:#fff;">
                                         {}</h1></div>""".format('Welcome')
        # NOTE(review): css_path is computed but never used in this block
        css_path = join(dirname(__file__),
                        "../../../static/css/KPI_interface.css")
        self.KPI_card_css = KPI_card_css
        self.DATEFORMAT = '%Y-%m-%d %H:%M:%S'
        self.DATEFORMAT_PTD = '%Y-%m-%d'

        # earliest date with data; bounds period-over-period loads
        self.initial_date = datetime.strptime("2018-04-25 00:00:00",
                                              self.DATEFORMAT)
        self.account_type = 'all'
        self.trigger = -1
        self.periods_to_plot = {1: ['week', 'month'], 2: ['quarter']}
        self.pop_history_periods = 3  # number of periods for period over period
        self.pop_start_date = None
        self.pop_end_date = None
        self.timestamp_col = ''
        self.checkboxgroup = {}
        self.sig_effect_dict = {}
        self.name = name
        self.redis_stat_sig_key = 'adoption_features:' + self.name
        self.card_grid_row = {'year': 0, 'quarter': 1, 'month': 2, 'week': 3}
        # illustrative payroll figures used by cost KPIs
        weekly_pay = 1200
        num_engineers = 40
        self.payroll = {
            'week': weekly_pay * num_engineers,
            'month': weekly_pay * num_engineers * 4,
            'quarter': weekly_pay * num_engineers * 4 * 3,
            'year': weekly_pay * num_engineers * 4 * 3 * 4
        }
        self.resample_period = self.menus['resample_period'][0]

        # default aggregations for social-media metrics
        self.groupby_dict = {
            'tw_mentions': 'sum',
            'tw_positive': 'mean',
            'tw_compound': 'mean',
            'tw_neutral': 'mean',
            'tw_negative': 'mean',
            'tw_emojis_positive': 'mean',
            'tw_emojis_compound': 'mean',
            'tw_emojis_negative': 'mean',
            'tw_emojis_count': 'sum',
            'tw_replies_from_followers': 'sum',
            'tw_replies_from_following': 'sum',
            'tw_reply_hashtags': 'sum'
        }

        # NOTE(review): duplicate assignment — already set to 3 above
        self.pop_history_periods = 3  # number of periods for period over period
        self.variable = 'item'
        # NOTE(review): 'grouby_var' looks like a typo for 'groupby_var';
        # renaming would break any external readers, so it is kept
        self.grouby_var = ''
        self.page_width = 1200
class KPI:
    # Class-level menu options shared by every KPI tab (widget choices).
    menus = {
        'account_type':
        ['all', 'contract', 'miner', 'native_user', 'token_user'],
        'update_type': [
            'all', 'contract_deployment', 'internal_transfer', 'mined_block',
            'token_transfer', 'transaction'
        ],
        'history_periods': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10'],
        'developer_adoption_DVs': ['aion_fork', 'aion_watch'],
        'resample_period': ['W', 'M', 'Q'],
        'social_media': ['twitter', 'facebook'],
        'social_media_variables': [
            'tw_mentions', 'tw_positive', 'tw_compound', 'tw_neutral',
            'tw_negative', 'tw_emojis_positive', 'tw_emojis_compound',
            'tw_emojis_negative', 'tw_emojis_count',
            'tw_replies_from_followers', 'tw_replies_from_following',
            'tw_reply_hashtags'
        ],
        # NOTE: load_cryptos() runs at class-definition (import) time
        'cryptos': ['all'] + load_cryptos(),
        'bcc': {
            'rental': ['area', 'category', 'item', 'status', 'gender']
        }
    }

    def __init__(self, table, name, cols):
        """Initialise shared KPI-tab state.

        :param table: clickhouse table backing this KPI tab
        :param name: tab name; used to build the redis stat-sig key
        :param cols: columns loaded by default
        """
        self.df = None
        self.ch = PythonClickhouse('aion')
        self.redis = PythonRedis()
        self.table = table
        self.cols = cols
        self.div_style = """ style='width:350px; margin-left:25px;
                                border:1px solid #ddd;border-radius:3px;background:#efefef50;' 
                                """

        self.header_style = """ style='color:blue;text-align:center;' """
        self.welcome_txt = """<div style="text-align:center;background:black;width:100%;">
                                         <h1 style="color:#fff;">
                                         {}</h1></div>""".format('Welcome')
        # (removed an unused local 'css_path'; the card CSS actually comes
        # from the module-level KPI_card_css)
        self.KPI_card_css = KPI_card_css
        self.DATEFORMAT = '%Y-%m-%d %H:%M:%S'
        self.DATEFORMAT_PTD = '%Y-%m-%d'

        # earliest date with data; bounds period-over-period loads
        self.initial_date = datetime.strptime("2018-04-25 00:00:00",
                                              self.DATEFORMAT)
        self.account_type = 'all'
        self.trigger = -1
        self.periods_to_plot = {1: ['week', 'month'], 2: ['quarter']}
        self.pop_history_periods = 3  # number of periods for period over period
        self.pop_start_date = None
        self.pop_end_date = None
        self.timestamp_col = ''
        self.checkboxgroup = {}
        self.sig_effect_dict = {}
        self.name = name
        self.redis_stat_sig_key = 'adoption_features:' + self.name
        self.card_grid_row = {'year': 0, 'quarter': 1, 'month': 2, 'week': 3}
        # illustrative payroll figures used by cost KPIs
        weekly_pay = 1200
        num_engineers = 40
        self.payroll = {
            'week': weekly_pay * num_engineers,
            'month': weekly_pay * num_engineers * 4,
            'quarter': weekly_pay * num_engineers * 4 * 3,
            'year': weekly_pay * num_engineers * 4 * 3 * 4
        }
        self.resample_period = self.menus['resample_period'][0]

        # default aggregations for social-media metrics
        self.groupby_dict = {
            'tw_mentions': 'sum',
            'tw_positive': 'mean',
            'tw_compound': 'mean',
            'tw_neutral': 'mean',
            'tw_negative': 'mean',
            'tw_emojis_positive': 'mean',
            'tw_emojis_compound': 'mean',
            'tw_emojis_negative': 'mean',
            'tw_emojis_count': 'sum',
            'tw_replies_from_followers': 'sum',
            'tw_replies_from_following': 'sum',
            'tw_reply_hashtags': 'sum'
        }

        # (removed a duplicate 'self.pop_history_periods = 3' — set above)
        self.variable = 'item'
        # NOTE(review): 'grouby_var' looks like a typo for 'groupby_var';
        # kept because external readers may depend on the attribute name
        self.grouby_var = ''
        self.page_width = 1200

        # make block timestamp the index
    def load_df(self,
                start_date,
                end_date,
                cols,
                timestamp_col='timestamp_of_first_event',
                supplemental_where=None):
        """Load rows from clickhouse between start_date and end_date.

        ``date`` arguments are promoted to midnight datetimes, and the end
        of the window is pushed out one day so the final day is included.
        Returns the frame restricted to *cols* when any were requested,
        otherwise the full frame; logs and returns None on failure.
        """
        try:
            if isinstance(end_date, date):
                end_date = datetime.combine(end_date, datetime.min.time())
            if isinstance(start_date, date):
                start_date = datetime.combine(start_date, datetime.min.time())
            end_date += timedelta(days=1)

            # 'amount' is required internally for every table except
            # external_daily, even when the caller did not ask for it
            wanted = cols.copy()
            if self.table != 'external_daily' and 'amount' not in wanted:
                wanted.append('amount')

            df = self.ch.load_data(self.table, wanted, start_date, end_date,
                                   timestamp_col, supplemental_where)
            return df[cols] if len(cols) > 0 else df
        except Exception:
            logger.error('load df', exc_info=True)

    def load_df_pym(self, req_startdate, req_enddate, table, cols,
                    timestamp_col):
        """Serve the requested window from the cached frame when it is fully
        covered; otherwise fall back to loading via self.pym."""
        try:
            cached = self.df
            if cached is not None:
                window_lo = cached[timestamp_col].min()
                window_hi = cached[timestamp_col].max()
                if window_lo <= req_startdate and window_hi >= req_enddate:
                    mask = ((cached[timestamp_col] >= req_startdate)
                            & (cached[timestamp_col] <= req_enddate))
                    return cached[mask]
            return self.pym.load_df(req_startdate,
                                    req_enddate,
                                    table=table,
                                    cols=cols,
                                    timestamp_col=timestamp_col)

        except Exception:
            logger.error('load_df', exc_info=True)

    def update_cards(self, dct):
        """Render one KPI card per period in *dct* and publish them all into
        self.KPI_card_div."""
        try:
            cards = []
            for period, data in dct.items():
                # pick a random card style for visual variety
                design = random.choice(list(KPI_card_css.keys()))
                cards.append(
                    self.card(title=period + ' to date',
                              data=data,
                              card_design=design))

            text = """<div style="margin-top:100px;display:flex; flex-direction:row;">
                                                {}
                                                </div>""".format(''.join(cards))

            self.KPI_card_div.text = text

        except Exception:
            logger.error('update cards', exc_info=True)

    def reset_checkboxes(self, value='all', checkboxgroup=''):
        """Reset the named checkbox-group widget back to *value*."""
        try:
            widget = self.checkboxgroup[checkboxgroup]
            widget.value = value
        except Exception:
            logger.error('reset checkboxes', exc_info=True)

    def first_date_in_quarter(self, timestamp):
        """Return midnight on the first day of *timestamp*'s calendar quarter."""
        try:
            quarter = (timestamp.month - 1) // 3 + 1
            first_month = 3 * quarter - 2
            return datetime(timestamp.year, first_month, 1)

        except Exception:
            logger.error('period to date', exc_info=True)

    def first_date_in_period(self, timestamp, period):
        """Return the first datetime of the week/month/quarter/year that
        contains *timestamp*; logs and returns None for an unknown period."""
        try:
            if period == 'week':
                return timestamp - timedelta(days=timestamp.weekday())
            if period == 'month':
                return datetime(timestamp.year, timestamp.month, 1, 0, 0, 0)
            if period == 'year':
                return datetime(timestamp.year, 1, 1, 0, 0, 0)
            if period == 'quarter':
                return self.first_date_in_quarter(timestamp)
            # unknown period: raise so the handler logs it, as the original
            # (unbound 'start') effectively did
            raise ValueError('unknown period: {}'.format(period))
        except Exception:
            logger.error('period to date', exc_info=True)

    def period_to_date(self,
                       df,
                       timestamp=None,
                       timestamp_filter_col=None,
                       cols=None,
                       period='week'):
        """Filter *df* down to the period-to-date window ending at *timestamp*.

        :param df: dataframe to filter
        :param timestamp: end of the window; defaults to now, truncated to
            the top of the hour
        :param timestamp_filter_col: column to filter on; defaults to
            self.timestamp_col
        :param cols: optional subset of columns to return
            (BUG FIX: was a mutable default argument ``[]``)
        :param period: 'week' | 'month' | 'quarter' | 'year'
        :return: the filtered dataframe, or None on error
        """
        try:
            if cols is None:
                cols = []
            if timestamp is None:
                timestamp = datetime.now()
                timestamp = datetime(timestamp.year, timestamp.month,
                                     timestamp.day, timestamp.hour, 0, 0)

            start = self.first_date_in_period(timestamp, period)
            # filter
            if timestamp_filter_col is None:
                timestamp_filter_col = self.timestamp_col

            df = df[(df[timestamp_filter_col] >= start)
                    & (df[timestamp_filter_col] <= timestamp)]
            if len(cols) > 0:
                df = df[cols]
            return df
        except Exception:
            logger.error('period to date', exc_info=True)

    # BUG FIX: this was declared as an instance method without 'self', so
    # calling it on an instance passed the instance as 'y'.  Marking it a
    # staticmethod keeps class-level calls working and fixes instance calls.
    @staticmethod
    def label_qtr_pop(y):
        """Return the number of days between *y* and the start of its
        calendar quarter."""
        try:
            curr_quarter = int((y.month - 1) / 3 + 1)
            start = datetime(y.year, 3 * curr_quarter - 2, 1)
            return abs((start - y).days)
        except Exception:
            logger.error('df label quarter', exc_info=True)

    def shift_period_range(self, period, start, end):
        """Shift the [start, end] window back by one *period*; an unknown
        period returns the window unchanged."""
        try:
            offsets = {
                'week': timedelta(days=7),
                'month': relativedelta(months=1),
                'year': relativedelta(years=1),
                'quarter': relativedelta(months=3),
            }
            delta = offsets.get(period)
            if delta is not None:
                start = start - delta
                end = end - delta
            return start, end
        except Exception:
            logger.error('shift period range', exc_info=True)

    # label dates for period over period (pop)
    def label_dates_pop(self, df, period, timestamp_col):
        """Add a 'dayset' column aligning rows from different periods
        (day-of-week / day / day-of-year / days-into-quarter) so they can be
        compared period over period.

        NOTE(review): the log message says 'timestamp col' but prints
        df.head(10); also the quarter branch mutates *df* in place while the
        other branches use .assign (which returns a copy) — confirm callers
        do not rely on the caller's frame staying unmodified.
        """
        logger.warning('timestamp col:%s', df.head(10))

        def label_qtr_pop(y):
            # days elapsed since the start of y's calendar quarter
            try:
                curr_quarter = int((y.month - 1) / 3 + 1)
                start = datetime(y.year, 3 * curr_quarter - 2, 1)
                return abs((start - y).days)
            except Exception:
                logger.error('df label quarter', exc_info=True)

        try:
            if len(df) > 0:
                if period == 'week':
                    df = df.assign(
                        dayset=lambda x: x[timestamp_col].dt.dayofweek)
                elif period == 'month':
                    df = df.assign(dayset=lambda x: x[timestamp_col].dt.day)
                elif period == 'year':
                    df = df.assign(
                        dayset=lambda x: x[timestamp_col].dt.dayofyear)
                elif period == 'quarter':
                    df['dayset'] = df[timestamp_col].map(label_qtr_pop)

            return df
        except Exception:
            logger.error('label data ', exc_info=True)

    def pop_include_zeros(self, df_period, plotcols, period):
        """Pad *df_period*/*plotcols* with zero-filled series so that every
        requested history period has a plottable column; returns the padded
        frame and the sorted column list."""
        try:
            # make sure the current period itself has a column
            current_title = '0 {}(s) prev(current)'.format(period)
            if current_title not in plotcols:
                df_period[current_title] = [0] * len(df_period)
                plotcols.append(current_title)

                logger.warning('line 218 cols to plot:%s', plotcols)
            # derive the shared suffix (" <period>(s) prev...") from the
            # first plot column, then fill in any missing history periods
            suffix = plotcols[0][1:]
            if isinstance(self.pop_history_periods, str):
                self.pop_history_periods = int(self.pop_history_periods)
            for idx in range(1, self.pop_history_periods):
                candidate = str(idx) + suffix
                if candidate not in plotcols:
                    df_period[candidate] = [0] * len(df_period)
                    plotcols.append(candidate)

            logger.warning('LINE 158 plotcols at end of pop include zeros:%s',
                           plotcols)

            return df_period, sorted(plotcols)
        except Exception:
            logger.error('pop include zeros', exc_info=True)

    def period_over_period(self,
                           df,
                           start_date,
                           end_date,
                           period,
                           history_periods=2,
                           timestamp_col='timestamp_of_first_event'):
        """Stack the current window's data with up to *history_periods*
        earlier windows of the same length, each labelled in a 'period'
        column, for period-over-period comparison.

        :param df: frame covering [start_date, end_date]
        :param period: 'week' | 'month' | 'quarter' | 'year'
        :param history_periods: how many earlier windows to pull (str is
            coerced to int)
        :return: concatenated labelled frame, or None on error
        """
        try:
            # filter cols if necessary
            string = '0 {}(s) prev(current)'.format(period)

            # filter out the dates greater than today
            df_current = df.assign(period=string)
            # label the days being compared with the same label
            if len(df_current) > 0:
                df_current = self.label_dates_pop(df_current, period,
                                                  timestamp_col)

            # zero out time information
            start = datetime(start_date.year, start_date.month, start_date.day,
                             0, 0, 0)
            end = datetime(end_date.year, end_date.month, end_date.day, 0, 0,
                           0)

            cols = list(df.columns)
            logger.warning(' Line 293 %s:df %s', period, df.head(10))
            logger.warning(' Line 293 %s:df cols %s', period, cols)

            counter = 1
            if isinstance(history_periods, str):
                history_periods = int(history_periods)
            # make dataframes for request no. of periods
            start, end = self.shift_period_range(period, start, end)
            while counter < history_periods and start >= self.initial_date:
                # load data
                if period == 'quarter':
                    logger.warning('start:end %s:%s', start, end)
                if 'bcc' in self.table:
                    # NOTE(review): load_df_pym takes (start, end, table,
                    # cols, timestamp_col) — this call passes only 4
                    # positional args, so it raises TypeError which is
                    # swallowed by the except below; confirm intended call
                    # is load_df_pym(start, end, self.table, cols, timestamp_col)
                    df_temp = self.load_df_pym(start, end, cols, timestamp_col)
                else:
                    df_temp = self.load_df(start, end, cols, timestamp_col)
                if df_temp is not None:
                    if len(df_temp) > 1:
                        string = '{} {}(s) prev'.format(counter, period)
                        # label period
                        df_temp = df_temp.assign(period=string)
                        # relabel days to get matching day of week,doy, dom, for different periods
                        df_temp = self.label_dates_pop(df_temp, period,
                                                       timestamp_col)
                        #logger.warning('df temp loaded for %s previous: %s',counter,len(df_temp))

                        df_current = concat_dfs(df_current, df_temp)
                        del df_temp
                        gc.collect()

                # shift the loading window
                counter += 1
                start, end = self.shift_period_range(period, start, end)
                if period == 'week':
                    logger.warning('LINE 327 df_current:%s',
                                   df_current.head(10))

            return df_current
        except Exception:
            logger.error('period over period', exc_info=True)

    def pop_week(self, launch=-1):
        """Delegate to the generic period-over-period grapher for weeks."""
        period = 'week'
        try:
            return self.graph_period_over_period(period)
        except Exception:
            logger.error('pop week', exc_info=True)

    def pop_month(self, launch=-1):
        """Delegate to the generic period-over-period grapher for months."""
        period = 'month'
        try:
            return self.graph_period_over_period(period)
        except Exception:
            logger.error('pop month', exc_info=True)

    def pop_quarter(self, launch=-1):
        """Delegate to the generic period-over-period grapher for quarters."""
        period = 'quarter'
        try:
            return self.graph_period_over_period(period)
        except Exception:
            logger.error('pop quarter', exc_info=True)

    def pop_year(self, launch=-1):
        """Delegate to the generic period-over-period grapher for years."""
        period = 'year'
        try:
            return self.graph_period_over_period(period)
        except Exception:
            logger.error('pop year', exc_info=True)

    """
     To enable comparision across period, dates must have label relative to period start.
     Place dates in columns to be able to plot multi-line/bar graphs
     
    """

    def split_period_into_columns(self, df, col_to_split, value_to_copy):
        """Pivot each distinct label of *col_to_split* into its own column.

        For every unique label in ``df[col_to_split]`` a column of that name
        is added, holding ``df[value_to_copy]`` on matching rows and 0
        elsewhere — the shape needed for multi-line/bar plots.

        :param df: dataframe to extend (mutated in place and returned)
        :param col_to_split: column whose labels become new column names
        :param value_to_copy: column whose values populate the new columns
        :return: df with the new columns, or None on logged error
        """
        try:
            for label in df[col_to_split].unique():
                # vectorized equivalent of the previous row-wise
                # df.apply(..., axis=1): keep the value where the label
                # matches, otherwise 0 — one C-level pass per label
                df[label] = df[value_to_copy].where(
                    df[col_to_split] == label, 0)
            #logger.warning('split period into columns:%s', df.head(10))
            return df
        except Exception:
            logger.error('split period into column', exc_info=True)

    # -----------------------  UPDATERS  ------------------------------------------
    def card(self, title, data, width=200, height=200, card_design='folders'):
        """Return the HTML for a fixed-size KPI card.

        Styling comes from ``self.KPI_card_css[card_design]``; *title* and
        *data* are interpolated into the markup. Returns None on logged error.
        """
        template = """
            <div style="flex: 1 1 0px;border: 1px solid black;{};width:{}px;
                        height:{}px;border-right=10px;">
                <h3>
                    {}
                </h3>
                </br>
                {}
            </div>"""
        try:
            css = self.KPI_card_css[card_design]
            return template.format(css, width, height, title, data)
        except Exception:
            logger.error('card', exc_info=True)

    def notification_updater(self, text):
        """Render *text* as a black banner and push it into every
        registered notification div."""
        banner = """<hr/><div style="text-align:center;width:{}px;height:{}px;
                              position:relative;background:black;">
                              <h1 style="color:#fff;margin-bottom:300px">{}</h1>
                        </div>""".format(self.page_width, 50, text)
        for div in self.notification_div.values():
            div.text = banner

    """
        update the section labels on the page

    """

    def section_header_updater(self, section, label='all'):
        """Update the HTML header text of a page section.

        :param section: key into self.section_headers ('cards' or 'pop')
        :param label: metric label; pluralized for display except for
            'all', '' and 'remuneration'
        """
        # Pluralized form is for display only. The old code pluralized
        # *before* comparing, which made every 'cards' branch except
        # 'remuneration' unreachable (e.g. 'project' became 'projects'
        # and never matched).
        if label not in ['all', '', 'remuneration']:
            display_label = label + 's'
        else:
            display_label = label

        text = ''  # default: unknown section gets an empty header
        if section == 'cards':
            text = "Period to date:"
            # map the raw label to its descriptive suffix
            suffixes = {
                'remuneration': '$ spent',
                'project': '# of projects',
                'delay_start': 'Mean delay in start projects(hours)',
                'delay_end': 'Mean project overrun(hours)',
                'project_duration': 'Mean project duration (days)',
                'task_duration': 'Total project person hours',
            }
            text = text + suffixes.get(label, '')
        elif section == 'pop':
            text = "Period over period:{}".format(display_label)

        txt = """<h2 style="color:#4221cc;">{}-----------------------------------------------------------------</h2>"""\
            .format(text)
        self.section_headers[section].text = txt

    # -------------------- CALCULATE KPI's DEVELOPED FROM VARIABLES WITH STATITICALLY SIGNIFICANT EFFECT
    def card_text(self, title, data, card_design='folders'):
        """Return a minimal HTML card showing *title* and *data*,
        styled by ``self.KPI_card_css[card_design]``."""
        try:
            css = self.KPI_card_css[card_design]
            markup = """
            <div {}>
            <h3>{}</h3></br>{}
            </div>
            """
            return markup.format(css, title, data)
        except Exception:
            logger.error('card text', exc_info=True)

    def match_sigvars_to_coin_vars(self, df, interest_var):
        """Map dataframe columns to the aggregation 'sum' when their names
        contain a statistically significant feature loaded from redis.

        Also resets ``self.sig_effect_dict`` to an empty dict as a side
        effect. Returns {} when nothing matches; None on logged error.
        """
        try:
            # build the redis key for the stat-sig record of this variable
            key = self.redis_stat_sig_key + '-' + interest_var
            # strip the crypto name off of the variable
            key_vec = key.split('-')
            gen_variables = [
                'release', 'watch', 'push', 'issue', 'fork', 'open', 'high',
                'low', 'close', 'volume', 'market_cap'
            ]
            for generic in gen_variables:
                if generic in key_vec[-1]:
                    key = key_vec[-2] + '-' + generic
                    break

            sig_variables = self.redis.simple_load(key)
            self.sig_effect_dict = {}

            # collect the feature names, tolerating a missing record or key
            features = []
            if sig_variables is not None:
                features = sig_variables.get('features', [])

            # columns whose names include any significant feature
            return {
                col: 'sum'
                for col in df.columns
                if any(feature in col for feature in features)
            }
        except Exception:
            logger.error('match sig vars to coin vars', exc_info=True)

    def calc_sig_effect_card_data(self, df, interest_var, period):
        """Build concatenated KPI-card HTML showing, for each statistically
        significant feature, the ratio interest_var / feature.

        :param df: dataframe containing interest_var and feature columns
        :param interest_var: numerator column, e.g. 'aion_fork'
        :param period: accepted for interface compatibility; not used here
        :return: the HTML string of all cards, or None when there are no
            significant features or an exception was logged
        """
        try:

            significant_features = self.match_sigvars_to_coin_vars(
                df, interest_var=interest_var)
            if len(significant_features) > 0:
                cols = [interest_var] + list(significant_features.keys())
                tmp_df = df[cols]
                numer = tmp_df[interest_var].sum()

                variable_of_interest_tmp = interest_var.split('_')
                # 'watch' -> 'watche' so that the '+s' title reads 'watches'
                if variable_of_interest_tmp[-1] in ['watch']:
                    variable_of_interest_tmp[-1] += 'e'
                txt = ''
                for var in significant_features.keys():
                    point_estimate = 0
                    var_tmp = var.split(
                        '_')  # slice out the 'fork' from 'aion_fork'
                    if numer != 0:
                        denom = tmp_df[var].sum()
                        # '*' marks an undefined ratio (division by zero)
                        point_estimate = '*'
                        if denom != 0:
                            point_estimate = round(numer / denom, 3)
                    # add metrics based on variables
                    # update the divs
                    self.sig_effect_dict[var] = {
                        'title':
                        "{}s per {}".format(variable_of_interest_tmp[-1],
                                            var_tmp[-1]),
                        'point_estimate':
                        point_estimate
                    }

                    txt += self.card(
                        title=self.sig_effect_dict[var]['title'],
                        data=self.sig_effect_dict[var]['point_estimate'],
                        card_design=random.choice(
                            list(self.KPI_card_css.keys())))

                # BUG FIX: the return previously sat inside the loop, so
                # only the first feature's card was ever emitted despite
                # txt being accumulated with +=
                return txt

        except Exception:
            logger.error('make sig effect columns', exc_info=True)

    def update_significant_DV_cards(self, dct):
        """Concatenate the per-period card HTML in *dct* (insertion order)
        and write the combined markup into the KPI card div."""
        try:
            combined = ''.join(dct[period] for period in dct.keys())
            text = """<div style="margin-top:100px;display:flex; flex-direction:column;">
                                            {}
                       </div>""".format(combined)

            self.KPI_card_div.text = text

        except Exception:
            logger.error('update cards', exc_info=True)

    def payroll_to_date(self, period):
        """Estimate payroll accrued so far in the current *period*,
        scaled from the weekly payroll figure.

        Unknown periods leave the accumulator unbound; the resulting
        exception is caught and logged (returning None), matching the
        original behavior.
        """
        try:
            weekly = self.payroll['week']
            if period == 'year':
                # ISO week number = number of weeks elapsed this year
                accrued = weekly * datetime.now().isocalendar()[1]
            elif period == 'week':
                # fraction of the week elapsed (Monday == 0)
                accrued = weekly * (datetime.today().weekday() / 7)
            elif period == 'month':
                # no zero week allowed
                accrued = weekly * (floor(datetime.today().day / 7) + 1)
            elif period == 'quarter':
                start = self.first_date_in_quarter(datetime.today())
                accrued = weekly * (
                    floor((abs(datetime.today() - start).days + 1) / 7) + 1)

            return round(accrued, 2)
        except Exception:
            logger.error('payroll to date', exc_info=True)

    """
        groupby the the data and make ratios between 
        significant variables and interest variables
    """

    def make_significant_ratios_df(self, df, resample_period, interest_var,
                                   timestamp_col):
        """
        Resample the data and compute, per time bucket, the ratio of the
        variable of interest to each statistically significant variable.

        :param df: input dataframe — presumably a dask DataFrame, since
            map_partitions is used below (TODO confirm with callers)
        :param resample_period: resample rule passed to df.resample
        :param interest_var: numerator column for every ratio
        :param timestamp_col: column set as the datetime index before
            resampling
        :return: dataframe of ratio columns (named after each feature's last
            '_' token), with NaNs replaced by 0; None on logged error
        """
        try:

            def ratio(df, col_old, col_new):
                # per-partition: interest_var / col_old, stored as col_new
                df = df.assign(result=df[interest_var] / df[col_old])
                df = df.rename(columns={'result': col_new})
                #logger.warning('col-%s df:%s',col_old,df.head(5))

                return df

            # filter
            sig_features_dict = self.match_sigvars_to_coin_vars(
                df, interest_var)
            sig_features_dict[
                interest_var] = 'sum'  # include interest var in aggregations
            sig_features_list = list(sig_features_dict.keys())
            # rename column for overwriting
            # new name is the final '_' token, e.g. 'aion_fork' -> 'fork'
            sig_vars_relabel = []
            for feature in sig_features_list:
                tmp = feature.split('_')
                sig_vars_relabel.append(tmp[-1])
            # groupby
            df = df.set_index(timestamp_col)

            df = df.resample(resample_period).agg(sig_features_dict)
            #logger.warning('LINE 413:%s',len(df))

            # create ratios
            for idx, col in enumerate(sig_features_list):
                if col != interest_var:  # skip variable of interest
                    df = df.map_partitions(ratio, col, sig_vars_relabel[idx])

            # drop columns
            # the raw aggregated columns are no longer needed once ratios exist
            df = df.drop(sig_features_list, axis=1)
            df = df.fillna(0)
            return df
        except Exception:
            logger.error('significant ratios', exc_info=True)
Example #7
0
from dask.dataframe.utils import make_meta

from scripts.utils.mylogger import mylogger
from scripts.databases.pythonRedis import PythonRedis
import gc
import re
from datetime import datetime
import pandas as pd

from scripts.utils.myutils import datetime_to_date

r = PythonRedis()
logger = mylogger(__file__)


def remove_char(row):
    """Strip all '[' characters from a row's 'transaction_hashes' field.

    :param row: mapping with a 'transaction_hashes' string entry
    :return: the string with every '[' removed
    """
    # raw string avoids the invalid '\[' escape sequence, which is a
    # DeprecationWarning (SyntaxWarning in newer Pythons)
    return re.sub(r'\[', '', row['transaction_hashes'])


def list_to_rows(df, column, sep=',', keep=False):
    """
    Split the values of a column and expand so the new DataFrame has one split
    value per row. Filters rows where the column is missing.

    Params
    ------
    df : pandas.DataFrame
        dataframe with the column to split and expand
    column : str
        the column to split and expand
    sep : str