def economicactivity_info(engine, id_company):
    """
    Function to print economic activity from taxes and inpsections tables
        engine: query
        id_company: rutempresamask
    """
    qry = """set role direccion_trabajo_inspections_write;
        select gcae, ccae, count(*)
        from raw.inspections_complete
        where rutempresamask = '{}'
        group by gcae, ccae
        order by ccae;""".format(id_company)
    tab_ins = plsql.query(qry, engine)
    print('Economic activity from Inpsections data: \n')
    print(tab_ins)
    print('\n')

    qry = """set role direccion_trabajo_inspections_write;
        select actividadeconomica, count(*)
        from raw.taxes
        where rutmask = '{}'
        group by actividadeconomica
        order by count desc;""".format(id_company)
    tab_tax = plsql.query(qry, engine)
    print('Economic activity from Taxes data:\n')
    print(tab_tax)
def freq_matters_bookupdated(engine, role, id_company):
    
    """
    Function to create frequency table of matters infracted
        engine: to query
        role: role for query
        id_company: rutempresamask
    """
    
    qry = """set role {};
    select case when trim(matter_code) = '-' then '' else trim(matter_code) end as matter, count(*)
    from cleaned.infracted_matters_updatedbook
    where rutempresamask = {}
    group by matter
    order by matter;""".format(role, "'" + id_company + "'")
    tab = plsql.query(qry, engine)

    qry = """set role {};
        select case when trim(codigo) = '-' then '' else trim(codigo) end as matter, 
        trim(lower(regexp_replace(statement , '\.', ''))) as state,
        count(*)
        from raw.tipificador
        group by matter, state;""".format(role)
    tab_tipi = plsql.query(qry, engine)
    tab_join = tab.merge(tab_tipi, left_on='matter', right_on='matter', how='inner')
    tab_result = tab_join.groupby('matter').apply(first_row).sort_values('count_x', ascending = False)
    
    return tab_result
Ejemplo n.º 3
0
def proportion_nulls_all_columns(engine,
                                 role,
                                 schema,
                                 table,
                                 na='__null__',
                                 ind=False):
    props = []
    counts = []
    qry = """
        set role {};
        select column_name 
        from information_schema.columns 
        where table_name = '{}' 
        and table_schema = '{}';""".format(role, table, schema)
    result = plsql.query(qry, engine)

    for i, row in result.iterrows():
        nulls = count_nulls(engine, role, schema, table, row.column_name, na,
                            ind).values
        length = total_rows(engine, role, schema, table).values
        counts.append(float(nulls))
        props.append(float(nulls / length))

    result['count'] = counts
    result['proportion'] = props

    return result
Ejemplo n.º 4
0
def model_id_graphs(model_id, role, engine, path):
    """
    Running and save graphs for every model
    """
    print('Creating graphs.......')

    qry = """set role {}; 
            select *
            from results.models
            where model_id = {};
    """.format(role, model_id)
    tab_name = plsql.query(qry, engine)

    model_group_id = tab_name.model_group_id[0]
    model_type = tab_name.model_type[0]

    fig_name = (
        "group_{group_model_id}_modelid_{model_id}_{model_type}.png".format(
            group_model_id=tab_name.model_group_id[0],
            model_id=model_id,
            model_type=tab_name.model_type[0])).lower()
    fig_path = path

    gg_hist_all, gg_hist_lab = model_id_score_gghist(model_id, model_group_id,
                                                     model_type, role, engine)
    gg_hist_all.save(filename=fig_name, path=fig_path + 'histogram_all/')
    gg_hist_lab.save(filename=fig_name, path=fig_path + 'histogram_label/')

    gg_prec = model_id_precsion_recall(model_id, role, engine)
    gg_prec.save(filename=fig_name, path=fig_path + 'precision/')

    gg_impo = model_id_feature_importance(model_id, role, engine)
    gg_impo.save(filename=fig_name, path=fig_path + 'features/')
def facility_info(engine, id_company):
    """
    Function to print company's general information
        engine: query
        id_company: rutempresamask
    """
    qry = """set role direccion_trabajo_inspections_write;
        select *
        from raw.inspections_complete
        where rutempresamask = {};""".format("'" + id_company + "'")
    tab_emp = plsql.query(qry, engine)
    
    total = tab_emp.idfiscalizacion.count()
    print('Total inspections:', tab_emp.idfiscalizacion.count())
    print('Total inspections with violations:', tab_emp.infractor.sum())
    print('% inspections with violations:', round(100*tab_emp.infractor.sum()/total) )
    
    
    print("\n")
    print('Total proactive inspections:', sum(tab_emp["solicitante"] == 'Por Programa'))
    print('Total reactive inspections:', sum(tab_emp["solicitante"] != 'Por Programa'))
    print('% proactive inspections:', round(100*sum(tab_emp["solicitante"] == 'Por Programa')/total) )
    
    print("\n")
    print('Total urgent inspections:', tab_emp.urgencia.sum())
    print('% urgent inspections:', round(100*tab_emp.urgencia.sum()/total) )
    
    print("\n")
    print('Total matters inspected:', tab_emp.num_materias.sum().astype('int'))
    print('Total matters infractioned:', tab_emp.infra.sum().astype('int'))
    print('% matters infractioned:', round(100*tab_emp.infra.sum().astype('int')/tab_emp.num_materias.sum().astype('int')) )
Ejemplo n.º 6
0
def rows_by_group(engine, role, schema, table, group):
    qry = """set role {};
        select {}, count(*) from {}.{}
        group by {}
        order by {};
        """
    result = plsql.query(qry.format(role, group, schema, table, group, group),
                         engine)
    return result
Ejemplo n.º 7
0
def get_table(engine,
              y,
              X,
              start_month,
              end_month,
              label_query,
              nulls=True,
              sample=False,
              sample_level=.8):
    # Test data: include nulls if nulls == True:
    if nulls == True:
        na = ""
    # Training data: don't include nulls
    else:
        na = " where {} is not null".format(y)

    # If undersampling data to correct imbalance
    if sample == True:
        samp = "random_system_not_feature > {} or {} is true".format(
            sample_level, y)
        if nulls == False:
            samp = " and " + samp
        else:
            samp = " where " + samp
    else:
        samp = ""

    # Get strings for correct table name
    table_start = start_month.strftime('%Y-%m-%d').replace('-', '')
    table_end = end_month.strftime('%Y-%m-%d').replace('-', '')

    # Load in the model matrix
    lab_qry = label_query.format(table_start=table_start, table_end=table_end)
    qry = """set role direccion_trabajo_inspections_write;
        with 
        label as ({lab_qry})
        select t.entity_id, t.month_year, {features}, {label}
        from staging.train_monthly_split_{table_start}_{table_end} t
        left join label l
        on t.entity_id = l.entity_id
        and t.month_year = l.month_year
        {na}{samp};
        """.format(lab_qry=lab_qry,
                   features=', '.join(X),
                   label=y,
                   table_start=table_start,
                   table_end=table_end,
                   na=na,
                   label_query=label_query,
                   samp=samp)
    df = plsql.query(qry, engine)

    return df
Ejemplo n.º 8
0
def check_table(engine, start_date, end_date, drop_table=False):

    table_start = start_date.strftime('%Y-%m-%d').replace('-', '')
    table_end = end_date.strftime('%Y-%m-%d').replace('-', '')

    # Check for existence of staging table for TRAINING time split
    df = plsql.query(
        """set role direccion_trabajo_inspections_write;
        select table_name from information_schema.tables
        where table_schema = 'staging'
        and table_name = 'train_monthly_split_{}_{}';
        """.format(table_start, table_end), engine)

    # If doesn't exist or config says to drop table, create new table (TRAINING)
    if df.shape[0] == 0 or drop_table == True:
        utab.create_table(engine, start_date, end_date)
Ejemplo n.º 9
0
def count_nulls(engine, role, schema, table, column, na='__null__', ind=False):

    qry = """set role {};
        select sum(case when {} {} then 1 else 0 end) from {}.{};
        """

    ##### Create NA condition for query
    # If NA value is NULL, do not pass any value for "na" parameter
    # (this is the default case)
    if na == '__null__':
        na_qry = 'is null'
    # If NA value is something other than NULL, pass value
    # with an indicator for whether this value should be treated as a string
    else:
        if ind == False:
            na = "'{}'".format(na)
        na_qry = '= ' + na

    result = plsql.query(qry.format(role, column, na_qry, schema, table),
                         engine)
    return result
Ejemplo n.º 10
0
def model_id_score_gghist(model_id, model_group_id, model_type, role, engine):
    """
    This function creates de scores histogram
    
    Parameters:
    model_id = model id to plot
    group_model_id
    role
    engine    
    
    Return
    histogram
    """
    qry = """set role {}; 
            select * 
            from results.predictions 
            where model_id = {};
            """.format(role, model_id)

    df = plsql.query(engine=engine, qry=qry)

    model_type = model_type
    model_id = model_id
    month_start = df.as_of_date[0]

    gg_hist_label = (
        ggplot(df, aes(x='score')) + geom_histogram(bins=25) +
        ggtitle('{model_type}; model_id = {model_id},\n{month_start} '.format(
            model_type=model_type, model_id=model_id, month_start=month_start))
        + facet_wrap('~label_value', scales='free_y') +
        theme(figure_size=(5, 3)))

    gg_hist_all = (
        ggplot(df, aes(x='score')) + geom_histogram(bins=25) +
        ggtitle('{model_type}; model_id = {model_id},\n{month_start} '.format(
            model_type=model_type, model_id=model_id,
            month_start=month_start)) + theme(figure_size=(4, 3)))

    return gg_hist_all, gg_hist_label
Ejemplo n.º 11
0
def model_id_precsion_recall(model_id, role, engine):
    qry = """set role {}; 
            select * from
                (select model_id, model_type,
                    value, evaluation_start_time, 
                    evaluation_end_time, prediction_frequency,
                    split_part(metric, '|', 1) AS metric_name,
                    split_part(metric, '|', 2) AS null_param,
                    split_part(metric, '|', 3) AS cutoff,
                    split_part(metric, '|', 4) AS type,
                    split_part(metric, '|', 5) AS subset
                from results.evaluations a
                left join 
                    (select model_id, model_group_id, model_type
                    from results.models) b
                using (model_id) 
                where a.model_id = {}) s
            where s.subset = 'all_data' and
                  s.type = 'pct';
                """.format(role, model_id)

    df = plsql.query(engine=engine, qry=qry)
    tab = (df[df['metric_name'].isin(['precision', 'recall',
                                      'fallout'])]).reset_index()
    tab.cutoff = tab.cutoff.astype('float')

    gg_precrecall = (ggplot(
        tab,
        aes(x='cutoff', y='value', color='metric_name', group='metric_name')) +
                     geom_line(size=1) + xlab('Population (Prop.)') +
                     ylab('Value') + facet_wrap('~null_param') + ggtitle(
                         '{model_type}; model_id = {model_id},\n{month_start} '
                         .format(model_type=df.model_type[0],
                                 model_id=df.model_id[0],
                                 month_start=df.evaluation_start_time[0])) +
                     theme(figure_size=(7, 3)))

    return gg_precrecall
Ejemplo n.º 12
0
def model_id_feature_importance(model_id, role, engine):

    qry = """set role {}; 
            select * 
            from results.feature_importances  a
            left join 
                (select model_id, model_group_id, model_type
                from results.models) b
            using (model_id)
            where a.model_id = {};
            """.format(role, model_id)

    df = plsql.query(engine=engine, qry=qry)

    df_sorted = df.sort_values(by='feature_importance', ascending=True)
    df_sorted = df_sorted.nlargest(20, 'feature_importance')

    df_sorted['feature_clean'] = df_sorted.feature.str.replace(
        "_", " ").str.title()

    ## order the features
    order_features = list(df_sorted.feature_clean)

    ## create categorical version for graph
    df_sorted['feature_categorical'] = df_sorted.feature_clean.astype(
        'category', ordered=True, categories=order_features)

    gg_importance = (
        ggplot(df_sorted, aes(x='feature_categorical', y='feature_importance'))
        + geom_bar(stat='identity', fill='#1f497d', alpha=0.2, color='black') +
        xlab('Feature') + ylab('Importance') +
        ggtitle('{model_type}; model_id = {model_id} '.format(
            model_type=df.model_type[0], model_id=df.model_id[0])) +
        theme(axis_text_x=element_text(size=7, angle=90), figure_size=(7, 3)))

    return gg_importance
Ejemplo n.º 13
0
def count_distinct(engine, role, schema, table, column):
    qry = """set role {};
        select count(distinct {}) from {}.{};
        """
    result = plsql.query(qry.format(role, column, schema, table), engine)
    return result
Ejemplo n.º 14
0
def total_rows(engine, role, schema, table):
    qry = """set role {};
        select count(*) from {}.{};
        """
    result = plsql.query(qry.format(role, schema, table), engine)
    return result