Example No. 1
def type_cast():
    conn, cur = create_pgconn()
    cur.execute('set role el_salvador_mined_education_write')
    sql_stmnt = 'set role el_salvador_mined_education_write;'

    for sql_file, table in zip(
            ['features_imputation', 'features_onehot', 'features_aggregate', 'features_timeaggregates_small'],
            ['features_imputed', 'features_one_hot', 'features_aggregate', 'features_timeaggregates']):
        
        features = feature_list_from_SQL(f'{sql_file}')
        sql_stmnt += f"create table results.{table}_typecast as (select year_range, student"

        for feature in features:
            if feature not in ['year_range','student']:
    #             logger.debug(f'{feature}')
                cur.execute(f"select data_type from information_schema.columns where table_schema='results' and table_name = '{table}' and column_name='{feature}';")
                feature_type = cur.fetchall()[0][0]
                if feature_type in ['integer','boolean']:
                    logger.debug(f"{feature}: {feature_type}: int2")
                    sql_stmnt += f', {feature}::int::int2'
                elif feature_type in ['double precision', 'numeric']:
                    logger.debug(f"{feature}: {feature_type}: float8")
                    sql_stmnt += f', {feature}::float8'
                else:
                    logger.warning(f'{feature}: Type not caught')
        sql_stmnt += f" from results.{table});"
    with open(SQL_DIR + 'features_typecast.sql', 'w+') as fil:
        fil.write(sql_stmnt)
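
The function above only writes the generated SQL to disk. A minimal sketch of how that file might then be executed, assuming the same create_pgconn helper and SQL_DIR constant imported elsewhere in these examples (the runner itself is hypothetical):

from mined.utils import create_pgconn, SQL_DIR

def run_typecast_sql():
    # Hypothetical runner: execute the file written by type_cast() above.
    # The file already begins with the required `set role` statement.
    conn, cur = create_pgconn()
    with open(SQL_DIR + 'features_typecast.sql') as fil:
        cur.execute(fil.read())
    conn.commit()
    conn.close()
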
Example No. 2
def nonimputed_features():
    conn, cur = create_pgconn()
    cur.execute('set role el_salvador_mined_education_write')
    imputed_features = set(feature_list_from_SQL('features_imputation'))
    aggregate_features = set(feature_list_from_SQL('features_aggregate'))
    nonimputed_features = aggregate_features.difference(imputed_features)
    logger.debug(nonimputed_features)
    sql_statement = f"-- Feature list: {','.join(nonimputed_features)}\nset role el_salvador_mined_education_write; drop table if exists results.features_nonimputed; create table results.features_nonimputed as (select {','.join(nonimputed_features)} from results.features_aggregate); create index nonimputed_year_range_student_idx on results.features_nonimputed(year_range, student);"
    logger.debug(f'# Features: {len(nonimputed_features)}')
    with open(SQL_DIR + '/features_nonimputed.sql', 'w+') as fil:
        fil.write(sql_statement)
Example No. 3
def create_single_row_per_student_year(feature_list=None):
    """
        Groups by year and student and aggregates all the raw features. During the aggregation, new features are created (avg, count, ...) according to the data type of each column.
        The generated SQL takes ~10 min to run.
    """
    conn, cur = create_pgconn()
    cur.execute('set role el_salvador_mined_education_write')
    if feature_list is None:
        feature_list = feature_list_from_SQL('features_raw')
        logger.debug(feature_list)
        
    aggreg_sql = """set role el_salvador_mined_education_write; drop table if exists results.features_aggregate;
        create table results.features_aggregate as (
            select 
                year_range, student,
                """
    
    aggregate_feature_list = ['year_range','student']
    constant_features = ['raw_age', 'student_female']
    for feature in feature_list:
        if feature not in ['year_range','student']:
            logger.debug(f'{feature}')
            cur.execute(f"select data_type from information_schema.columns where table_name = 'features_raw' and column_name='{feature}';")
            feature_type = cur.fetchall()[0][0]
            logger.debug(f'{feature}: {feature_type}')
            if feature in constant_features:
                # Constant features are the ones that if there are multiple values, it is due to data noise, and are hence set to null.
                aggreg_sql += f"case when array_length(array_agg(distinct {feature}), 1)=1 then (array_agg(distinct {feature}))[1] else null::{feature_type} end as {feature},"
                aggregate_feature_list += [f'{feature}']
            elif feature_type == 'boolean':
                aggreg_sql += f"bool_or({feature}) as {feature}_any, bool_and({feature}) as {feature}_all,"
                aggregate_feature_list += [f'{feature}_any']
                aggregate_feature_list += [f'{feature}_all']
            elif feature_type in ['bigint','smallint','double precision', 'numeric', 'integer']:
                aggreg_sql += f"avg({feature}) as {feature}_avg, max({feature}) as {feature}_max, min({feature}) as {feature}_min,"
                aggregate_feature_list += [f'{feature}_avg']
                aggregate_feature_list += [f'{feature}_max']
                aggregate_feature_list += [f'{feature}_min']
            elif feature_type == 'text':
                # Text features are aggregated by concatenating strings. e.g. if a student was in Ahuachapán and Usulután in the same year, the departamento column is 'Ahuachapán,Usulután'
                aggreg_sql += f"array_to_string(array_agg(distinct {feature}), ',')  as {feature}, array_length(array_agg(distinct {feature}), 1) as {feature}_count,"
                aggregate_feature_list += [f'{feature}', f'{feature}_count']
                
            else:
                logger.warning(f"Don't know how to convert {feature} with {feature_type}")
 
    aggreg_sql = f"-- Feature list: {','.join(aggregate_feature_list)}\n" + aggreg_sql[:-1] + " from results.features_raw group by year_range, student); create index aggregate_year_range_student_idx on results.features_aggregate(year_range, student);"
#     logger.error(aggreg_sql)
    with open(SQL_DIR + '/features_aggregate.sql', 'w+') as fil:
        fil.write(aggreg_sql)
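
For reference, the SELECT fragments emitted by the loop above have the following shape. These strings mirror the f-strings in create_single_row_per_student_year(); family_members is a column name borrowed from a comment in a later example, and overage is a hypothetical boolean column:

# Illustrative only -- one numeric and one boolean feature:
numeric_fragment = ("avg(family_members) as family_members_avg, "
                    "max(family_members) as family_members_max, "
                    "min(family_members) as family_members_min,")
boolean_fragment = ("bool_or(overage) as overage_any, "
                    "bool_and(overage) as overage_all,")
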
Example No. 4
def unique_original_column_names(folder_number):
    """
        Returns the unique column names in the original .xls, .xlsx, .csv or .sav files
        Given a folder_number, it looks inside the raw.column_mapping table and returns the distinct original column names
        
        Args:
            folder_number (int): the folder number, used for finding the relevant rows in raw.column_mapping
    """
    connection, cursor = create_pgconn()
    cursor.execute("set role el_salvador_mined_education_write")
    cursor.execute(f"""select distinct(original_col_name) from raw.column_mapping where table_name ~ 'raw."{folder_number}.*';""" )
    col_names = [x[0] for x in cursor.fetchall()]
    col_names.sort()
    return col_names
Example No. 5
def create_experiment(config_file):
    with open(config_file, 'r') as f:
        configs = yaml.safe_load(f)
        experiment = hash(str(configs)) % ((sys.maxsize + 1) * 2)

        conn, cur = create_pgconn()
        cur.execute('set role el_salvador_mined_education_write')

        df = pd.read_sql(
            f"""SELECT experiment 
                         FROM results.experiments 
                         WHERE experiment = '{experiment}'""", conn)
        if df.empty:
            repo = git.Repo(search_parent_directories=True)
            git_hash = repo.head.object.hexsha
            config_json = json.dumps(configs,
                                     indent=4,
                                     sort_keys=True,
                                     default=str)
            #features = json.dumps(configs['features'], indent=4, sort_keys=True, default=str)
            seed = random.randint(0, 10**6)

            cur.execute(f"""INSERT INTO results.experiments
                (
                experiment,
                git_hash,
                config,
                run_date,
                seed
                )
            VALUES
                (
                '{experiment}',
                '{git_hash}',
                '{config_json}',
                 now(),
                '{seed}'
                )""")
            conn.commit()

        else:
            logger.debug('Experiment already exists.')

        return experiment, configs
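
The experiment identifier above is Python's signed hash() of the stringified config, folded into the unsigned 64-bit range by the modulo. A small sketch with a hypothetical config dict; note that string hashes are only reproducible across interpreter runs when PYTHONHASHSEED is fixed:

import sys

configs = {'learners': [], 'features': ['features_aggregate.sql']}  # hypothetical config
experiment = hash(str(configs)) % ((sys.maxsize + 1) * 2)  # fold into [0, 2**64)
print(experiment)
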
Example No. 6
def one_hot_encode(features=('departamento','municipio','school_departamento','school_municipio','commute')):
    
    conn, cur = create_pgconn()
    cur.execute('set role el_salvador_mined_education_write')
    
    aggreg_sql = ''
    feature_list = []
    for feature in features:
        # The second thing to do with text features is to one_hot_encode them. e.g. have each of the departamentos as a column, so that the student that was in Ahuachapán and Usulután in the same year gets two 1s in those departamento columns and zeros everywhere else.
        cur.execute(f"select distinct {feature} from results.features_raw;")
        one_hot_columns = [x[0] for x in cur.fetchall()]
        logger.debug(one_hot_columns)
        if len(one_hot_columns) < 300:
            for col in one_hot_columns:
                if col is not None:
                    col_name = feature + '_' + col.replace(' ', '_').replace('.', '')
                    aggreg_sql += f"case when array['{col}'] <@ string_to_array({feature}, ',') then true else false end as {col_name},"
                    feature_list += [col_name]
    sql_statement = f"-- Feature list: {','.join(feature_list)}\nset role el_salvador_mined_education_write; drop table if exists results.features_one_hot; create table results.features_one_hot as (select year_range, student, {aggreg_sql[:-1]} from results.features_imputed a); create index onehot_year_range_student_idx on results.features_one_hot(year_range, student);"
    logger.debug(f'# Features: {len(feature_list)}')
    with open(SQL_DIR + '/features_onehot.sql', 'w+') as fil:
        fil.write(sql_statement)
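
For a single value of departamento (taken from the comment above), the generated one-hot expression mirrors the f-string in the loop:

# Illustrative only: the column expression produced for departamento = 'Ahuachapán'
one_hot_expr = ("case when array['Ahuachapán'] <@ string_to_array(departamento, ',') "
                "then true else false end as departamento_Ahuachapán,")
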
Example No. 7
def create_features(config_dict):
    """
        Creates features for ML. Receives a config dictionary which determines the location of the SQL file.

        :param config_dict: config dictionary; config_dict['features'][0] is the path to the SQL file with features
        :type config_dict: dict

        :return: None
    """

    sql_file = config_dict['features'][0]  # NOTE: just for version 0.1
    logger.debug(SQL_DIR)
    with open(os.path.join(SQL_DIR, sql_file), 'r') as f:
        feature_sql = f.read()
    conn, cursor = create_pgconn()
    logger.debug('Generating features...')
    cursor.execute(feature_sql)
    cursor.execute("select count(*) from results.features;")
    logger.debug(f"Count on table {cursor.fetchone()}.")
    conn.commit()
    conn.close()
    logger.debug('Done generating features. Connection committed')
Example No. 8
def plot_precision_recall_n(name, experiment, precisionk, recallk,
                            output_type):
    t0 = datetime.datetime.now()
    split_name = name.split('_')
    x_axis = np.arange(0, 101, 1)  # percent of population, plotted on the x axis

    plt.clf()
    fig, ax1 = plt.subplots()
    ax1.plot(x_axis, precisionk, 'b')
    ax1.set_xlabel('percent of population')
    ax1.set_ylabel('precision', color='b')
    plt.suptitle(f"Experiment: {split_name[0]}\nModel: {split_name[1]}")
    ax2 = ax1.twinx()
    ax2.plot(x_axis, recallk, 'r')
    ax2.set_ylabel('recall', color='r')
    ax1.set_ylim([0, 1])
    ax2.set_ylim([0, 1])
    ax2.set_xlim([0, 101])

    if (output_type == 'save'):
        plt.savefig(os.path.join(PLOTS_DIR, name))
        plt.close()
    else:
        plt.show()

    total_time = round((datetime.datetime.now() - t0).total_seconds() / 60.0,
                       3)
    conn, cursor = create_pgconn()
    cursor.execute("set role el_salvador_mined_education_write;")
    cursor.execute(
        f"""UPDATE results.models SET total_pr_graph_time = '{total_time}' WHERE experiment = '{experiment}'"""
    )
    conn.commit()
    conn.close()
    logger.debug(f'Total time PR graph took: {total_time}')
Example No. 9
def create_labels(config_dict):
    """
        Creates labels for ML. Receives a config dictionary from which a label_window can be read and passed to the label SQL string.

        :param config_dict: config dictionary with a temporal_splits.label_window entry
        :type config_dict: dict
    """

    with open(os.path.join(SQL_DIR, 'labeling.sql'), 'r') as fil:
        sql_cmnd = fil.read()
    label_window = parse_date_string(
        config_dict['temporal_splits']['label_window'])['years']
    sql_cmnd = sql_cmnd.replace('(label_window)', str(label_window))

    logger.debug('Creating labels...')

    conn, cur = create_pgconn()
    cur.execute(sql_cmnd)
    conn.commit()
    conn.close()

    logger.debug('Done creating labels. Connection committed')

    return
Example No. 10
def extract_learners(configs, experiment):
    conn, cur = create_pgconn()
    clfs = []
    learners = configs['learners']

    idx = 0
    cur.execute("set role el_salvador_mined_education_write;")
    for learner in learners:
        hyperparameters = ParameterGrid(learner['hyperparameters'])
        for hyperparameter in hyperparameters:
            module_name, class_name = learner['algorithm'].rsplit('.', 1)
            module = importlib.import_module(module_name)
            cls = getattr(module, class_name)
            clf = cls(**hyperparameter)
            clfs.append(clf)
            hyperparameters_json = json.dumps(hyperparameter,
                                              indent=4,
                                              sort_keys=True,
                                              default=str)
            cur.execute(f"""INSERT INTO results.learners (
                learner,
                algorithm,
                experiment,
                hyperparameters
            )
                VALUES
                (
                '{idx}',
                '{learner['algorithm']}',
                '{experiment}',
                '{hyperparameters_json}'
                )""")
            idx += 1
    conn.commit()
    conn.close()
    return clfs
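
A minimal sketch of the configs['learners'] structure this function iterates over; the algorithm path and hyperparameter grid below are hypothetical examples, not values from the project config:

configs = {
    'learners': [
        {
            'algorithm': 'sklearn.ensemble.RandomForestClassifier',
            'hyperparameters': {'n_estimators': [100, 500], 'max_depth': [5, 10]},
        },
    ],
}
# clfs = extract_learners(configs, experiment)  # one classifier per grid point
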
Example No. 11
def main(folder_number):
    """
    There are three steps for this script:
    1. Goes into the folder <pre_dir/folder_number> and iterates over all the .csv files in them (either , or ^ delimiter).
    2. For each file, it uploads the _body.csv to the `raw` schema, with column names a0,a1,a2... for folder 1; b0,b1,b2... for folder 2; etc
    3. It then, for each file, gets the _head.csv, and maps the new column names (a0, a1, b0, c0...) to the old column names (anio, status, departamento...)
    """

    ## Declaring which directory to process
    ## This is taken from the yaml config file 'config.yaml'
    with open(BASE_DIR + '/mined/ETL/config.yaml') as config_file:
        yml_dict = yaml.safe_load(config_file)

    ## variables for retrieving files
    data_parent_dir = os.path.join(yml_dict['base_dir'], yml_dict['pre_dir'])
    data_dir = yml_dict['folder_dictionary'][folder_number]['folder']
    working_dir = os.path.join(data_parent_dir, data_dir)
    ext = yml_dict['extension']
    comma_separated = yml_dict['folder_dictionary']['comma_separated']
    schema = yml_dict['schema_name']
    if folder_number in comma_separated:
        delim = ','
    else:
        delim = '^'

    ## Creating connection to postgres
    logger.info('Connecting to postgres')
    connection, cursor = create_pgconn()
    cursor.execute("set role el_salvador_mined_education_write")

    # START! Retrieve files from specified directory in datos
    logger.info('Retrieving body csv files')
    csv_files = list_files(working_dir, [ext])

    if not csv_files:
        logger.error('No files retrieved. Exiting program')
        return

    logger.debug('Files retrieved: {}'.format(csv_files))

    for csv_f in csv_files:
        file_path_bytes = csv_f.encode('latin-1', 'ignore')
        file_path_str = file_path_bytes.decode('utf-8')

        relative_path = file_path_str.replace(
            data_parent_dir, '')  # Retrieves up to 'data_parent_dir'
        split_path = relative_path.split('/')
        table_name = str(folder_number) + '_' + '_'.join(
            split_path[1:]).replace(ext, '')
        table_name_with_schema = schema + '"' + table_name + '"'

        logger.info('Removing column mapping data')
        cursor.execute(
            f"""select distinct(table_name) from raw.column_mapping where table_name ~ '{schema}"{folder_number}_*'"""
        )
        fetch = cursor.fetchall()  # fetchall() returns a list of single-element tuples
        column_mapping_column_list = [
            string for tup in fetch for string in tup
        ]
        if table_name_with_schema in column_mapping_column_list:
            cursor.execute(
                f"""delete from raw.column_mapping where table_name = '{table_name_with_schema}'"""
            )
        else:
            pass

        # TODO: check if base_filename is > 63 characters due to Postgres tablename limit

        logger.info('Starting on file: {}'.format(csv_f.split('/')[-1]))
        logger.info('Table name will be: {}'.format(table_name_with_schema))

        ## Generate CREATE TABLE STATEMENT
        logger.info('Generating create table statement for table {}'.format(
            table_name_with_schema))
        csv_path_head = csv_f.replace(ext, '_head.csv')
        headers_list = list(read_csv(csv_path_head, sep=delim).columns)
        headers_list = [h.lower().replace("'", "") for h in headers_list]
        col_letter = chr(96 + folder_number)
        new_col_header = [
            col_letter + str(head) for head in range(len(headers_list))
        ]
        drop_table_query = f"""DROP TABLE IF EXISTS {table_name_with_schema};"""
        create_table_query = f"""CREATE TABLE {table_name_with_schema} ({' VARCHAR, '.join(new_col_header) + ' VARCHAR'});"""
        logger.debug(create_table_query)
        column_mapping_data = zip(headers_list, new_col_header,
                                  [table_name_with_schema] * len(headers_list))

        ## Creating table.
        logger.info('Dropping table {}'.format(table_name_with_schema))
        cursor.execute(drop_table_query)
        logger.debug('Drop table execution message: {}'.format(
            cursor.statusmessage))
        logger.info('Executing create table statement {}'.format(
            table_name_with_schema))
        cursor.execute(create_table_query)
        logger.debug('Create table execution message: {}'.format(
            cursor.statusmessage))

        ## Inserting data
        logger.info('Inserting into table')
        with open(csv_f, 'r') as csv_file:
            if folder_number in yml_dict['folder_dictionary'][
                    'comma_separated']:
                cursor.copy_expert(
                    f"""COPY {table_name_with_schema} FROM STDIN WITH CSV DELIMITER ',' QUOTE '"' NULL '' """,
                    csv_file)
            else:
                cursor.copy_expert(
                    f"""COPY {table_name_with_schema} FROM STDIN WITH CSV DELIMITER '^' NULL '' """,
                    csv_file)

            cursor.execute(
                f"""select count(*) from {table_name_with_schema};""")
            logger.info('Count on table {}: {}'.format(
                table_name_with_schema, cursor.fetchone()))

        ## Inserting column mapping data
        logger.info('Inserting column mapping data into col_mapping table')
        #TODO: should the tablename and col names in table be hard coded?
        for data in column_mapping_data:
            insert_query = f"""INSERT INTO raw.column_mapping(original_col_name, mapped_col_name, table_name) VALUES ('{data[0]}', '{data[1]}', '{data[2]}' );"""
            cursor.execute(insert_query)
        connection.commit()
    connection.close()

    logger.info(f'{data_dir} folder is completed')
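
A sketch of the keys main() reads from mined/ETL/config.yaml. The key names come from the code above; every value shown here is a hypothetical placeholder:

import yaml

example_config = yaml.safe_load("""
base_dir: /path/to/data        # hypothetical
pre_dir: preprocessed          # hypothetical
extension: _body.csv           # hypothetical
schema_name: 'raw.'
folder_dictionary:
  comma_separated: [1, 3]      # folders whose csvs use ',' instead of '^'
  1:
    folder: matricula          # hypothetical folder name
""")
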
Example No. 12
def create_time_aggregates(feature_list=None):
    """
    Like in the previous aggregation, boolean features are simplified to any, and numerics are averaged.
    
    The time aggregation pulls from results.features_imputed, staging.labels, results.features_aggregate
    """
    
    conn, cur = create_pgconn()
    cur.execute('set role el_salvador_mined_education_write')
    # These sets are needed below even when an explicit feature_list is passed in
    imputed_features = set(feature_list_from_SQL('features_imputation'))
    nonimputed_features = set(feature_list_from_SQL('features_nonimputed'))
#     onehot_features = set(feature_list_from_SQL('features_onehot'))
    if feature_list is None:
        feature_list = imputed_features.union(nonimputed_features)#.union(onehot_features)
        logger.debug(feature_list)
    
    aggreg_sql = """set role el_salvador_mined_education_write; drop table if exists results.features_timeaggregates; create table results.features_timeaggregates as (select """
    
    aggregate_feature_list = ['year_range','student']
    for feature in feature_list:
        if feature not in ['year_range','student']:
            logger.debug(f'{feature}')
            if feature in imputed_features:
                cur.execute(f"select data_type from information_schema.columns where table_name = 'features_imputed' and column_name='{feature}';")
                dummy_table = 'a'  # dummy_table is just a short hand for calling columns, e.g. a.family_members
            elif feature in nonimputed_features:
                cur.execute(f"select data_type from information_schema.columns where table_name = 'features_aggregate' and column_name='{feature}';")
                dummy_table = 'c'
#             elif feature in onehot_features:
#                 cur.execute(f"select data_type from information_schema.columns where table_name = 'features_onehot' and column_name='{feature}';")
#                 dummy_table = 'd'
            else:
                logger.warning(f'{feature}: not caught by if/else')
            feature_type = cur.fetchall()[0][0]
            logger.debug(f'{feature}: {feature_type}')
            for window in ['1y','2y','3y']:
                if feature_type == 'boolean':
#                     for name in ['any', 'all']:
                    aggreg_sql += f"(bool_and({dummy_table}.{feature}) over w_{window})::int as {feature}_{window}_all,"
                    aggregate_feature_list += [f'{feature}_{window}_all']
                    aggreg_sql += f"(bool_or({dummy_table}.{feature}) over w_{window})::int as {feature}_{window}_any,"
                    aggregate_feature_list += [f'{feature}_{window}_any']
                elif feature_type in ['bigint','smallint','double precision', 'numeric', 'integer']:
                    for name in ['avg','max','min']:
                        aggreg_sql += f"{name}({dummy_table}.{feature}) over w_{window} as {feature}_{window}_{name},"
                        aggregate_feature_list += [f'{feature}_{window}_{name}']
                elif feature_type == 'text':
                    # Text features are aggregated by concatenating strings. e.g. if a student was in Ahuachapán and Usulután in the consecutive years, the departamento_1y column is 'Ahuachapán,Usulután' and the departamento_1y_count is 2
                    aggreg_sql += f"array_to_string(array_agg({dummy_table}.{feature}) over w_{window}, ',')  as {feature}_{window},"
                    aggreg_sql += f"array_length(string_to_array(array_to_string(array_agg(distinct {dummy_table}.{feature}) over w_{window}, ','), ','), 1) as {feature}_{window}_count,"
                    aggregate_feature_list += [f'{feature}_{window}', f'{feature}_{window}_count']
                else:
                    logger.warning(f"Don't know how to convert {feature} with {feature_type}")
    
    previous_dropout_sql = "sum(b.label) over w_label_1y as dropout_1y, sum(b.label) over w_label_2y as dropout_2y, sum(b.label) over w_label_3y as dropout_3y"
    aggregate_feature_list += ['dropout_1y','dropout_2y','dropout_3y']
    
    aggreg_sql = f"-- Feature list: {','.join(aggregate_feature_list)}\n" + aggreg_sql + previous_dropout_sql + " from results.features_imputed a left join staging.labels as b on a.student=b.student and a.year_range=b.year_range left join results.features_aggregate c on a.student=c.student and a.year_range=c.year_range window w_1y as (partition by a.student order by a.year_range asc rows between 1 preceding and current row), w_2y as (partition by a.student order by a.year_range asc rows between 2 preceding and current row), w_3y as (partition by a.student order by a.year_range asc rows between 3 preceding and current row), w_label_1y as (partition by a.student order by a.year_range asc rows between 1 preceding and 1 preceding), w_label_2y as (partition by a.student order by a.year_range asc rows between 2 preceding and 1 preceding), w_label_3y as (partition by a.student order by a.year_range asc rows between 3 preceding and 1 preceding)); create index timeaggregates_year_range_student_idx on results.features_timeaggregates(year_range, student);" # left join results.features_onehot d on a.student=d.student and a.year_range=d.year_range
    
    logger.debug(f'# Features: {len(aggregate_feature_list)}')
    with open(SQL_DIR + '/features_timeaggregates.sql', 'w+') as fil:
        fil.write(aggreg_sql)
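
For one numeric feature over the 2-year window, the generated expression and its window definition have the following shape (family_members is the column mentioned in the comments above; the strings mirror the code exactly):

# Illustrative only:
expression = "avg(a.family_members) over w_2y as family_members_2y_avg,"
window_definition = ("w_2y as (partition by a.student order by a.year_range asc "
                     "rows between 2 preceding and current row)")
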
Example No. 13
def join_tables(folder_number, create_table=True):
    """
        Finds all tables of some particular folder inside raw schema and joins them into a single table in preproc schema.
        In the conversion, the columns are renamed using a .csv file that needs to be manually created in garfield.
        The column renaming can occur in one of two ways, as specified by the config file:
            1. The .csv file provides three columns: old column names, new column names and types.
            2. Two .csv files are provided, one for column names and one for column types. Each row in the .csv's corresponds to a table in raw, and the first column should be the table name. The rest of the columns correspond to the columns in the raw.table, e.g. if you want the third column of raw.super_table to be renamed as something_useful, the .csv should have a row in which the first column is raw.super_table and the fourth column is something_useful. Similarly for types.
        In either mode, if the new column name is not included, it will not appear in the joined table. If the data type is not included, it will be treated as varchar or handled as a special case in change_column_types()
        
        Args:
            folder_number (int): the folder number, used for finding the .csv and the tables in the raw schema
            create_table (bool): whether to create the joined table and populate it
    """
    folder_number = int(folder_number)
    with open(BASE_DIR + '/mined/ETL/config.yaml') as config_file:
        configs = yaml.safe_load(config_file)
    
    # Checking whether the column mapping .csv exists. If not, it prints the names of the original columns and returns None
    # With these original column names, the user can create a .csv that maps the original column names to the desired column names for the database
    if not os.path.isfile(f"""{configs['base_dir']}/{configs['pre_dir']}/{folder_number}_column_mapping.csv"""):
        logger.warning(f"""To join tables you need to create: {folder_number}_column_mapping.csv""")
        [print(x) for x in unique_original_column_names(folder_number)]
        return
    
    conn, cur = create_pgconn()
    cur.execute("set role el_salvador_mined_education_write")
    
    # Some folders contain information about semantically different entities (e.g. folder 1 contains school information but also some aggregate descriptive excels). However, each table has keywords identifying what information they contain (e.g. for folder 1, the keyword is Base_de_Centros). This keyword can be used to join each group of tables into separate tables in the preprocessing schema. This only works for tables that are joined in column_mode_1, where the file_name is included in the csv, allowing the script to select the appropriate columns for the tables being joined.
    if folder_number in configs['join_table_string']:
        string_patterns = configs['join_table_string'][folder_number]
    else:
        string_patterns = ['']
    
    for string_pattern in string_patterns:
        joined_name = f'{folder_number}_joined' + string_pattern
        if folder_number in configs['column_mode_1']:
            # The renaming uses the ordering of the new column names in 1_column_mapping.csv, combined with the automatic naming of columns by csvkit (a,b,c...aa,bb,cc...aaa,bbb,...)
            with open(f"""{configs['base_dir']}/{configs['pre_dir']}/{folder_number}_column_mapping.csv""", 'r') as fil:
                col_file = fil.read()
            col_map = dict()
            all_new_cols = []
            for pair in col_file.split('\n'):
                lst = pair.split(',')
                key = f'raw."{lst[0]}"'
                # The first column in the csv is meant to be the file name, so only select those rows for which the keyword (string_pattern) has been specified
                if string_pattern in key.lower():
                    col_map[key] = dict()
                    new = []
                    mapped = []
                    for idx, new_col in enumerate(lst[1:]):
                        if new_col != '':
                            mapped += ['%s%d' %(chr(96+folder_number), idx)]
                            new += [new_col]
                    col_map[key]['mapped'] = mapped
                    col_map[key]['new'] = new
                    all_new_cols += new
            union = set(all_new_cols)
            create_table_stmnt = 'original_table varchar, year varchar,' + ' varchar, '.join(union) + ' varchar' 
        else:
            logger.debug(f"""Opening {folder_number}_column_mapping.csv""")
            with open(f"""{configs['base_dir']}/{configs['pre_dir']}/{folder_number}_column_mapping.csv""", 'r') as fil:
                col_map = fil.read()
            # Maps the old column names to the new column names and types, using the .csv file. It takes into account that not all old columns will have names (i.e. we leave them behind in the data), and that not all named columns have types (i.e. they stay as varchars)
            logger.debug(f'Creating mapping dictionary')
            col_names=dict()
            col_types=dict()
            union = []
            for pair in col_map.split('\n'):
                split_pair = pair.split(',')
                if split_pair[1] != '':
                    col_names[split_pair[0]] = split_pair[1]
                    if len(split_pair)>2:
                        if split_pair[2] != '':
                            col_types[split_pair[1]] = split_pair[2]
                    union += [split_pair[1]]
            union = set(union)  # In case we want multiple old columns names (in different files) to get remapped to the same new column
            union=set(union).difference(set(['']))
            create_table_stmnt = 'original_table varchar,' + ' varchar, '.join(union) + ' varchar'  

        if create_table:
            logger.debug(f'Fetching list of table names in raw')
            cur.execute(f"""select distinct(table_name) from raw.column_mapping where lower(table_name) ~ 'raw."{folder_number}.*{string_pattern}.*';""")
            table_list = [x[0] for x in cur.fetchall()]
            logger.debug(f'Creating joined table with renamed columns. Dropping it if it exists')
            cur.execute(f"""drop table if exists preproc."{joined_name}";""")
            logger.debug(f'Creating table')
            cur.execute(f"""create table if not exists preproc."{joined_name}" ({create_table_stmnt});""")

            # Iterates over all tables in raw (with columns a0, a1, a2...), inserting into a joined preprocessing table (with columns nie, dpto_code_ce, year...)
            logger.debug('Iterating over tables and inserting into joined table, with renamed columns')
            for table in table_list:
                logger.debug(f'    {table}')
                if folder_number in configs['column_mode_1']:
                    cols = col_map[table]
                    if len(cols['new'])>0:
                        year = re.findall('[0-9]{4}', table)[0] # Extracts the year from the file_name
                        logger.debug(f'Inserting into {joined_name}, with year {year}')
                        cur.execute(f"""insert into preproc."{joined_name}" (original_table, year, {','.join(cols['new'])}) select {"'" + table + "' as original_table," + year + ',' + ','.join(cols['mapped'])} from {table};""")                
                else:
                    cur.execute(f"""select mapped_col_name, original_col_name from raw.column_mapping where table_name='{table}' order by mapped_col_name;""")
                    col_pairs = cur.fetchall()
                    raw_cols = []
                    renamed_cols = []
                    for col_pair in col_pairs:
                        new_name=col_pair[1].replace('"', '')
                        if new_name in col_names:
                            if col_names[new_name] != '':
                                raw_cols += [col_pair[0]]
                                renamed_cols += [col_names[new_name]]
                    cmnd = ' varchar, '.join(renamed_cols) + ' varchar'
                    cur.execute(f"""insert into preproc."{joined_name}" ({'original_table,' + ','.join(renamed_cols)}) select {"'" + table[5:-1] + "' as original_table," + ','.join(raw_cols)} from {table};""")
            logger.debug(f'Finished inserting into {joined_name}')
            conn.commit()
            logger.debug('Committed connection')        
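
A sketch of the per-column mapping format parsed in the else branch above (option 1 in the docstring): each line is original_col_name,new_col_name[,type], an empty new name drops the column, and a missing type leaves it as varchar. The names below are hypothetical:

example_column_mapping_csv = "\n".join([
    "anio,year,int",       # renamed and given an int type
    "nie,student,",        # renamed, left as varchar
    "codigo_interno,,",    # no new name: dropped from the joined table
])
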
Example No. 14
def cleaning(folder_number):
    """
    Changes the column types of the joined table for folder_number and performs any necessary cleaning on the columns.
    The column types are changed following the {folder_number}_column_mapping_types.csv or the {folder_number}_column_mapping.csv columns.
    For boolean columns, it transforms Sí/Si/1 to True and No/0 to False.
    The additional cleaning is done from user-created SQL: SQL_DIR + f'/cleaning_{folder_number}{string_pattern}.sql'
    The format of this SQL is important: it needs to be a series of expressions to be included inside a SELECT clause, optionally followed by two newlines and a condition to be included inside a WHERE clause.
    If the preproc table has a 'year' column, it gets transformed into a year_range column as a date_range.
    
    Args:
        folder_number (int): the folder number, used for adding column type changes specific to that folder
    """
    folder_number = int(folder_number)
    connection, cursor = create_pgconn()
    cursor.execute('set role el_salvador_mined_education_write')

    with open(BASE_DIR + '/mined/ETL/config.yaml') as config_file:
        configs = yaml.safe_load(config_file)

    if folder_number in configs['join_table_string']:
        string_patterns = configs['join_table_string'][folder_number]
    else:
        string_patterns = ['']

    for string_pattern in string_patterns:
        clean_name = f'{folder_number}_cleaned' + string_pattern
        preproc_name = f'{folder_number}_joined' + string_pattern

        if folder_number in configs['column_mode_1']:
            file_name = f"""{configs['base_dir']}/{configs['pre_dir']}/{folder_number}_column_mapping_types.csv"""
            indices = (0, 1)
        else:
            file_name = f"""{configs['base_dir']}/{configs['pre_dir']}/{folder_number}_column_mapping.csv"""
            indices = (1, 2)

        if not os.path.isfile(file_name):
            # TODO: make this print out the columns in the preproc table
            logger.error(f'File does not exist: {file_name}')
            cursor.execute(
                f"select column_name from information_schema.columns where table_name='{preproc_name}' and table_schema='preproc';"
            )
            columns = [x[0] for x in cursor.fetchall()]
            col_types = dict()
        else:
            col_file = read_csv(file_name, header=None)
            columns = []
            col_types = dict()
            for idx, row in col_file.iterrows():
                if not isnull(row[indices[0]]):
                    columns += [row[indices[0]]]
                    if not isnull(row[indices[1]]):
                        col_types[row[indices[0]]] = row[indices[1]]
            columns = set(columns)
            columns = list(columns)
            columns.sort()

        # cleaning_{folder_number}.sql contains cleaning conditions and where conditions separated by ('\n\n')
        sql_cmnd = ''
        cleaned_columns = []
        split = []
        if os.path.isfile(SQL_DIR +
                          f'/cleaning_{folder_number}{string_pattern}.sql'):
            with open(SQL_DIR +
                      f'/cleaning_{folder_number}{string_pattern}.sql') as fil:
                sql_file = fil.read()
            split = sql_file.split('\n\n')
            sql_cmnd += split[0]  # add the cleaning conditions to sql_command
            for line in split[0].split(
                    ' as '
            )[1:]:  # retrieves the new column names created by cleaning conditions ['col_name,\n']
                split2 = line.split(',')
                cleaned_columns += [split2[0]]  # retrieve only col_name

        for col in columns:
            if col not in cleaned_columns:
                if col in col_types:
                    if col_types[col] == 'bool':
                        # TODO: generalise this piece of code, so that it reads from the preproc table the two options for the boolean variable
                        sql_cmnd += f"""CASE NULLIF({col},'') 
                                WHEN 'Sí' THEN '1'::bool
                                WHEN 'Si' THEN '1'::bool
                                WHEN 'No' THEN '0'::bool
                                WHEN '1' then '1'::bool
                                WHEN '0' then '0'::bool
                                END as {col},"""
                    elif 'int' in col_types[col]:
                        sql_cmnd += f"""NULLIF({col},'')::numeric::{col_types[col]} as {col},"""
                    else:
                        sql_cmnd += f"""NULLIF({col},'')::{col_types[col]} as {col},"""
                elif col == 'year':
                    sql_cmnd += f"""daterange(to_date(NULLIF(year,''), 'YYYY'), (to_date(NULLIF(year,''), 'YYYY') + interval '1 year')::date, '[)') as year_range,"""
                else:
                    sql_cmnd += f"""NULLIF({col},'') as {col},"""

        sql_cmnd = sql_cmnd.rstrip(',')
        logger.debug('Dropping table')
        cursor.execute(f"""DROP TABLE IF EXISTS cleaned."{clean_name}";""")
        connection.commit()
        logger.debug('Committed connection')

        logger.debug('Creating table')

        if len(split) > 1:
            logger.debug(
                f'\tUsing:\nCREATE TABLE cleaned."{clean_name}" AS SELECT {sql_cmnd} FROM preproc."{preproc_name}" where {split[1]};'
            )
            cursor.execute(
                f"""CREATE TABLE cleaned."{clean_name}" AS SELECT {sql_cmnd} FROM preproc."{preproc_name}" where {split[1]};"""
            )
        else:
            logger.debug(
                f'\tUsing:\nCREATE TABLE cleaned."{clean_name}" AS SELECT {sql_cmnd} FROM preproc."{preproc_name}";'
            )
            cursor.execute(
                f"""CREATE TABLE cleaned."{clean_name}" AS SELECT {sql_cmnd} FROM preproc."{preproc_name}";"""
            )
        connection.commit()
        logger.debug('Committed connection')
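
A minimal sketch of a cleaning_{folder_number}{string_pattern}.sql file in the format the docstring describes: SELECT-clause expressions (each aliased with ' as '), then a blank line, then a WHERE condition. The column names are hypothetical:

example_cleaning_sql = (
    "case when edad ~ '^[0-9]+$' then edad::int end as edad,\n"
    "\n"
    "nie is not null and nie != ''"
)
# splitting on '\n\n', as cleaning() does, yields the SELECT fragment and the WHERE condition
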
Example No. 15
def calculate_splits(config_dict, experiment, rerun=False):
    """
        Creates a table in staging with each row being a split.

        :param config_dict: dictionary whose temporal_splits entry contains data_starts, data_ends, train_window, test_window, label_window, and step
        :type config_dict: dict
        :param experiment: experiment identifier
        :type experiment: str

        :return: data frame containing the experiment and the following time ranges: train_span, test_span, train_label_span, test_label_span
        :rtype: data_frame
    """

    if rerun:
        conn, cur = create_pgconn()
        cur.execute('set role el_salvador_mined_education_write;')
        return pd.read_sql(
            f"""SELECT * FROM results.splits where experiment='{experiment}';""",
            conn)

    logger.debug(config_dict['temporal_splits'])

    split_dict = config_dict['temporal_splits']
    data_start = split_dict['data_starts']
    data_end = split_dict['data_ends']
    index = 0
    for train_window, test_window, label_window, step in product(
            split_dict['train_window'], split_dict['test_window'],
            split_dict['label_window'], split_dict['step']):
        train_window = relativedelta(**parse_date_string(train_window))
        test_window = relativedelta(**parse_date_string(test_window))
        label_window = relativedelta(**parse_date_string(label_window))
        step = relativedelta(**parse_date_string(step))

        test_end = data_end - label_window
        #test_start = test_end - test_window
        test_start = test_end - label_window
        train_end = test_end - label_window

        conn, cur = create_pgconn()
        while train_end > data_start:
            train_start = train_end - train_window
            if train_start < data_start:
                train_start = data_start
            logger.debug('SPLIT')
            logger.debug(
                f'\tTrain span: {train_start} {train_end}\n\tTest span: {test_start} {test_end}\n\tTrain label span: {train_start} {train_end + label_window}\n\tTest label span: {test_start} {test_end + label_window}'
            )
            cur.execute("set role el_salvador_mined_education_write;")
            cur.execute(f"""INSERT INTO results.splits
                (
                split,
                experiment,
                train_span,
                test_span,
                train_label_span,
                test_label_span
                )
            VALUES
                (
                {index},
                '{experiment}',
                daterange('{train_start}', '{train_end}', '[)'),
                daterange('{test_start}', '{test_end}', '[]'),
                daterange('{train_start}', '{train_end + label_window}', '[)'),
                daterange('{test_start}', '{test_end + label_window}', '[)')
                )""")
            index += 1
            test_end -= step
            #test_start = test_end - test_window
            test_start = test_end - label_window
            train_end = test_end - label_window
        conn.commit()
    return pd.read_sql(
        f"""SELECT * FROM results.splits where experiment='{experiment}'""",
        conn)
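
A minimal sketch of the config_dict['temporal_splits'] block this function iterates over. The key names come from the code above; the dates are hypothetical and the exact window-string format accepted by parse_date_string is an assumption:

import datetime

config_dict = {
    'temporal_splits': {
        'data_starts': datetime.date(2009, 1, 1),  # hypothetical start of the data
        'data_ends': datetime.date(2017, 1, 1),    # hypothetical end of the data
        'train_window': ['5y'],   # lists: the function takes the product over all combinations
        'test_window': ['1y'],
        'label_window': ['1y'],
        'step': ['1y'],
    }
}
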
Example No. 16
def train_test_model(learner,
                     X_train,
                     y_train,
                     X_test,
                     experiment,
                     split_index,
                     learner_index,
                     model_index,
                     normalize=False):
    '''
        Fits a learner on (X_train, y_train) and returns the scores predicted for X_test.
        Stores the model metadata (experiment, split_index, learner_index) in the database.
    '''

    X_test_df = X_test

    if normalize:
        logger.debug(f"Normalize flag TRUE. Normalizing data")
        scaler = normalize_data(X_train)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test_df)

    logger.debug(
        f"SPLIT {split_index}, MODEL {model_index}. Training learner {learner}"
    )
    t0 = datetime.datetime.now()
    learner.fit(X=X_train, y=y_train)
    train_time = round(((datetime.datetime.now() - t0).total_seconds()) / 60.0,
                       3)

    logger.debug(
        f"SPLIT {split_index}, MODEL {model_index}. Testing learner {learner}."
    )
    t0 = datetime.datetime.now()
    scores = learner.predict_proba(X=X_test)[:, 1]
    scores_with_index = list(zip(list(X_test_df.index), scores))

    test_time = round(((datetime.datetime.now() - t0).total_seconds()) / 60.0,
                      3)

    conn, cur = create_pgconn()
    logger.debug(
        f"SPLIT {split_index}, MODEL {model_index}. Saving learner {learner} into database"
    )
    cur.execute('set role el_salvador_mined_education_write;')
    cur.execute(f"""INSERT INTO results.models
            (
            model,
            experiment,
            learner,
            split,
            total_training_time,
            total_testing_time
            )
        VALUES
            (
            '{model_index}',
            '{experiment}',
            '{learner_index}',
            '{split_index}',
            '{train_time}',
            '{test_time}'
            )""")
    conn.commit()
    conn.close()
    logger.debug('Connection committed')

    return scores_with_index
Example No. 17
from mined.utils import create_pgconn, PLOTS_DIR, logger, SQL_DIR

from sklearn.externals.six import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus
import seaborn as sns

import plotnine
from plotnine import *
from matplotlib import figure

import pandas as pd

conn, cur = create_pgconn()
cur.execute('set role el_salvador_mined_education_write')


# This function evaluates temporal stability of models
def eval_models(name, experiment_list, model_list, metric, all_models,
                output_type):

    if metric == 'auc':
        models = pd.read_sql(
            f"""select model, experiment, learner, algorithm, hyperparameters, auc as metric, extract(year from lower(test_span))::varchar as year from results.dashboard""",
            conn)
    else:
        k = int(metric.split("_", 1)[1])
        models = pd.read_sql(
            f"""select model, experiment, learner, algorithm, hyperparameters, precision_k[{k+1}] as metric, extract(year from lower(test_span))::varchar as year from results.dashboard""",
            conn)
Example No. 18
def get_matrices(split, config, experiment, split_index, debug=False):
    """
        Builds the train and test matrices for one temporal split, pulling the configured feature groups from the database.

        :param split: row from results.splits containing train_span and test_span
        :param config: experiment config dictionary (features, grado, ...)
        :param experiment: experiment identifier
        :param split_index: index of the split within the experiment
        :param debug: if True, read from the *_debug feature and label tables

        :return: X_train, y_train, X_test, y_test
        :rtype: data_frame
    """

    #     features_table = config['features_table']
    grado = config['grado']
    #     feature_zip = list(zip(['features.']*len(config['features']), config['features']))
    #     feature_list = ','.join([''.join(feat) for feat in feature_zip])
    if debug:
        debug_string = '_debug'
    else:
        debug_string = ''

    index_train = split['train_span']
    index_test = split['test_span']
    low_train, high_train = index_train.lower, index_train.upper
    low_test, high_test = index_test.lower, index_test.upper

    feature_groups = config['features']
    all_features = set()
    for features in feature_groups.values():
        all_features = all_features.union(set(features))

    # For each feature group, create a CTE selecting the appropriate features over the appropriate amount of time for training and testing (train_sql and test_sql). After that, all the CTEs are left-joined on year and student, with all_features being selected (select_sql and final_sql). This fails if the different tables have features that are called the same.
    train_tmp = ''
    test_tmp = ''
    train_sql = None
    test_sql = None
    for feature_group, features in feature_groups.items():
        logger.debug(f"Extracting features from {feature_group}")
        train_tmp += f"create temp table {feature_group}_train as (select year_range, student, {','.join(features)} from results.{feature_group}{debug_string} WHERE year_range <@ daterange(quote_literal('{low_train}')::date, quote_literal('{high_train}')::date)); create index {feature_group}_train_idx on {feature_group}_train(year_range, student);"
        test_tmp += f"create temp table {feature_group}_test as (select year_range, student, {','.join(features)} from results.{feature_group}{debug_string} WHERE year_range <@ daterange(quote_literal('{low_test}')::date, quote_literal('{high_test}')::date)); create index {feature_group}_test_idx on {feature_group}_test(year_range, student);"
        if train_sql is None:
            train_sql = f" select year_range, student, {','.join(all_features)}, labels.label as label from {feature_group}_train a"  # 100*random() as random_feature,
            test_sql = f" select year_range, student, {','.join(all_features)}, labels.label as label from {feature_group}_test a"  # 100*random() as random_feature,
        else:
            train_sql += f" left join {feature_group}_train using (year_range, student)"
            test_sql += f" left join {feature_group}_test using (year_range, student)"

    if grado == '*':
        logger.debug('Running all grades')
        train_sql += f" left join staging.labels{debug_string} as labels using (year_range, student) where labels.label is not null;"
        test_sql += f" left join staging.labels{debug_string} as labels using (year_range, student) where labels.label is not null;"
    elif grado != '*':
        train_sql += f" left join staging.labels{debug_string} as labels using (year_range, student) where labels.label is not null and grado_code = {grado};"
        test_sql += f" left join staging.labels{debug_string} as labels using (year_range, student) where labels.label is not null and grado_code = {grado};"
    else:
        raise ValueError('Grado not recognised')
    train_sql = train_tmp + train_sql
    test_sql = test_tmp + test_sql
    conn, cursor = create_pgconn()
    cursor.execute('set role el_salvador_mined_education_write')

    t0 = datetime.datetime.now()
    logger.debug('Retrieving training generator')
    gener = pd.read_sql(train_sql, conn, chunksize=100000)
    logger.debug('Retrieving training dataframe')
    train = pd.concat(gener)
    logger.debug(f'TRAIN TIME CHUNKED: {datetime.datetime.now()-t0}')
    logger.debug(train.info(memory_usage='deep'))
    del gener
    gc.collect()

    t0 = datetime.datetime.now()
    logger.debug('Retrieving testing generator')
    gener = pd.read_sql(test_sql, conn, chunksize=100000)
    logger.debug('Retrieving testing dataframe')
    test = pd.concat(gener)
    logger.debug(f'TEST TIME CHUNKED: {datetime.datetime.now()-t0}')
    logger.debug(test.info(memory_usage='deep'))
    del gener
    gc.collect()

    train.set_index(['year_range', 'student'], inplace=True)
    test.set_index(['year_range', 'student'], inplace=True)

    logger.debug('Training and testing dataframes retrieved.')

    feature_cols = train.columns != train.columns[-1]  # boolean mask: every column except the label
    label_col = train.columns[-1]

    logger.debug("Creating train and test matrix split")
    X_train = train.loc[:, feature_cols]
    X_train = X_train.fillna(value=-1)
    y_train = train.loc[:, label_col]
    y_train = y_train.fillna(value=-1)
    logger.debug(
        f"Train size: X_train={X_train.shape}, y_train={y_train.shape}")

    X_test = test.loc[:, feature_cols]
    X_test = X_test.fillna(value=-1)
    y_test = test.loc[:, label_col]
    y_test = y_test.fillna(value=-1)

    del train, test
    gc.collect()

    logger.debug(f"Test size: X_test={X_test.shape}, y_test={y_test.shape}")

    pkl_list = [
        os.path.join(MATRICES_DIR,
                     str(experiment) + "_" + str(split_index) + '_trainX.pkl'),
        os.path.join(MATRICES_DIR,
                     str(experiment) + "_" + str(split_index) + '_trainY.pkl'),
        os.path.join(MATRICES_DIR,
                     str(experiment) + "_" + str(split_index) + '_testX.pkl'),
        os.path.join(MATRICES_DIR,
                     str(experiment) + "_" + str(split_index) + '_testY.pkl')
    ]
    X_train.to_pickle(pkl_list[0])
    y_train.to_pickle(pkl_list[1])
    X_test.to_pickle(pkl_list[2])
    y_test.to_pickle(pkl_list[3])

    for pkl in pkl_list:
        file_name = os.path.basename(pkl)
        cursor.execute(
            f"""INSERT INTO results.matrices (experiment, matrix, split)
        VALUES (
            '{experiment}',
            '{file_name}',
            '{split_index}')""")

    conn.commit()
    conn.close()

    return X_train, y_train, X_test, y_test
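
A sketch of the config mapping get_matrices() expects for its feature groups: keys are table names under the results schema and values are the columns pulled from each. The grouping below is hypothetical, with names borrowed from earlier examples:

config = {
    'grado': '*',  # '*' means all grades
    'features': {
        'features_nonimputed': ['family_members_avg'],
        'features_timeaggregates': ['family_members_2y_avg', 'dropout_1y'],
    },
}
# X_train, y_train, X_test, y_test = get_matrices(split, config, experiment, split_index)
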