def create_df(data_type, db_table, pivot=False,
              index=['age', 'race_ethn', 'sex'], rate_id=None):
    """
    Create pandas DataFrame from database SQL query to select base population
    or rate versions to be used in model.

    Args:
        data_type : string
            type of data (e.g. birth, migration, population); also the name
            of the query template attribute looked up in sql.py
        db_table : string
            key of the database table name in the 'db_tables' section of
            model_config.yml
        pivot : boolean, optional (default False)
            when True the query result is passed through util.apply_pivot
        index : list of strings or None, optional
            columns to set as the index of the result; None leaves the
            query result un-indexed
        rate_id : optional (default None)
            version id substituted into the query; when None the id stored
            under rate_versions[data_type] in model_config.yml is used

    Returns:
        df_sql_result : pandas DataFrame
            SQL query result
    """
    # NOTE: the default ``index`` list is only read, never mutated.

    # connect to database using SQLAlchemy
    db_connection_string = database.get_connection_string(
        'model_config.yml', 'in_db')
    sql_in_engine = create_engine(db_connection_string)

    # database table names to query
    tables = util.yaml_to_dict('model_config.yml', 'db_tables')

    # fall back to the rate version configured for the current model
    if rate_id is None:
        rate_versions = util.yaml_to_dict('model_config.yml', 'rate_versions')
        rate_id = rate_versions[data_type]

    # build query from sql.py
    # use database table name and rate version id
    in_query = getattr(sql, data_type) % (tables[db_table], rate_id)

    # pandas DataFrame from query; the engine is created for this one query,
    # so release its pooled connections once the read is finished
    try:
        df_sql_result = pd.read_sql(in_query, sql_in_engine)
    finally:
        sql_in_engine.dispose()

    # Special case for migration rates: pivot DataFrame since 4 rates in cols
    # rates are: domestic in, domestic out, foreign in, foreign out
    if pivot:
        df_sql_result = util.apply_pivot(df_sql_result)

    # create MultiIndex on cohort attributes
    if index is not None:
        df_sql_result = df_sql_result.set_index(index)

    return df_sql_result
def create_df(data_type, db_table, pivot=False):
    """
    Create pandas DataFrame from database SQL query to select base population
    or rate versions to be used in model.

    Args:
        data_type : string
            type of data (e.g. birth, migration, population); also the name
            of the query template attribute looked up in sql.py
        db_table : string
            key of the database table name in the 'db_tables' section of
            model_config.yml
        pivot : boolean, optional (default False)
            when True the query result is passed through util.apply_pivot

    Returns:
        df_sql_result : pandas DataFrame
            SQL query result indexed by 'age', 'race_ethn' and 'sex'
    """
    # connect to database using SQLAlchemy
    db_connection_string = database.get_connection_string(
        'model_config.yml', 'in_db')
    sql_in_engine = create_engine(db_connection_string)

    # retrieve rate versions for current model and database table names to query
    rate_versions = util.yaml_to_dict('model_config.yml', 'rate_versions')
    tables = util.yaml_to_dict('model_config.yml', 'db_tables')

    # build query from sql.py
    # use database table name and rate versions from .yml file
    in_query = getattr(sql, data_type) % (
        tables[db_table], rate_versions[data_type])

    # pandas DataFrame from query; the engine only serves this one query,
    # so release its pooled connections once the read is finished
    try:
        df_sql_result = pd.read_sql(in_query, sql_in_engine)
    finally:
        sql_in_engine.dispose()

    # Special case for migration rates: pivot DataFrame since 4 rates in cols
    # rates are: domestic in, domestic out, foreign in, foreign out
    if pivot:
        df_sql_result = util.apply_pivot(df_sql_result)

    # create MultiIndex on cohort attributes
    df_sql_result = df_sql_result.set_index(['age', 'race_ethn', 'sex'])

    return df_sql_result
def run(self):
    """
    Store the latest run id and per-year 'mil_gc_pop' / 'mil_hh_pop' totals
    in temp/data.h5 (keys 'run_id' and 'mil_pop').
    """
    out_engine = create_engine(
        get_connection_string("model_config.yml", 'output_database'))
    in_engine = create_engine(
        database.get_connection_string('model_config.yml', 'in_db'))

    # first row of the 'max' column returned by the max_run_id query
    latest_run = pd.read_sql(sql.max_run_id, out_engine, index_col=None)
    run_id = pd.Series([latest_run['max'].iloc[0]])
    run_id.to_hdf('temp/data.h5', 'run_id', mode='a')

    rate_versions = util.yaml_to_dict('model_config.yml', 'rate_versions')
    tables = util.yaml_to_dict('model_config.yml', 'db_tables')

    gc_query = sql.inc_mil_gc_pop % (tables['inc_pop_table'], run_id[0])
    hh_query = sql.inc_mil_hh_pop % (
        tables['population_table'], rate_versions['population'])

    cohort_cols = ['age', 'race_ethn', 'sex']
    pop = pd.read_sql(gc_query, out_engine, index_col=cohort_cols)
    pop_mil = pd.read_sql(hh_query, in_engine, index_col=cohort_cols)

    # keep only the rows flagged mildep == 'Y' before joining on the cohort
    is_mildep = pop_mil['mildep'] == 'Y'
    pop_mil = pop_mil.loc[is_mildep]
    pop = pop.join(pop_mil)

    pop = pop.rename(columns={'persons': 'mil_gc_pop',
                              'mil_mildep': 'mil_hh_pop'})
    pop = pop.reset_index(drop=False)

    # sum both population columns by year
    yearly_totals = pop[['mil_gc_pop', 'mil_hh_pop']].groupby([pop['yr']]).sum()
    pop = pd.DataFrame(yearly_totals)
    pop.to_hdf('temp/data.h5', 'mil_pop', mode='a')
def run(self):
    """
    Log a new run and store the adult population by year and age category
    in temp/data.h5 (keys 'run_id' and 'pop').

    When temp/data.h5 already exists a message is printed and nothing else
    is done.
    """
    my_file = Path('temp/data.h5')
    if my_file.is_file():
        # call form of print is valid on both Python 2 and Python 3
        print('File exists')
        # NOTE(review): the source layout did not show where the else branch
        # ends; this assumes all remaining work belongs to it -- confirm.
        return

    db_run_id = log.new_run(name='inc_run_log')
    run_id = pd.Series([db_run_id])
    run_id.to_hdf('temp/data.h5', 'run_id', mode='a')

    engine = create_engine(
        get_connection_string("model_config.yml", 'output_database'))
    db_connection_string = database.get_connection_string(
        'model_config.yml', 'in_db')
    sql_in_engine = create_engine(db_connection_string)

    rate_versions = util.yaml_to_dict('model_config.yml', 'rate_versions')
    tables = util.yaml_to_dict('model_config.yml', 'db_tables')

    in_query = sql.inc_pop % (
        tables['inc_pop_table'], rate_versions['inc_pop'])
    in_query2 = sql.inc_pop_mil % (
        tables['population_table'], rate_versions['population'])

    index_cols = ['age', 'race_ethn', 'sex', 'mildep']
    pop = pd.read_sql(in_query, engine, index_col=index_cols)
    pop_mil = pd.read_sql(in_query2, sql_in_engine, index_col=index_cols)

    # subtract the joined 'mil_mildep' column from 'persons'
    pop = pop.join(pop_mil)
    pop['persons'] = (pop['persons'] - pop['mil_mildep'])
    pop = pop.reset_index(drop=False)

    pop = pop[pop['age'] >= 18]

    # (first age, one past last age, label); ages outside every bin keep ''
    age_bins = [
        (18, 25, '18_24'),
        (25, 35, '25_34'),
        (35, 45, '35_44'),
        (45, 55, '45_54'),
        (55, 60, '55_59'),
        (60, 65, '60_64'),
        (65, 75, '65_74'),
        (75, 103, '75_99'),
    ]
    pop['age_cat'] = ''
    for first_age, stop_age, label in age_bins:
        in_bin = pop['age'].isin(list(range(first_age, stop_age)))
        pop.loc[in_bin, ['age_cat']] = label

    pop = pd.DataFrame(
        pop['persons'].groupby([pop['yr'], pop['age_cat']]).sum())
    pop.to_hdf('temp/data.h5', 'pop', mode='a')
def new_run(name='runs'):
    """
    Insert a row holding the configured rate versions into the run-log table
    of the output database and return its primary key.

    The table is created in schema 'defm' when engine.has_table reports it
    missing. The 'results/' directory is created if it does not exist.

    Args:
        name : string, optional (default 'runs')
            name of the run-log table

    Returns:
        run_id : value of the 'id' column of the inserted row
    """
    Base = declarative_base()
    table_name = name

    class Run(Base):
        __tablename__ = table_name
        __table_args__ = {'schema': 'defm'}
        # define columns for the table
        id = Column(Integer, primary_key=True)
        base_rate_version = Column(Integer)
        birth_rate_version = Column(Integer)
        death_rate_version = Column(Integer)
        migration_rate_version = Column(Integer)
        householder_rate_version = Column(Integer)

    db_dir = 'results/'
    if not os.path.exists(db_dir):
        os.makedirs(db_dir)

    engine = create_engine(
        get_connection_string("model_config.yml", 'output_database')
    ).execution_options(schema_translate_map={
        None: "defm",  # no schema name -> "defm"
    })
    Base.metadata.schema = 'defm'
    if not engine.has_table(table_name, schema='defm'):
        Base.metadata.create_all(engine)

    # Rate versions from yml file
    rate_versions = util.yaml_to_dict('model_config.yml', 'rate_versions')

    db_session = sessionmaker(bind=engine)
    session = db_session()
    try:
        # Insert versions in database
        model_run = Run(
            base_rate_version=rate_versions['population'],
            birth_rate_version=rate_versions['birth'],
            death_rate_version=rate_versions['death'],
            migration_rate_version=rate_versions['migration'],
            householder_rate_version=rate_versions['householder'])
        session.add(model_run)
        session.commit()
        # read the generated key while the session is still open
        run_id = model_run.id
    finally:
        # always hand the connection back, also when the insert fails
        session.close()

    return run_id
def run(self):
    """
    Store the run id, both simulation rate tables and the adult population
    by year and age category in temp/data.h5.

    Keys written: 'run_id', 'dem_sim_rates', 'econ_sim_rates', 'pop'.
    Reads self.dem_id and self.econ_id.
    """
    h5_path = 'temp/data.h5'

    out_engine = create_engine(
        get_connection_string("model_config.yml", 'output_database'))
    in_engine = create_engine(
        database.get_connection_string('model_config.yml', 'in_db'))

    # first row of the 'id' column returned by the max_run_id query
    latest_run = pd.read_sql(sql.max_run_id, out_engine, index_col=None)
    run_id = pd.Series([latest_run['id'].iloc[0]])
    run_id.to_hdf(h5_path, 'run_id', mode='a')

    dem_sim_rates = extract.create_df(
        'dem_sim_rates', 'dem_sim_rates_table',
        rate_id=self.dem_id, index=None)
    dem_sim_rates.to_hdf(h5_path, 'dem_sim_rates', mode='a')

    econ_sim_rates = extract.create_df(
        'econ_sim_rates', 'econ_sim_rates_table',
        rate_id=self.econ_id, index=None)
    econ_sim_rates.to_hdf(h5_path, 'econ_sim_rates', mode='a')

    tables = util.yaml_to_dict('model_config.yml', 'db_tables')

    pop_query = sql.inc_pop % (tables['inc_pop_table'], run_id[0])
    mil_query = sql.inc_mil_hh_pop % (
        tables['population_table'], dem_sim_rates.base_population_id[0])

    join_cols = ['age', 'race_ethn', 'sex', 'mildep']
    pop = pd.read_sql(pop_query, out_engine, index_col=join_cols)
    pop_mil = pd.read_sql(mil_query, in_engine, index_col=join_cols)

    # subtract the joined 'mil_mildep' column from 'persons'
    pop = pop.join(pop_mil)
    pop['persons'] = (pop['persons'] - pop['mil_mildep'])
    pop = pop.reset_index(drop=False)

    pop = pop[pop['age'] >= 18]

    # (first age, one past last age, label); ages outside every bin keep ''
    age_bins = (
        (18, 25, '18_24'),
        (25, 35, '25_34'),
        (35, 45, '35_44'),
        (45, 55, '45_54'),
        (55, 60, '55_59'),
        (60, 65, '60_64'),
        (65, 75, '65_74'),
        (75, 103, '75_99'),
    )
    pop['age_cat'] = ''
    for first_age, stop_age, label in age_bins:
        in_bin = pop['age'].isin(list(range(first_age, stop_age)))
        pop.loc[in_bin, ['age_cat']] = label

    persons_by_group = pop['persons'].groupby(
        [pop['yr'], pop['age_cat']]).sum()
    pop = pd.DataFrame(persons_by_group)
    pop.to_hdf(h5_path, 'pop', mode='a')
def new_run(db_name):
    """
    Insert a row holding the configured rate versions into the 'run_log'
    table of a SQLite database under 'results/' and return its primary key.

    The 'results/' directory and the table are created when missing.

    Args:
        db_name : string
            file name of the SQLite database inside 'results/'

    Returns:
        run_id : value of the 'id' column of the inserted row
    """
    Base = declarative_base()
    table_name = 'run_log'

    class Run(Base):
        __tablename__ = table_name
        # define columns for the table
        id = Column(Integer, primary_key=True)
        base_rate_version = Column(Integer)
        birth_rate_version = Column(Integer)
        death_rate_version = Column(Integer)
        migration_rate_version = Column(Integer)
        householder_rate_version = Column(Integer)

    db_dir = 'results/'
    if not os.path.exists(db_dir):
        os.makedirs(db_dir)

    engine = create_engine('sqlite:///' + db_dir + db_name)
    if not engine.has_table(table_name):
        Base.metadata.create_all(engine)

    # Rate versions from yml file
    rate_versions = util.yaml_to_dict('model_config.yml', 'rate_versions')

    db_session = sessionmaker(bind=engine)
    session = db_session()
    try:
        # Insert versions in database
        model_run = Run(
            base_rate_version=rate_versions['population'],
            birth_rate_version=rate_versions['birth'],
            death_rate_version=rate_versions['death'],
            migration_rate_version=rate_versions['migration'],
            householder_rate_version=rate_versions['householder'])
        session.add(model_run)
        session.commit()
        # read the generated key while the session is still open
        run_id = model_run.id
    finally:
        # always release the connection, also when the insert fails
        session.close()

    return run_id
def run(self):
    """
    Compute newborns for self.year, apply death rates to them, and update
    the 'dead_pop' and 'new_born' tables in temp/data.h5.

    Reads 'birth_rates', 'non_mig_pop', 'death_rates' and 'dead_pop' from
    temp/data.h5 and random numbers through extract.create_df.
    """
    birth_rates = pd.read_hdf('temp/data.h5', 'birth_rates')
    pop = pd.read_hdf('temp/data.h5', 'non_mig_pop')

    # restrict to rows with type 'HHP' and mildep 'N'
    pop = pop[(pop['type'] == 'HHP') & (pop['mildep'] == 'N')]
    birth_rates = utils.rates_for_yr(pop, birth_rates, self.year)
    birth_rates = birth_rates[(birth_rates['yr'] == self.year)]

    rate_versions = util.yaml_to_dict('model_config.yml', 'rate_versions')
    random_numbers = extract.create_df(
        'random_numbers', 'random_numbers_table',
        rate_id=rate_versions['random_numbers'])
    random_numbers = random_numbers[(random_numbers['yr'] == self.year)]
    random_numbers = random_numbers[['random_number']]

    births_per_cohort = cp.births_all(
        birth_rates, pop_col='non_mig_pop', rand_df=random_numbers)

    death_rates = pd.read_hdf('temp/data.h5', 'death_rates')
    death_rates = death_rates[(death_rates['yr'] == self.year)]

    # sum newborn population across cohorts
    newborn = cp.births_sum(births_per_cohort, self.year)

    newborn = newborn.join(death_rates)
    newborn['new_deaths'] = (
        newborn['new_born'] * newborn['death_rate']).round()
    newborn['new_born_survived'] = (
        newborn['new_born'] - newborn['new_deaths']).round()

    # add newborn deaths to the stored deaths; rows without a match count as 0
    dead_pop = pd.read_hdf('temp/data.h5', 'dead_pop')
    dead_pop = dead_pop.join(newborn['new_deaths'])
    dead_pop = dead_pop.fillna(0)
    dead_pop['deaths_hhp_non_mil'] = (
        dead_pop['deaths_hhp_non_mil'] + dead_pop['new_deaths']).round()

    # axis is passed by keyword: the positional form is rejected by pandas 2
    dead_pop = dead_pop.drop(['new_deaths'], axis=1)
    dead_pop.to_hdf('temp/data.h5', 'dead_pop', mode='a')

    newborn = newborn.drop(['new_deaths', 'death_rate'], axis=1)
    newborn.to_hdf('temp/data.h5', 'new_born', mode='a')
# start the clock for the script run time
start_time = time.time()

# work from the directory holding this script so the .yml config is found
full_path = os.path.abspath(inspect.getfile(inspect.currentframe()))
script_dir = os.path.dirname(full_path)
os.chdir(script_dir)

# console display: repeat MultiIndex labels on every row
pd.set_option('display.multi_sparse', False)

# record the rate versions in the result database; keep the new row's key
db_run_id = log.new_run('defm.db')

years = util.yaml_to_dict('model_config.yml', 'years')

# Rates for all years, one DataFrame per rate type (SQL query -> pandas)
#   columns: 'age', 'race_ethn', 'sex' (cohort), 'rate', 'year'
#   migration is pivoted: domestic in & out, foreign in & out
birth_rates = extract.create_df('birth', 'rate_table')
death_rates = extract.create_df('death', 'rate_table')
mig_rates = extract.create_df('migration', 'rate_table', pivot=True)

# Base population (SQL query -> pandas)
#   columns: 'age', 'race_ethn', 'sex' (cohort),
#            'gq.type', 'mildep', 'persons', 'households'
population = extract.create_df('population', 'population_table')
def run(self):
    """
    Store the latest run id and the population by year, age category, sex
    and race/ethnicity in temp/data.h5 (keys 'run_id' and 'pop').
    """
    h5_path = 'temp/data.h5'

    out_engine = create_engine(
        get_connection_string("model_config.yml", 'output_database'))
    in_engine = create_engine(
        database.get_connection_string('model_config.yml', 'in_db'))

    # first row of the 'max' column returned by the max_run_id query
    latest_run = pd.read_sql(sql.max_run_id, out_engine, index_col=None)
    run_id = pd.Series([latest_run['max'].iloc[0]])
    run_id.to_hdf(h5_path, 'run_id', mode='a')

    rate_versions = util.yaml_to_dict('model_config.yml', 'rate_versions')
    tables = util.yaml_to_dict('model_config.yml', 'db_tables')

    pop_query = sql.inc_pop % (tables['inc_pop_table'], run_id[0])
    mil_query = sql.inc_mil_hh_pop % (
        tables['population_table'], rate_versions['population'])

    join_cols = ['age', 'race_ethn', 'sex', 'mildep']
    pop = pd.read_sql(pop_query, out_engine, index_col=join_cols)
    pop_mil = pd.read_sql(mil_query, in_engine, index_col=join_cols)

    # subtract the joined 'mil_mildep' column from 'persons'
    pop = pop.join(pop_mil)
    pop['persons'] = (pop['persons'] - pop['mil_mildep'])
    pop = pop.reset_index(drop=False)

    # (first age, one past last age, label); ages outside every bin keep ''
    age_bins = (
        (0, 5, '00_04'),
        (5, 10, '05_09'),
        (10, 15, '10_14'),
        (15, 18, '15_17'),
        (18, 20, '18_19'),
        (20, 21, '20_20'),
        (21, 22, '21_21'),
        (22, 25, '22_24'),
        (25, 30, '25_29'),
        (30, 35, '30_34'),
        (35, 40, '35_39'),
        (40, 45, '40_44'),
        (45, 50, '45_49'),
        (50, 55, '50_54'),
        (55, 60, '55_59'),
        (60, 62, '60_61'),
        (62, 65, '62_64'),
        (65, 67, '65_66'),
        (67, 70, '67_69'),
        (70, 75, '70_74'),
        (75, 80, '75_79'),
        (80, 85, '80_84'),
        (85, 103, '85_99'),
    )
    pop['age_cat'] = ''
    for first_age, stop_age, label in age_bins:
        in_bin = pop['age'].isin(list(range(first_age, stop_age)))
        pop.loc[in_bin, ['age_cat']] = label

    persons_by_group = pop['persons'].groupby(
        [pop['yr'], pop['age_cat'], pop['sex'], pop['race_ethn']]).sum()
    pop = pd.DataFrame(persons_by_group)
    pop.to_hdf(h5_path, 'pop', mode='a')
# start the clock for the script run time
start_time = time.time()

# work from the directory holding this script so the .yml config is found
full_path = os.path.abspath(inspect.getfile(inspect.currentframe()))
script_dir = os.path.dirname(full_path)
os.chdir(script_dir)

# console display: repeat MultiIndex labels on every row
pd.set_option('display.multi_sparse', False)

# record the rate versions in the result database; keep the new row's key
db_run_id = log.new_run()

years = util.yaml_to_dict('model_config.yml', 'years')

# Rates for all years, one DataFrame per rate type (SQL query -> pandas)
#   columns: 'age', 'race_ethn', 'sex' (cohort), 'rate', 'year'
#   migration is pivoted: domestic in & out, foreign in & out
birth_rates = extract.create_df('birth', 'rate_table')
death_rates = extract.create_df('death', 'rate_table')
mig_rates = extract.create_df('migration', 'rate_table', pivot=True)

# Base population (SQL query -> pandas)
#   columns: 'age', 'race_ethn', 'sex' (cohort),
#            'gq.type', 'mildep', 'persons', 'households'
population = extract.create_df('population', 'population_table')
def requires(self):
    """
    Return one ExportTables task per year, from years['y1'] through
    years['yf'] inclusive, as configured in model_config.yml.
    """
    year_config = util.yaml_to_dict('model_config.yml', 'years')
    first_year = year_config['y1']
    stop_year = year_config['yf'] + 1  # range() excludes its upper bound
    return [ExportTables(year) for year in range(first_year, stop_year)]