def configure_and_run(project, geo, varCorrDict): f = open('%s%s%s%sindexMatrix_99999.pkl'%(project.location, os.path.sep, project.name, os.path.sep), 'rb') index_matrix = cPickle.load(f) f.close() state, county, pumano, tract, bg = geo.state, geo.county, geo.puma5, geo.tract, geo.bg print '------------------------------------------------------------------' print 'Geography: County - %s, PUMA ID- %s, Tract ID- %0.2f, BG ID- %s' \ %(county, pumano, float(tract)/100, bg) print '------------------------------------------------------------------' db = MySQLdb.connect(host = '%s' %project.db.hostname, user = '******' %project.db.username, passwd = '%s' %project.db.password, db = '%s%s%s' %(project.name, 'scenario', project.scenario), local_infile=1) dbc = db.cursor() tii = time.time() ti = time.time() # Identifying the number of housing units in the disaggregate sample # Make Sure that the file is sorted by hhid dbc.execute('select hhid, serialno from gq_sample order by hhid') gq_sample = numpy.asarray(dbc.fetchall(), numpy.int64) gq_units = dbc.rowcount dbc.execute('select hhid, serialno from hhld_sample order by hhid') hhld_sample = numpy.asarray(dbc.fetchall(), numpy.int64) hhld_units = dbc.rowcount dbc.execute('select hhid, serialno, pnum, personuniqueid from person_sample order by hhid, pnum') person_sample = numpy.asarray(dbc.fetchall(), numpy.int64) housing_sample = numpy.vstack((hhld_sample, gq_sample)) housing_units = gq_units + hhld_units # Identifying the control variables for the households, gq's, and persons hhld_control_variables = project.hhldVars gq_control_variables = project.gqVars person_control_variables = project.personVars # Identifying the number of categories within each control variable for the households, gq's, and persons hhld_dimensions = project.hhldDims gq_dimensions = project.gqDims person_dimensions = project.personDims # Checking marginal totals hhld_marginals = adjusting_sample_joint_distribution.prepare_control_marginals (db, 'hhld', hhld_control_variables, varCorrDict, project.adjControlsDicts.hhld, state, county, tract, bg, project.selVariableDicts.hhldMargsModify) gq_marginals = adjusting_sample_joint_distribution.prepare_control_marginals (db, 'gq', gq_control_variables, varCorrDict, project.adjControlsDicts.gq, state, county, tract, bg) person_marginals = adjusting_sample_joint_distribution.prepare_control_marginals (db, 'person', person_control_variables, varCorrDict, project.adjControlsDicts.person, state, county, tract, bg) print 'Step 1A: Checking if the marginals totals are non-zero and if they are consistent across variables...' print '\tChecking household variables' adjusting_sample_joint_distribution.check_marginals(hhld_marginals, hhld_control_variables) print '\tChecking gq variables' adjusting_sample_joint_distribution.check_marginals(gq_marginals, gq_control_variables) print '\tChecking person variables\n' adjusting_sample_joint_distribution.check_marginals(person_marginals, person_control_variables) print 'Step 1B: Checking if the geography has any housing units to synthesize...\n' adjusting_sample_joint_distribution.check_for_zero_housing_totals(hhld_marginals, gq_marginals) print 'Step 1C: Checking if the geography has any persons to synthesize...\n' adjusting_sample_joint_distribution.check_for_zero_person_totals(person_marginals) # Reading the parameters parameters = project.parameters #______________________________________________________________________ # Running IPF for Households print 'Step 2A: Running IPF procedure for Households... ' hhld_objective_frequency, hhld_estimated_constraint = ipf_nosql.ipf_config_run(db, 'hhld', hhld_control_variables, varCorrDict, project.adjControlsDicts.hhld, hhld_dimensions, state, county, pumano, tract, bg, parameters, project.selVariableDicts.hhldMargsModify) print 'IPF procedure for Households completed in %.2f sec \n'%(time.time()-ti) ti = time.time() # Running IPF for GQ print 'Step 2B: Running IPF procedure for Gqs... ' gq_objective_frequency, gq_estimated_constraint = ipf_nosql.ipf_config_run(db, 'gq', gq_control_variables, varCorrDict, project.adjControlsDicts.gq, gq_dimensions, state, county, pumano, tract, bg, parameters) print 'IPF procedure for GQ was completed in %.2f sec \n'%(time.time()-ti) ti = time.time() # Running IPF for Persons print 'Step 2C: Running IPF procedure for Persons... ' person_objective_frequency, person_estimated_constraint = ipf_nosql.ipf_config_run(db, 'person', person_control_variables, varCorrDict, project.adjControlsDicts.person, person_dimensions, state, county, pumano, tract, bg, parameters) print 'IPF procedure for Persons completed in %.2f sec \n'%(time.time()-ti) ti = time.time() #______________________________________________________________________ # Creating the weights array print 'Step 3: Running IPU procedure for obtaining weights that satisfy Household and Person type constraints... ' dbc.execute('select rowno from sparse_matrix1_%s group by rowno'%(99999)) result = numpy.asarray(dbc.fetchall())[:,0] weightsDef = numpy.ones((1,housing_units), dtype = float)[0] * -99 weightsDef[result]=1 print 'Number of housing units - %s' %housing_units #______________________________________________________________________ # Creating the control array total_constraint = numpy.hstack((hhld_estimated_constraint[:,0], gq_estimated_constraint[:,0], person_estimated_constraint[:,0])) #______________________________________________________________________ # Creating the sparse array dbc.execute('select * from sparse_matrix1_%s' %(99999)) sp_matrix = numpy.asarray(dbc.fetchall()) #______________________________________________________________________ # Running the heuristic algorithm for the required geography weightsDef = numpy.ones((1,housing_units), dtype = float)[0] * -99 weightsDef[result]=1 if project.parameters.ipuProcedure == "ProportionalUpdating": print 'Employing the proportional updating procedure for reallocating sample weights', project.parameters.ipuProcedure iteration, weights, conv_crit_array, wts_array = heuristic_algorithm.heuristic_adjustment(db, 0, index_matrix, weightsDef, total_constraint, sp_matrix, parameters) elif project.parameters.ipuProcedure == 'EntropyUpdating': print 'Employing the entropy-based updating procedure for reallocating sample weights', project.parameters.ipuProcedure iteration, weights, conv_crit_array, wts_array = heuristic_algorithm.ipu_entropy(db, 0, index_matrix, weightsDef, total_constraint, sp_matrix, parameters) """ diff = weights - weights1 f = open('weightsComp.csv', 'w') for i in range(housing_units): f.write('%s,%s,%s\n' %(weights[i], weights1[i], diff[i])) f.close() """ print 'IPU procedure was completed in %.2f sec\n'%(time.time()-ti) ti = time.time() #_________________________________________________________________ print 'Step 4: Creating the synthetic households and individuals...' # creating whole marginal values hhld_order_dummy = adjusting_sample_joint_distribution.create_aggregation_string(hhld_control_variables) hhld_frequencies = drawing_households.create_whole_frequencies(db, 'hhld', hhld_order_dummy, pumano, tract, bg, parameters) gq_order_dummy = adjusting_sample_joint_distribution.create_aggregation_string(gq_control_variables) gq_frequencies = drawing_households.create_whole_frequencies(db, 'gq', gq_order_dummy, pumano, tract, bg, parameters) frequencies = numpy.hstack((hhld_frequencies[:,0], gq_frequencies[:,0])) #______________________________________________________________________ # Sampling Households and choosing the draw with the best match with with the objective distribution ti = time.time() f = open('%s%s%s%spIndexMatrix.pkl'%(project.location, os.path.sep, project.name, os.path.sep), 'rb') p_index_matrix = cPickle.load(f) f.close() hhidRowDict = drawing_households.hhid_row_dictionary(housing_sample) # row in the master matrix - hhid rowHhidDict = drawing_households.row_hhid_dictionary(p_index_matrix) # hhid - row in the person index matrix p_value = 0 max_p = 0 min_chi = 1e10 draw_count = 0 while(p_value < parameters.synPopPTol and draw_count < parameters.synPopDraws): draw_count = draw_count + 1 synthetic_housing_units = drawing_households.drawing_housing_units(db, frequencies, weights, index_matrix, sp_matrix, 0, drawingProcedure=project.parameters.drawingProcedure, iteration=draw_count+1) # Creating synthetic hhld, and person attribute tables synthetic_housing_attributes, synthetic_person_attributes = drawing_households.synthetic_population_properties(db, geo, synthetic_housing_units, p_index_matrix, housing_sample, person_sample, hhidRowDict, rowHhidDict) synth_person_stat, count_person, person_estimated_frequency = drawing_households.checking_against_joint_distribution(person_objective_frequency, synthetic_person_attributes, person_dimensions.prod(), pumano, tract, bg) stat = synth_person_stat dof = count_person - 1 if dof == 0: p_value = 1 else: p_value = scipy.stats.chisqprob(stat, dof) if p_value > max_p or stat < min_chi: max_p = p_value max_p_housing_attributes = synthetic_housing_attributes max_p_person_attributes = synthetic_person_attributes min_chi = stat sp_matrix = None if draw_count >= parameters.synPopDraws: print ('Max Iterations (%d) reached for drawing households with the best draw having a p-value of %.4f' %(parameters.synPopDraws, max_p)) if max_p == 0: max_p = p_value max_p_housing_attributes = synthetic_housing_attributes max_p_person_attributes = synthetic_person_attributes min_chi = stat else: print 'Population with desirable p-value of %.4f was obtained in %d iterations' %(max_p, draw_count) print 'draw_count - %s, pvalue - %s, chi value - %s' %(draw_count, max_p, min_chi) #drawing_households.storing_synthetic_attributes('housing', max_p_housing_attributes, county, tract, bg, project.location, project.name) #drawing_households.storing_synthetic_attributes('person', max_p_person_attributes, county, tract, bg, project.location, project.name) if max_p_housing_attributes.shape[0] < 2500: drawing_households.storing_synthetic_attributes1(db, 'housing', max_p_housing_attributes, county, tract, bg) drawing_households.storing_synthetic_attributes1(db, 'person', max_p_person_attributes, county, tract, bg) else: drawing_households.storing_synthetic_attributes2(db, 'housing', max_p_housing_attributes, county, tract, bg) drawing_households.storing_synthetic_attributes2(db, 'person', max_p_person_attributes, county, tract, bg) values = (int(state), int(county), int(tract), int(bg), min_chi, max_p, draw_count, iteration, conv_crit_array[-1]) drawing_households.store_performance_statistics(db, geo, values) print 'Number of Synthetic Household/Group quarters - %d' %((max_p_housing_attributes[:,-2]).sum()) for i in range(len(hhld_control_variables)): print '%s variable\'s marginal distribution sum is %d' %(hhld_control_variables[i], round(sum(hhld_marginals[i]))) for i in range(len(gq_control_variables)): print '%s variable\'s marginal distribution sum is %d' %(gq_control_variables[i], round(sum(gq_marginals[i]))) print 'Number of Synthetic Persons - %d' %((max_p_person_attributes[:,-2]).sum()) for i in range(len(person_control_variables)): print '%s variable\'s marginal distribution sum is %d' %(person_control_variables[i], round(sum(person_marginals[i]))) print 'Synthetic households created for the geography in %.2f\n' %(time.time()-ti) db.commit() dbc.close() db.close() print 'Blockgroup synthesized in %.4f s' %(time.time()-tii)
def configure_and_run(project, geo, varCorrDict): f = open('%s%s%s%sindexMatrix_99999.pkl'%(project.location, os.path.sep, project.name, os.path.sep), 'rb') index_matrix = cPickle.load(f) f.close() state, county, pumano, tract, bg = geo.state, geo.county, geo.puma5, geo.tract, geo.bg print '------------------------------------------------------------------' print 'Geography: County - %s, PUMA ID- %s, Tract ID- %0.2f, BG ID- %s' \ %(county, pumano, float(tract)/100, bg) print '------------------------------------------------------------------' db = MySQLdb.connect(host = '%s' %project.db.hostname, user = '******' %project.db.username, passwd = '%s' %project.db.password, db = '%s%s%s' %(project.name, 'scenario', project.scenario), local_infile=1) dbc = db.cursor() tii = time.time() ti = time.time() # Identifying the number of housing units in the disaggregate sample # Make Sure that the file is sorted by hhid dbc.execute('select hhid, serialno from gq_sample order by hhid') gq_sample = numpy.asarray(dbc.fetchall(), numpy.int64) gq_units = dbc.rowcount dbc.execute('select hhid, serialno from hhld_sample order by hhid') hhld_sample = numpy.asarray(dbc.fetchall(), numpy.int64) hhld_units = dbc.rowcount dbc.execute('select hhid, serialno, pnum, personuniqueid from person_sample order by hhid, pnum') person_sample = numpy.asarray(dbc.fetchall(), numpy.int64) housing_sample = numpy.vstack((hhld_sample, gq_sample)) housing_units = gq_units + hhld_units # Identifying the control variables for the households, gq's hhld_control_variables = project.hhldVars gq_control_variables = project.gqVars # Identifying the number of categories within each control variable for the households, gq's hhld_dimensions = project.hhldDims gq_dimensions = project.gqDims # Checking marginal totals hhld_marginals = adjusting_sample_joint_distribution.prepare_control_marginals (db, 'hhld', hhld_control_variables, varCorrDict, project.adjControlsDicts.hhld, state, county, tract, bg, project.selVariableDicts.hhldMargsModify) gq_marginals = adjusting_sample_joint_distribution.prepare_control_marginals (db, 'gq', gq_control_variables, varCorrDict, project.adjControlsDicts.gq, state, county, tract, bg) print 'Step 1A: Checking if the marginals totals are non-zero and if they are consistent across variables...' print '\tChecking household variables' adjusting_sample_joint_distribution.check_marginals(hhld_marginals, hhld_control_variables) print '\tChecking gq variables\n' adjusting_sample_joint_distribution.check_marginals(gq_marginals, gq_control_variables) print 'Step 1B: Checking if the geography has any housing units to synthesize...\n' adjusting_sample_joint_distribution.check_for_zero_housing_totals(hhld_marginals, gq_marginals) # Reading the parameters parameters = project.parameters #______________________________________________________________________ # Running IPF for Households print 'Step 2A: Running IPF procedure for Households... ' hhld_objective_frequency, hhld_estimated_constraint = ipf_nosql.ipf_config_run(db, 'hhld', hhld_control_variables, varCorrDict, project.adjControlsDicts.hhld, hhld_dimensions, state, county, pumano, tract, bg, parameters, project.selVariableDicts.hhldMargsModify) print 'IPF procedure for Households completed in %.2f sec \n'%(time.time()-ti) ti = time.time() # Running IPF for GQ print 'Step 2B: Running IPF procedure for Gqs... ' gq_objective_frequency, gq_estimated_constraint = ipf_nosql.ipf_config_run(db, 'gq', gq_control_variables, varCorrDict, project.adjControlsDicts.gq, gq_dimensions, state, county, pumano, tract, bg, parameters) print 'IPF procedure for GQ was completed in %.2f sec \n'%(time.time()-ti) ti = time.time() #______________________________________________________________________ # Creating the weights array print 'Step 3: Running IPU procedure for obtaining weights that satisfy Household constraints... ' dbc.execute('select rowno from sparse_matrix1_%s group by rowno'%(99999)) result = numpy.asarray(dbc.fetchall())[:,0] print 'Number of housing units - %s' %housing_units #______________________________________________________________________ # Creating the control array total_constraint = numpy.hstack((hhld_estimated_constraint[:,0], gq_estimated_constraint[:,0])) #______________________________________________________________________ # Creating the sparse array dbc.execute('select * from sparse_matrix1_%s' %(99999)) sp_matrix = numpy.asarray(dbc.fetchall()) #______________________________________________________________________ # Running the heuristic algorithm for the required geography weightsDef = numpy.ones((1,housing_units), dtype = float)[0] * -99 weightsDef[result]=1 if project.parameters.ipuProcedure == "ProportionalUpdating": print 'Employing the proportional updating procedure for reallocating sample weights', project.parameters.ipuProcedure iteration, weights, conv_crit_array, wts_array = heuristic_algorithm_noper.heuristic_adjustment(db, 0, index_matrix, weightsDef, total_constraint, sp_matrix, parameters) elif project.parameters.ipuProcedure == 'EntropyUpdating': print 'Employing the entropy-based updating procedure for reallocating sample weights', project.parameters.ipuProcedure iteration, weights, conv_crit_array, wts_array = heuristic_algorithm_noper.ipu_entropy(db, 0, index_matrix, weightsDef, total_constraint, sp_matrix, parameters) print 'IPU procedure was completed in %.2f sec\n'%(time.time()-ti) ti = time.time() #_________________________________________________________________ print 'Step 4: Creating the synthetic households and individuals...' # creating whole marginal values hhld_order_dummy = adjusting_sample_joint_distribution.create_aggregation_string(hhld_control_variables) hhld_frequencies = drawing_households.create_whole_frequencies(db, 'hhld', hhld_order_dummy, pumano, tract, bg, parameters) gq_order_dummy = adjusting_sample_joint_distribution.create_aggregation_string(gq_control_variables) gq_frequencies = drawing_households.create_whole_frequencies(db, 'gq', gq_order_dummy, pumano, tract, bg, parameters) frequencies = numpy.hstack((hhld_frequencies[:,0], gq_frequencies[:,0])) housing_objective_frequency = numpy.hstack((hhld_objective_frequency[:,0], gq_objective_frequency[:,0])) #______________________________________________________________________ # Sampling Households and choosing the draw with the best match with with the objective distribution ti = time.time() f = open('%s%s%s%spIndexMatrix.pkl'%(project.location, os.path.sep, project.name, os.path.sep), 'rb') p_index_matrix = cPickle.load(f) f.close() print 'pIndexMatrix in - %.4f' %(time.time()-ti) hhidRowDict = drawing_households.hhid_row_dictionary(housing_sample) # row in the master matrix - hhid rowHhidDict = drawing_households.row_hhid_dictionary(p_index_matrix) # hhid - row in the person index matrix p_value = 0 max_p = 0 min_chi = 1e10 draw_count = 0 while(p_value < parameters.synPopPTol and draw_count < parameters.synPopDraws): draw_count = draw_count + 1 #synthetic_housing_units = drawing_households.drawing_housing_units(db, frequencies, weights, index_matrix, sp_matrix, 0, drawingProcedure=project.parameters.drawingProcedure) synthetic_housing_units = drawing_households.drawing_housing_units(db, frequencies, weights, index_matrix, sp_matrix, 0, drawingProcedure=project.parameters.drawingProcedure, iteration=draw_count+1) # Creating synthetic hhld, and person attribute tables synthetic_housing_attributes, synthetic_person_attributes = drawing_households.synthetic_population_properties(db, geo, synthetic_housing_units, p_index_matrix, housing_sample, person_sample, hhidRowDict, rowHhidDict) synth_housing_stat, count_housing, housing_estimated_frequency = drawing_households.checking_against_joint_distribution(housing_objective_frequency, synthetic_housing_attributes, hhld_dimensions.prod()+gq_dimensions.prod(), pumano, tract, bg) stat = synth_housing_stat dof = count_housing - 1 if dof == 0: p_value = 1 else: p_value = scipy.stats.chisqprob(stat, dof) if p_value > max_p or stat < min_chi: max_p = p_value max_p_housing_attributes = synthetic_housing_attributes max_p_person_attributes = synthetic_person_attributes min_chi = stat sp_matrix = None if draw_count >= parameters.synPopDraws: print ('Max Iterations (%d) reached for drawing households with the best draw having a p-value of %.4f' %(parameters.synPopDraws, max_p)) if max_p == 0: max_p = p_value max_p_housing_attributes = synthetic_housing_attributes max_p_person_attributes = synthetic_person_attributes min_chi = stat else: print 'Population with desirable p-value of %.4f was obtained in %d iterations' %(max_p, draw_count) print 'draw_count - %s, pvalue - %s, chi value - %s' %(draw_count, max_p, min_chi) #drawing_households.storing_synthetic_attributes('housing', max_p_housing_attributes, county, tract, bg, project.location, project.name) #drawing_households.storing_synthetic_attributes('person', max_p_person_attributes, county, tract, bg, project.location, project.name) if max_p_housing_attributes.shape[0] < 2500: drawing_households.storing_synthetic_attributes1(db, 'housing', max_p_housing_attributes, county, tract, bg) drawing_households.storing_synthetic_attributes1(db, 'person', max_p_person_attributes, county, tract, bg) else: drawing_households.storing_synthetic_attributes2(db, 'housing', max_p_housing_attributes, county, tract, bg) drawing_households.storing_synthetic_attributes2(db, 'person', max_p_person_attributes, county, tract, bg) values = (int(state), int(county), int(tract), int(bg), min_chi, max_p, draw_count, iteration, conv_crit_array[-1]) drawing_households.store_performance_statistics(db, geo, values) print 'Number of Synthetic Household/Group quarters - %d' %((max_p_housing_attributes[:,-2]).sum()) for i in range(len(hhld_control_variables)): print '%s variable\'s marginal distribution sum is %d' %(hhld_control_variables[i], round(sum(hhld_marginals[i]))) for i in range(len(gq_control_variables)): print '%s variable\'s marginal distribution sum is %d' %(gq_control_variables[i], round(sum(gq_marginals[i]))) db.commit() dbc.close() db.close() print 'Blockgroup synthesized in %.4f s' %(time.time()-tii)