def configure_and_run(index_matrix, p_index_matrix, geoid):
    """Run the IPF, IPU and household-drawing steps for one geography.

    Args:
        index_matrix: Passed through to the IPU heuristic and to the
            housing-unit drawing routine.
        p_index_matrix: Passed through to the routine that builds the
            synthetic housing/person attribute tables.
        geoid: Indexable whose first three items are converted with ``int``
            and used as the PUMA number, tract and block group.

    The best draw is stored through ``drawing_households`` and the
    transaction is committed; nothing is returned.  The database connection
    is closed even when one of the steps raises.
    """
    pumano = int(geoid[0])
    tract = int(geoid[1])
    bg = int(geoid[2])

    # Stopping rules of the drawing loop.
    p_value_tolerance = 0.9999
    max_draws = 25

    print('------------------------------------------------------------------')
    print('Geography: PUMA ID- %s, Tract ID- %0.2f, BG ID- %s'
          % (pumano, float(tract) / 100, bg))
    print('------------------------------------------------------------------')

    # NOTE(review): connection settings are hard-coded in this revision.
    db = MySQLdb.connect(host='localhost', user='******', passwd='1234', db='ncpopsyn')
    dbc = db.cursor()
    try:
        ti = time.time()

        # Identifying the number of housing units in the disaggregate sample
        # Make sure that the file is sorted by hhid
        dbc.execute('select * from housing_pums')
        housing_pums = numpy.asarray(dbc.fetchall())[:, 1:]
        housing_units = dbc.rowcount

        # Identifying the control variables for the households, gq's, and persons
        hhld_control_variables = adjusting_pums_joint_distribution.choose_control_variables(db, 'hhld')
        gq_control_variables = adjusting_pums_joint_distribution.choose_control_variables(db, 'gq')
        person_control_variables = adjusting_pums_joint_distribution.choose_control_variables(db, 'person')

        # Identifying the number of categories within each control variable
        # for the households, gq's, and persons
        hhld_dimensions = numpy.asarray(
            adjusting_pums_joint_distribution.create_dimensions(db, 'hhld', hhld_control_variables))
        gq_dimensions = numpy.asarray(
            adjusting_pums_joint_distribution.create_dimensions(db, 'gq', gq_control_variables))
        person_dimensions = numpy.asarray(
            adjusting_pums_joint_distribution.create_dimensions(db, 'person', person_control_variables))

        # ______________________________________________________________________
        # Creating the sparse array
        dbc.execute('select * from sparse_matrix1_%s' % (0))
        sp_matrix = numpy.asarray(dbc.fetchall())

        # ______________________________________________________________________
        # Running IPF for Households
        print('Step 1A: Running IPF procedure for Households... ')
        hhld_objective_frequency, hhld_estimated_constraint = ipf.ipf_config_run(
            db, 'hhld', hhld_control_variables, hhld_dimensions, pumano, tract, bg)
        print('IPF procedure for Households completed in %.2f sec \n' % (time.time() - ti))
        ti = time.time()

        # Running IPF for GQ
        print('Step 1B: Running IPF procedure for Gqs... ')
        gq_objective_frequency, gq_estimated_constraint = ipf.ipf_config_run(
            db, 'gq', gq_control_variables, gq_dimensions, pumano, tract, bg)
        print('IPF procedure for GQ was completed in %.2f sec \n' % (time.time() - ti))
        ti = time.time()

        # Running IPF for Persons
        print('Step 1C: Running IPF procedure for Persons... ')
        person_objective_frequency, person_estimated_constraint = ipf.ipf_config_run(
            db, 'person', person_control_variables, person_dimensions, pumano, tract, bg)
        print('IPF procedure for Persons completed in %.2f sec \n' % (time.time() - ti))
        ti = time.time()

        # ______________________________________________________________________
        # Creating the weights array: -99 everywhere, 1 for every row number
        # that appears in the sparse matrix table.
        print('Step 2: Running IPU procedure for obtaining weights that satisfy Household and Person type constraints... ')
        dbc.execute('select rowno from sparse_matrix1_%s group by rowno' % (0))
        result = numpy.asarray(dbc.fetchall())[:, 0]
        weights = numpy.ones((1, housing_units), dtype=float)[0] * -99
        weights[result] = 1

        # ______________________________________________________________________
        # Creating the control array
        total_constraint = numpy.hstack((hhld_estimated_constraint[:, 0],
                                         gq_estimated_constraint[:, 0],
                                         person_estimated_constraint[:, 0]))

        # ______________________________________________________________________
        # Running the heuristic algorithm for the required geography
        weights, conv_crit_array, wts_array = heuristic_algorithm.heuristic_adjustment(
            db, 0, index_matrix, weights, total_constraint, sp_matrix)
        print('IPU procedure was completed in %.2f sec\n' % (time.time() - ti))
        ti = time.time()

        # _________________________________________________________________
        print('Step 3: Creating the synthetic households and individuals...')
        # creating whole marginal values
        hhld_order_dummy = adjusting_pums_joint_distribution.create_aggregation_string(hhld_control_variables)
        hhld_frequencies = drawing_households.create_whole_frequencies(
            db, 'hhld', hhld_order_dummy, pumano, tract, bg)
        gq_order_dummy = adjusting_pums_joint_distribution.create_aggregation_string(gq_control_variables)
        gq_frequencies = drawing_households.create_whole_frequencies(
            db, 'gq', gq_order_dummy, pumano, tract, bg)
        frequencies = numpy.hstack((hhld_frequencies[:, 0], gq_frequencies[:, 0]))

        # ______________________________________________________________________
        # Sampling households and choosing the draw with the best match with
        # the objective distribution
        dbc.execute('select * from person_pums')
        person_pums = numpy.asarray(dbc.fetchall())[:, 1:]

        p_value = 0
        max_p = 0
        min_chi = 1e10
        draw_count = 0
        while p_value < p_value_tolerance and draw_count < max_draws:
            draw_count = draw_count + 1
            synthetic_housing_units = drawing_households.drawing_housing_units(
                db, frequencies, weights, index_matrix, sp_matrix, 0)

            # Creating synthetic hhld, and person attribute tables
            synthetic_housing_attributes, synthetic_person_attributes = \
                drawing_households.synthetic_population_properties(
                    db, synthetic_housing_units, p_index_matrix, housing_pums, person_pums)

            synth_person_stat, count_person, person_estimated_frequency = \
                drawing_households.checking_against_joint_distribution(
                    person_objective_frequency, synthetic_person_attributes,
                    person_dimensions, pumano, tract, bg)
            stat = synth_person_stat
            dof = count_person - 1

            # With zero degrees of freedom the chi-square probability is not
            # defined; accept the draw outright.
            if dof == 0:
                p_value = 1
            else:
                # chi2.sf is the supported replacement for the removed
                # scipy.stats.chisqprob helper.
                p_value = scipy.stats.chi2.sf(stat, dof)

            # Keep the draw when it improves either the p-value or the statistic.
            if p_value > max_p or stat < min_chi:
                max_p = p_value
                max_p_housing_attributes = synthetic_housing_attributes
                max_p_person_attributes = synthetic_person_attributes
                min_chi = stat

        if draw_count >= max_draws:
            print('Max Iterations reached for drawing households with the best draw having a p-value of %.4f'
                  % (max_p))
        else:
            print('Population with desirable p-value of %.4f was obtained in %d iterations'
                  % (max_p, draw_count))

        drawing_households.storing_synthetic_attributes(
            db, 'housing', max_p_housing_attributes, pumano, tract, bg)
        drawing_households.storing_synthetic_attributes(
            db, 'person', max_p_person_attributes, pumano, tract, bg)

        # pumano, tract and bg were coerced with int() above, so the
        # interpolated values are plain integers.
        dbc.execute('select hhtotal from housing_marginals where pumano = %s and tract = %s and bg = %s'
                    % (pumano, tract, bg))
        housingtotal = dbc.fetchall()[0][0]

        dbc.execute('select sum(gender1 + gender2) from person_marginals where pumano = %s and tract = %s and bg = %s'
                    % (pumano, tract, bg))
        persontotal = dbc.fetchall()[0][0]

        print('Number of Synthetic Household - %d, and given Household total from the Census SF - %d'
              % (sum(max_p_housing_attributes[:, -2]), housingtotal))
        print('Number of Synthetic Persons - %d and given Person total from the Census SF - %d'
              % (sum(max_p_person_attributes[:, -1]), persontotal))
        print('Synthetic households created for the geography in %.2f\n' % (time.time() - ti))

        db.commit()
    finally:
        # Release the cursor and connection whether or not a step failed.
        dbc.close()
        db.close()
def configure_and_run(project, geo, varCorrDict):
    """Synthesize households, group quarters and persons for one geography.

    The function loads the pickled index matrices from the project folder,
    reads the sample tables, checks the control marginals, runs IPF for
    households, group quarters and persons, runs the configured IPU
    procedure, repeatedly draws a synthetic population until the p-value
    tolerance or the maximum number of draws is reached, stores the retained
    draw and its performance statistics, and commits.

    Args:
        project: Project object.  This function reads ``location``, ``name``,
            ``scenario``, ``db.hostname``/``db.username``/``db.password``,
            ``hhldVars``/``gqVars``/``personVars``,
            ``hhldDims``/``gqDims``/``personDims``, ``adjControlsDicts``,
            ``selVariableDicts.hhldMargsModify`` and ``parameters``.
        geo: Geography object providing ``state``, ``county``, ``puma5``,
            ``tract`` and ``bg``.
        varCorrDict: Passed through to the marginal-preparation and IPF
            routines.

    Raises:
        ValueError: If ``project.parameters.ipuProcedure`` is neither
            ``'ProportionalUpdating'`` nor ``'EntropyUpdating'``.
    """
    # NOTE(review): unpickling can execute arbitrary code; these files are
    # expected to be written by the project itself -- confirm they never come
    # from an untrusted source.
    index_matrix_path = '%s%s%s%sindexMatrix_99999.pkl' % (
        project.location, os.path.sep, project.name, os.path.sep)
    with open(index_matrix_path, 'rb') as f:
        index_matrix = cPickle.load(f)

    state, county, pumano, tract, bg = geo.state, geo.county, geo.puma5, geo.tract, geo.bg

    print('------------------------------------------------------------------')
    print('Geography: County - %s, PUMA ID- %s, Tract ID- %0.2f, BG ID- %s'
          % (county, pumano, float(tract) / 100, bg))
    print('------------------------------------------------------------------')

    # The user name needs a '%s' specifier like the other connection fields;
    # without one the formatting raises TypeError before connecting.
    db = MySQLdb.connect(host='%s' % project.db.hostname,
                         user='%s' % project.db.username,
                         passwd='%s' % project.db.password,
                         db='%s%s%s' % (project.name, 'scenario', project.scenario),
                         local_infile=1)
    dbc = db.cursor()
    tii = time.time()
    try:
        ti = time.time()

        # Identifying the number of housing units in the disaggregate sample
        # Make sure that the file is sorted by hhid
        dbc.execute('select hhid, serialno from gq_sample order by hhid')
        gq_sample = numpy.asarray(dbc.fetchall(), numpy.int64)
        gq_units = dbc.rowcount

        dbc.execute('select hhid, serialno from hhld_sample order by hhid')
        hhld_sample = numpy.asarray(dbc.fetchall(), numpy.int64)
        hhld_units = dbc.rowcount

        dbc.execute('select hhid, serialno, pnum, personuniqueid from person_sample order by hhid, pnum')
        person_sample = numpy.asarray(dbc.fetchall(), numpy.int64)

        housing_sample = numpy.vstack((hhld_sample, gq_sample))
        housing_units = gq_units + hhld_units

        # Identifying the control variables for the households, gq's, and persons
        hhld_control_variables = project.hhldVars
        gq_control_variables = project.gqVars
        person_control_variables = project.personVars

        # Identifying the number of categories within each control variable
        # for the households, gq's, and persons
        hhld_dimensions = project.hhldDims
        gq_dimensions = project.gqDims
        person_dimensions = project.personDims

        # Checking marginal totals
        hhld_marginals = adjusting_sample_joint_distribution.prepare_control_marginals(
            db, 'hhld', hhld_control_variables, varCorrDict,
            project.adjControlsDicts.hhld, state, county, tract, bg,
            project.selVariableDicts.hhldMargsModify)
        gq_marginals = adjusting_sample_joint_distribution.prepare_control_marginals(
            db, 'gq', gq_control_variables, varCorrDict,
            project.adjControlsDicts.gq, state, county, tract, bg)
        person_marginals = adjusting_sample_joint_distribution.prepare_control_marginals(
            db, 'person', person_control_variables, varCorrDict,
            project.adjControlsDicts.person, state, county, tract, bg)

        print('Step 1A: Checking if the marginals totals are non-zero and if they are consistent across variables...')
        print('\tChecking household variables')
        adjusting_sample_joint_distribution.check_marginals(hhld_marginals, hhld_control_variables)
        print('\tChecking gq variables')
        adjusting_sample_joint_distribution.check_marginals(gq_marginals, gq_control_variables)
        print('\tChecking person variables\n')
        adjusting_sample_joint_distribution.check_marginals(person_marginals, person_control_variables)

        print('Step 1B: Checking if the geography has any housing units to synthesize...\n')
        adjusting_sample_joint_distribution.check_for_zero_housing_totals(hhld_marginals, gq_marginals)

        print('Step 1C: Checking if the geography has any persons to synthesize...\n')
        adjusting_sample_joint_distribution.check_for_zero_person_totals(person_marginals)

        # Reading the parameters
        parameters = project.parameters

        # ______________________________________________________________________
        # Running IPF for Households
        print('Step 2A: Running IPF procedure for Households... ')
        hhld_objective_frequency, hhld_estimated_constraint = ipf_nosql.ipf_config_run(
            db, 'hhld', hhld_control_variables, varCorrDict,
            project.adjControlsDicts.hhld, hhld_dimensions,
            state, county, pumano, tract, bg, parameters,
            project.selVariableDicts.hhldMargsModify)
        print('IPF procedure for Households completed in %.2f sec \n' % (time.time() - ti))
        ti = time.time()

        # Running IPF for GQ
        print('Step 2B: Running IPF procedure for Gqs... ')
        gq_objective_frequency, gq_estimated_constraint = ipf_nosql.ipf_config_run(
            db, 'gq', gq_control_variables, varCorrDict,
            project.adjControlsDicts.gq, gq_dimensions,
            state, county, pumano, tract, bg, parameters)
        print('IPF procedure for GQ was completed in %.2f sec \n' % (time.time() - ti))
        ti = time.time()

        # Running IPF for Persons
        print('Step 2C: Running IPF procedure for Persons... ')
        person_objective_frequency, person_estimated_constraint = ipf_nosql.ipf_config_run(
            db, 'person', person_control_variables, varCorrDict,
            project.adjControlsDicts.person, person_dimensions,
            state, county, pumano, tract, bg, parameters)
        print('IPF procedure for Persons completed in %.2f sec \n' % (time.time() - ti))
        ti = time.time()

        # ______________________________________________________________________
        # Creating the weights array: -99 everywhere, 1 for every row number
        # that appears in the sparse matrix table.
        print('Step 3: Running IPU procedure for obtaining weights that satisfy Household and Person type constraints... ')
        dbc.execute('select rowno from sparse_matrix1_%s group by rowno' % (99999))
        result = numpy.asarray(dbc.fetchall())[:, 0]
        weightsDef = numpy.ones((1, housing_units), dtype=float)[0] * -99
        weightsDef[result] = 1

        print('Number of housing units - %s' % housing_units)

        # ______________________________________________________________________
        # Creating the control array
        total_constraint = numpy.hstack((hhld_estimated_constraint[:, 0],
                                         gq_estimated_constraint[:, 0],
                                         person_estimated_constraint[:, 0]))

        # ______________________________________________________________________
        # Creating the sparse array
        dbc.execute('select * from sparse_matrix1_%s' % (99999))
        sp_matrix = numpy.asarray(dbc.fetchall())

        # ______________________________________________________________________
        # Running the heuristic algorithm for the required geography
        ipu_procedure = project.parameters.ipuProcedure
        if ipu_procedure == "ProportionalUpdating":
            print('Employing the proportional updating procedure for reallocating sample weights %s'
                  % ipu_procedure)
            iteration, weights, conv_crit_array, wts_array = heuristic_algorithm.heuristic_adjustment(
                db, 0, index_matrix, weightsDef, total_constraint, sp_matrix, parameters)
        elif ipu_procedure == 'EntropyUpdating':
            print('Employing the entropy-based updating procedure for reallocating sample weights %s'
                  % ipu_procedure)
            iteration, weights, conv_crit_array, wts_array = heuristic_algorithm.ipu_entropy(
                db, 0, index_matrix, weightsDef, total_constraint, sp_matrix, parameters)
        else:
            # Fail here instead of with a NameError on `weights` further down.
            raise ValueError('Unsupported IPU procedure: %s' % ipu_procedure)

        print('IPU procedure was completed in %.2f sec\n' % (time.time() - ti))
        ti = time.time()

        # _________________________________________________________________
        print('Step 4: Creating the synthetic households and individuals...')
        # creating whole marginal values
        hhld_order_dummy = adjusting_sample_joint_distribution.create_aggregation_string(hhld_control_variables)
        hhld_frequencies = drawing_households.create_whole_frequencies(
            db, 'hhld', hhld_order_dummy, pumano, tract, bg, parameters)
        gq_order_dummy = adjusting_sample_joint_distribution.create_aggregation_string(gq_control_variables)
        gq_frequencies = drawing_households.create_whole_frequencies(
            db, 'gq', gq_order_dummy, pumano, tract, bg, parameters)
        frequencies = numpy.hstack((hhld_frequencies[:, 0], gq_frequencies[:, 0]))

        # ______________________________________________________________________
        # Sampling households and choosing the draw with the best match with
        # the objective distribution
        ti = time.time()

        p_index_matrix_path = '%s%s%s%spIndexMatrix.pkl' % (
            project.location, os.path.sep, project.name, os.path.sep)
        with open(p_index_matrix_path, 'rb') as f:
            p_index_matrix = cPickle.load(f)

        hhidRowDict = drawing_households.hhid_row_dictionary(housing_sample)  # row in the master matrix - hhid
        rowHhidDict = drawing_households.row_hhid_dictionary(p_index_matrix)  # hhid - row in the person index matrix

        p_value = 0
        max_p = 0
        min_chi = 1e10
        draw_count = 0
        while p_value < parameters.synPopPTol and draw_count < parameters.synPopDraws:
            draw_count = draw_count + 1
            synthetic_housing_units = drawing_households.drawing_housing_units(
                db, frequencies, weights, index_matrix, sp_matrix, 0,
                drawingProcedure=project.parameters.drawingProcedure,
                iteration=draw_count + 1)

            # Creating synthetic hhld, and person attribute tables
            synthetic_housing_attributes, synthetic_person_attributes = \
                drawing_households.synthetic_population_properties(
                    db, geo, synthetic_housing_units, p_index_matrix,
                    housing_sample, person_sample, hhidRowDict, rowHhidDict)

            synth_person_stat, count_person, person_estimated_frequency = \
                drawing_households.checking_against_joint_distribution(
                    person_objective_frequency, synthetic_person_attributes,
                    person_dimensions.prod(), pumano, tract, bg)
            stat = synth_person_stat
            dof = count_person - 1

            if dof == 0:
                p_value = 1
            else:
                # chi2.sf is the supported replacement for the removed
                # scipy.stats.chisqprob helper.
                p_value = scipy.stats.chi2.sf(stat, dof)

            # Keep the draw when it improves either the p-value or the statistic.
            if p_value > max_p or stat < min_chi:
                max_p = p_value
                max_p_housing_attributes = synthetic_housing_attributes
                max_p_person_attributes = synthetic_person_attributes
                min_chi = stat

        # Drop the reference to the sparse matrix; it is not used below.
        sp_matrix = None

        if draw_count >= parameters.synPopDraws:
            print('Max Iterations (%d) reached for drawing households with the best draw having a p-value of %.4f'
                  % (parameters.synPopDraws, max_p))
            # No draw reached a non-zero p-value: report and store the last draw.
            if max_p == 0:
                max_p = p_value
                max_p_housing_attributes = synthetic_housing_attributes
                max_p_person_attributes = synthetic_person_attributes
                min_chi = stat
        else:
            print('Population with desirable p-value of %.4f was obtained in %d iterations'
                  % (max_p, draw_count))

        print('draw_count - %s, pvalue - %s, chi value - %s' % (draw_count, max_p, min_chi))

        # Two storing routines are selected on the number of rows in the
        # retained housing table.
        if max_p_housing_attributes.shape[0] < 2500:
            drawing_households.storing_synthetic_attributes1(
                db, 'housing', max_p_housing_attributes, county, tract, bg)
            drawing_households.storing_synthetic_attributes1(
                db, 'person', max_p_person_attributes, county, tract, bg)
        else:
            drawing_households.storing_synthetic_attributes2(
                db, 'housing', max_p_housing_attributes, county, tract, bg)
            drawing_households.storing_synthetic_attributes2(
                db, 'person', max_p_person_attributes, county, tract, bg)

        values = (int(state), int(county), int(tract), int(bg),
                  min_chi, max_p, draw_count, iteration, conv_crit_array[-1])
        drawing_households.store_performance_statistics(db, geo, values)

        print('Number of Synthetic Household/Group quarters - %d'
              % ((max_p_housing_attributes[:, -2]).sum()))
        for i, variable in enumerate(hhld_control_variables):
            print('%s variable\'s marginal distribution sum is %d'
                  % (variable, round(sum(hhld_marginals[i]))))
        for i, variable in enumerate(gq_control_variables):
            print('%s variable\'s marginal distribution sum is %d'
                  % (variable, round(sum(gq_marginals[i]))))

        print('Number of Synthetic Persons - %d' % ((max_p_person_attributes[:, -2]).sum()))
        for i, variable in enumerate(person_control_variables):
            print('%s variable\'s marginal distribution sum is %d'
                  % (variable, round(sum(person_marginals[i]))))

        print('Synthetic households created for the geography in %.2f\n' % (time.time() - ti))
        db.commit()
    finally:
        # Release the cursor and connection whether or not a step failed.
        dbc.close()
        db.close()

    print('Blockgroup synthesized in %.4f s' % (time.time() - tii))
def configure_and_run(index_matrix, p_index_matrix, geoid):
    """Run the IPF, IPU and household-drawing steps for one geography.

    Args:
        index_matrix: Passed through to the IPU heuristic and to the
            housing-unit drawing routine.
        p_index_matrix: Passed through to the routine that builds the
            synthetic housing/person attribute tables.
        geoid: Indexable whose first three items are converted with ``int``
            and used as the PUMA number, tract and block group.

    The best draw is stored through ``drawing_households`` and the
    transaction is committed; nothing is returned.  The database connection
    is closed even when one of the steps raises.
    """
    pumano = int(geoid[0])
    tract = int(geoid[1])
    bg = int(geoid[2])

    # Stopping rules of the drawing loop.
    p_value_tolerance = 0.9999
    max_draws = 25

    print('------------------------------------------------------------------')
    print('Geography: PUMA ID- %s, Tract ID- %0.2f, BG ID- %s'
          % (pumano, float(tract) / 100, bg))
    print('------------------------------------------------------------------')

    # NOTE(review): connection settings are hard-coded in this revision.
    db = MySQLdb.connect(host='localhost',
                         user='******',
                         passwd='1234',
                         db='ncpopsyn')
    dbc = db.cursor()
    try:
        ti = time.time()

        # Identifying the number of housing units in the disaggregate sample
        # Make sure that the file is sorted by hhid
        dbc.execute('select * from housing_pums')
        housing_pums = numpy.asarray(dbc.fetchall())[:, 1:]
        housing_units = dbc.rowcount

        # Identifying the control variables for the households, gq's, and persons
        hhld_control_variables = adjusting_pums_joint_distribution.choose_control_variables(
            db, 'hhld')
        gq_control_variables = adjusting_pums_joint_distribution.choose_control_variables(
            db, 'gq')
        person_control_variables = adjusting_pums_joint_distribution.choose_control_variables(
            db, 'person')

        # Identifying the number of categories within each control variable
        # for the households, gq's, and persons
        hhld_dimensions = numpy.asarray(
            adjusting_pums_joint_distribution.create_dimensions(
                db, 'hhld', hhld_control_variables))
        gq_dimensions = numpy.asarray(
            adjusting_pums_joint_distribution.create_dimensions(
                db, 'gq', gq_control_variables))
        person_dimensions = numpy.asarray(
            adjusting_pums_joint_distribution.create_dimensions(
                db, 'person', person_control_variables))

        # ______________________________________________________________________
        # Creating the sparse array
        dbc.execute('select * from sparse_matrix1_%s' % (0))
        sp_matrix = numpy.asarray(dbc.fetchall())

        # ______________________________________________________________________
        # Running IPF for Households
        print('Step 1A: Running IPF procedure for Households... ')
        hhld_objective_frequency, hhld_estimated_constraint = ipf.ipf_config_run(
            db, 'hhld', hhld_control_variables, hhld_dimensions, pumano, tract, bg)
        print('IPF procedure for Households completed in %.2f sec \n'
              % (time.time() - ti))
        ti = time.time()

        # Running IPF for GQ
        print('Step 1B: Running IPF procedure for Gqs... ')
        gq_objective_frequency, gq_estimated_constraint = ipf.ipf_config_run(
            db, 'gq', gq_control_variables, gq_dimensions, pumano, tract, bg)
        print('IPF procedure for GQ was completed in %.2f sec \n'
              % (time.time() - ti))
        ti = time.time()

        # Running IPF for Persons
        print('Step 1C: Running IPF procedure for Persons... ')
        person_objective_frequency, person_estimated_constraint = ipf.ipf_config_run(
            db, 'person', person_control_variables, person_dimensions,
            pumano, tract, bg)
        print('IPF procedure for Persons completed in %.2f sec \n'
              % (time.time() - ti))
        ti = time.time()

        # ______________________________________________________________________
        # Creating the weights array: -99 everywhere, 1 for every row number
        # that appears in the sparse matrix table.
        print('Step 2: Running IPU procedure for obtaining weights that satisfy Household and Person type constraints... ')
        dbc.execute('select rowno from sparse_matrix1_%s group by rowno' % (0))
        result = numpy.asarray(dbc.fetchall())[:, 0]
        weights = numpy.ones((1, housing_units), dtype=float)[0] * -99
        weights[result] = 1

        # ______________________________________________________________________
        # Creating the control array
        total_constraint = numpy.hstack(
            (hhld_estimated_constraint[:, 0], gq_estimated_constraint[:, 0],
             person_estimated_constraint[:, 0]))

        # ______________________________________________________________________
        # Running the heuristic algorithm for the required geography
        weights, conv_crit_array, wts_array = heuristic_algorithm.heuristic_adjustment(
            db, 0, index_matrix, weights, total_constraint, sp_matrix)
        print('IPU procedure was completed in %.2f sec\n' % (time.time() - ti))
        ti = time.time()

        # _________________________________________________________________
        print('Step 3: Creating the synthetic households and individuals...')
        # creating whole marginal values
        hhld_order_dummy = adjusting_pums_joint_distribution.create_aggregation_string(
            hhld_control_variables)
        hhld_frequencies = drawing_households.create_whole_frequencies(
            db, 'hhld', hhld_order_dummy, pumano, tract, bg)
        gq_order_dummy = adjusting_pums_joint_distribution.create_aggregation_string(
            gq_control_variables)
        gq_frequencies = drawing_households.create_whole_frequencies(
            db, 'gq', gq_order_dummy, pumano, tract, bg)
        frequencies = numpy.hstack((hhld_frequencies[:, 0], gq_frequencies[:, 0]))

        # ______________________________________________________________________
        # Sampling households and choosing the draw with the best match with
        # the objective distribution
        dbc.execute('select * from person_pums')
        person_pums = numpy.asarray(dbc.fetchall())[:, 1:]

        p_value = 0
        max_p = 0
        min_chi = 1e10
        draw_count = 0
        while p_value < p_value_tolerance and draw_count < max_draws:
            draw_count = draw_count + 1
            synthetic_housing_units = drawing_households.drawing_housing_units(
                db, frequencies, weights, index_matrix, sp_matrix, 0)

            # Creating synthetic hhld, and person attribute tables
            synthetic_housing_attributes, synthetic_person_attributes = \
                drawing_households.synthetic_population_properties(
                    db, synthetic_housing_units, p_index_matrix,
                    housing_pums, person_pums)

            synth_person_stat, count_person, person_estimated_frequency = \
                drawing_households.checking_against_joint_distribution(
                    person_objective_frequency, synthetic_person_attributes,
                    person_dimensions, pumano, tract, bg)
            stat = synth_person_stat
            dof = count_person - 1

            # With zero degrees of freedom the chi-square probability is not
            # defined; accept the draw outright.
            if dof == 0:
                p_value = 1
            else:
                # chi2.sf is the supported replacement for the removed
                # scipy.stats.chisqprob helper.
                p_value = scipy.stats.chi2.sf(stat, dof)

            # Keep the draw when it improves either the p-value or the statistic.
            if p_value > max_p or stat < min_chi:
                max_p = p_value
                max_p_housing_attributes = synthetic_housing_attributes
                max_p_person_attributes = synthetic_person_attributes
                min_chi = stat

        if draw_count >= max_draws:
            print('Max Iterations reached for drawing households with the best draw having a p-value of %.4f'
                  % (max_p))
        else:
            print('Population with desirable p-value of %.4f was obtained in %d iterations'
                  % (max_p, draw_count))

        drawing_households.storing_synthetic_attributes(
            db, 'housing', max_p_housing_attributes, pumano, tract, bg)
        drawing_households.storing_synthetic_attributes(
            db, 'person', max_p_person_attributes, pumano, tract, bg)

        # pumano, tract and bg were coerced with int() above, so the
        # interpolated values are plain integers.
        dbc.execute(
            'select hhtotal from housing_marginals where pumano = %s and tract = %s and bg = %s'
            % (pumano, tract, bg))
        housingtotal = dbc.fetchall()[0][0]

        dbc.execute(
            'select sum(gender1 + gender2) from person_marginals where pumano = %s and tract = %s and bg = %s'
            % (pumano, tract, bg))
        persontotal = dbc.fetchall()[0][0]

        print('Number of Synthetic Household - %d, and given Household total from the Census SF - %d'
              % (sum(max_p_housing_attributes[:, -2]), housingtotal))
        print('Number of Synthetic Persons - %d and given Person total from the Census SF - %d'
              % (sum(max_p_person_attributes[:, -1]), persontotal))
        print('Synthetic households created for the geography in %.2f\n'
              % (time.time() - ti))

        db.commit()
    finally:
        # Release the cursor and connection whether or not a step failed.
        dbc.close()
        db.close()
def configure_and_run(project, geo, varCorrDict):
    """Synthesize households and group quarters for one geography.

    This variant controls only for household and group-quarter variables:
    IPF is run for those two groups, the IPU step uses
    ``heuristic_algorithm_noper``, and each draw is checked against the
    combined household/group-quarter objective frequencies.  The retained
    draw and its performance statistics are stored and committed.

    Args:
        project: Project object.  This function reads ``location``, ``name``,
            ``scenario``, ``db.hostname``/``db.username``/``db.password``,
            ``hhldVars``/``gqVars``, ``hhldDims``/``gqDims``,
            ``adjControlsDicts``, ``selVariableDicts.hhldMargsModify`` and
            ``parameters``.
        geo: Geography object providing ``state``, ``county``, ``puma5``,
            ``tract`` and ``bg``.
        varCorrDict: Passed through to the marginal-preparation and IPF
            routines.

    Raises:
        ValueError: If ``project.parameters.ipuProcedure`` is neither
            ``'ProportionalUpdating'`` nor ``'EntropyUpdating'``.
    """
    # NOTE(review): unpickling can execute arbitrary code; these files are
    # expected to be written by the project itself -- confirm they never come
    # from an untrusted source.
    index_matrix_path = '%s%s%s%sindexMatrix_99999.pkl' % (
        project.location, os.path.sep, project.name, os.path.sep)
    with open(index_matrix_path, 'rb') as f:
        index_matrix = cPickle.load(f)

    state, county, pumano, tract, bg = geo.state, geo.county, geo.puma5, geo.tract, geo.bg

    print('------------------------------------------------------------------')
    print('Geography: County - %s, PUMA ID- %s, Tract ID- %0.2f, BG ID- %s'
          % (county, pumano, float(tract) / 100, bg))
    print('------------------------------------------------------------------')

    # The user name needs a '%s' specifier like the other connection fields;
    # without one the formatting raises TypeError before connecting.
    db = MySQLdb.connect(host='%s' % project.db.hostname,
                         user='%s' % project.db.username,
                         passwd='%s' % project.db.password,
                         db='%s%s%s' % (project.name, 'scenario', project.scenario),
                         local_infile=1)
    dbc = db.cursor()
    tii = time.time()
    try:
        ti = time.time()

        # Identifying the number of housing units in the disaggregate sample
        # Make sure that the file is sorted by hhid
        dbc.execute('select hhid, serialno from gq_sample order by hhid')
        gq_sample = numpy.asarray(dbc.fetchall(), numpy.int64)
        gq_units = dbc.rowcount

        dbc.execute('select hhid, serialno from hhld_sample order by hhid')
        hhld_sample = numpy.asarray(dbc.fetchall(), numpy.int64)
        hhld_units = dbc.rowcount

        dbc.execute('select hhid, serialno, pnum, personuniqueid from person_sample order by hhid, pnum')
        person_sample = numpy.asarray(dbc.fetchall(), numpy.int64)

        housing_sample = numpy.vstack((hhld_sample, gq_sample))
        housing_units = gq_units + hhld_units

        # Identifying the control variables for the households, gq's
        hhld_control_variables = project.hhldVars
        gq_control_variables = project.gqVars

        # Identifying the number of categories within each control variable
        # for the households, gq's
        hhld_dimensions = project.hhldDims
        gq_dimensions = project.gqDims

        # Checking marginal totals
        hhld_marginals = adjusting_sample_joint_distribution.prepare_control_marginals(
            db, 'hhld', hhld_control_variables, varCorrDict,
            project.adjControlsDicts.hhld, state, county, tract, bg,
            project.selVariableDicts.hhldMargsModify)
        gq_marginals = adjusting_sample_joint_distribution.prepare_control_marginals(
            db, 'gq', gq_control_variables, varCorrDict,
            project.adjControlsDicts.gq, state, county, tract, bg)

        print('Step 1A: Checking if the marginals totals are non-zero and if they are consistent across variables...')
        print('\tChecking household variables')
        adjusting_sample_joint_distribution.check_marginals(hhld_marginals, hhld_control_variables)
        print('\tChecking gq variables\n')
        adjusting_sample_joint_distribution.check_marginals(gq_marginals, gq_control_variables)

        print('Step 1B: Checking if the geography has any housing units to synthesize...\n')
        adjusting_sample_joint_distribution.check_for_zero_housing_totals(hhld_marginals, gq_marginals)

        # Reading the parameters
        parameters = project.parameters

        # ______________________________________________________________________
        # Running IPF for Households
        print('Step 2A: Running IPF procedure for Households... ')
        hhld_objective_frequency, hhld_estimated_constraint = ipf_nosql.ipf_config_run(
            db, 'hhld', hhld_control_variables, varCorrDict,
            project.adjControlsDicts.hhld, hhld_dimensions,
            state, county, pumano, tract, bg, parameters,
            project.selVariableDicts.hhldMargsModify)
        print('IPF procedure for Households completed in %.2f sec \n' % (time.time() - ti))
        ti = time.time()

        # Running IPF for GQ
        print('Step 2B: Running IPF procedure for Gqs... ')
        gq_objective_frequency, gq_estimated_constraint = ipf_nosql.ipf_config_run(
            db, 'gq', gq_control_variables, varCorrDict,
            project.adjControlsDicts.gq, gq_dimensions,
            state, county, pumano, tract, bg, parameters)
        print('IPF procedure for GQ was completed in %.2f sec \n' % (time.time() - ti))
        ti = time.time()

        # ______________________________________________________________________
        # Creating the weights array
        print('Step 3: Running IPU procedure for obtaining weights that satisfy Household constraints... ')
        dbc.execute('select rowno from sparse_matrix1_%s group by rowno' % (99999))
        result = numpy.asarray(dbc.fetchall())[:, 0]

        print('Number of housing units - %s' % housing_units)

        # ______________________________________________________________________
        # Creating the control array
        total_constraint = numpy.hstack((hhld_estimated_constraint[:, 0],
                                         gq_estimated_constraint[:, 0]))

        # ______________________________________________________________________
        # Creating the sparse array
        dbc.execute('select * from sparse_matrix1_%s' % (99999))
        sp_matrix = numpy.asarray(dbc.fetchall())

        # ______________________________________________________________________
        # Running the heuristic algorithm for the required geography.
        # Default weights: -99 everywhere, 1 for every row number that appears
        # in the sparse matrix table.
        weightsDef = numpy.ones((1, housing_units), dtype=float)[0] * -99
        weightsDef[result] = 1

        ipu_procedure = project.parameters.ipuProcedure
        if ipu_procedure == "ProportionalUpdating":
            print('Employing the proportional updating procedure for reallocating sample weights %s'
                  % ipu_procedure)
            iteration, weights, conv_crit_array, wts_array = heuristic_algorithm_noper.heuristic_adjustment(
                db, 0, index_matrix, weightsDef, total_constraint, sp_matrix, parameters)
        elif ipu_procedure == 'EntropyUpdating':
            print('Employing the entropy-based updating procedure for reallocating sample weights %s'
                  % ipu_procedure)
            iteration, weights, conv_crit_array, wts_array = heuristic_algorithm_noper.ipu_entropy(
                db, 0, index_matrix, weightsDef, total_constraint, sp_matrix, parameters)
        else:
            # Fail here instead of with a NameError on `weights` further down.
            raise ValueError('Unsupported IPU procedure: %s' % ipu_procedure)

        print('IPU procedure was completed in %.2f sec\n' % (time.time() - ti))
        ti = time.time()

        # _________________________________________________________________
        print('Step 4: Creating the synthetic households and individuals...')
        # creating whole marginal values
        hhld_order_dummy = adjusting_sample_joint_distribution.create_aggregation_string(hhld_control_variables)
        hhld_frequencies = drawing_households.create_whole_frequencies(
            db, 'hhld', hhld_order_dummy, pumano, tract, bg, parameters)
        gq_order_dummy = adjusting_sample_joint_distribution.create_aggregation_string(gq_control_variables)
        gq_frequencies = drawing_households.create_whole_frequencies(
            db, 'gq', gq_order_dummy, pumano, tract, bg, parameters)
        frequencies = numpy.hstack((hhld_frequencies[:, 0], gq_frequencies[:, 0]))

        housing_objective_frequency = numpy.hstack((hhld_objective_frequency[:, 0],
                                                    gq_objective_frequency[:, 0]))

        # ______________________________________________________________________
        # Sampling households and choosing the draw with the best match with
        # the objective distribution
        ti = time.time()

        p_index_matrix_path = '%s%s%s%spIndexMatrix.pkl' % (
            project.location, os.path.sep, project.name, os.path.sep)
        with open(p_index_matrix_path, 'rb') as f:
            p_index_matrix = cPickle.load(f)
        print('pIndexMatrix in - %.4f' % (time.time() - ti))

        hhidRowDict = drawing_households.hhid_row_dictionary(housing_sample)  # row in the master matrix - hhid
        rowHhidDict = drawing_households.row_hhid_dictionary(p_index_matrix)  # hhid - row in the person index matrix

        p_value = 0
        max_p = 0
        min_chi = 1e10
        draw_count = 0
        while p_value < parameters.synPopPTol and draw_count < parameters.synPopDraws:
            draw_count = draw_count + 1
            synthetic_housing_units = drawing_households.drawing_housing_units(
                db, frequencies, weights, index_matrix, sp_matrix, 0,
                drawingProcedure=project.parameters.drawingProcedure,
                iteration=draw_count + 1)

            # Creating synthetic hhld, and person attribute tables
            synthetic_housing_attributes, synthetic_person_attributes = \
                drawing_households.synthetic_population_properties(
                    db, geo, synthetic_housing_units, p_index_matrix,
                    housing_sample, person_sample, hhidRowDict, rowHhidDict)

            synth_housing_stat, count_housing, housing_estimated_frequency = \
                drawing_households.checking_against_joint_distribution(
                    housing_objective_frequency, synthetic_housing_attributes,
                    hhld_dimensions.prod() + gq_dimensions.prod(),
                    pumano, tract, bg)
            stat = synth_housing_stat
            dof = count_housing - 1

            if dof == 0:
                p_value = 1
            else:
                # chi2.sf is the supported replacement for the removed
                # scipy.stats.chisqprob helper.
                p_value = scipy.stats.chi2.sf(stat, dof)

            # Keep the draw when it improves either the p-value or the statistic.
            if p_value > max_p or stat < min_chi:
                max_p = p_value
                max_p_housing_attributes = synthetic_housing_attributes
                max_p_person_attributes = synthetic_person_attributes
                min_chi = stat

        # Drop the reference to the sparse matrix; it is not used below.
        sp_matrix = None

        if draw_count >= parameters.synPopDraws:
            print('Max Iterations (%d) reached for drawing households with the best draw having a p-value of %.4f'
                  % (parameters.synPopDraws, max_p))
            # No draw reached a non-zero p-value: report and store the last draw.
            if max_p == 0:
                max_p = p_value
                max_p_housing_attributes = synthetic_housing_attributes
                max_p_person_attributes = synthetic_person_attributes
                min_chi = stat
        else:
            print('Population with desirable p-value of %.4f was obtained in %d iterations'
                  % (max_p, draw_count))

        print('draw_count - %s, pvalue - %s, chi value - %s' % (draw_count, max_p, min_chi))

        # Two storing routines are selected on the number of rows in the
        # retained housing table.
        if max_p_housing_attributes.shape[0] < 2500:
            drawing_households.storing_synthetic_attributes1(
                db, 'housing', max_p_housing_attributes, county, tract, bg)
            drawing_households.storing_synthetic_attributes1(
                db, 'person', max_p_person_attributes, county, tract, bg)
        else:
            drawing_households.storing_synthetic_attributes2(
                db, 'housing', max_p_housing_attributes, county, tract, bg)
            drawing_households.storing_synthetic_attributes2(
                db, 'person', max_p_person_attributes, county, tract, bg)

        values = (int(state), int(county), int(tract), int(bg),
                  min_chi, max_p, draw_count, iteration, conv_crit_array[-1])
        drawing_households.store_performance_statistics(db, geo, values)

        print('Number of Synthetic Household/Group quarters - %d'
              % ((max_p_housing_attributes[:, -2]).sum()))
        for i, variable in enumerate(hhld_control_variables):
            print('%s variable\'s marginal distribution sum is %d'
                  % (variable, round(sum(hhld_marginals[i]))))
        for i, variable in enumerate(gq_control_variables):
            print('%s variable\'s marginal distribution sum is %d'
                  % (variable, round(sum(gq_marginals[i]))))

        db.commit()
    finally:
        # Release the cursor and connection whether or not a step failed.
        dbc.close()
        db.close()

    print('Blockgroup synthesized in %.4f s' % (time.time() - tii))