def mergeCancer_County():

	# allCanMrg = readAllCancer_County()
	acsCounty = popData('county')
	indivCanMrg = readIndivCancer_County()
	indivCanMrgPop = pd.merge(indivCanMrg, acsCounty[['countyFIPS', 'totPop']], left_on = 'countyCode', right_on = 'countyFIPS')
	canCols = indivCanMrgPop.columns.values[1:-3]
	for i in canCols:
		indivCanMrgPop[i+'_Per100k'] = indivCanMrgPop[i]*100000/indivCanMrgPop['totPop']
	indivCanMrgPop.drop(canCols, inplace=True,axis=1)
	indivCanMrgPop['geoid5'] = '36'+indivCanMrgPop['countyFIPS']
	newCols = [x for x in list(indivCanMrgPop.columns.values) if x not in ['countyCode', 'countyFIPS', 'totPop', 'countyName', 'geoid5']]
	for x in ['totPop', 'countyName', 'countyFIPS', 'geoid5']:
		newCols.insert(0, x)
	indivCanMrgPop = indivCanMrgPop[newCols]
	return indivCanMrgPop
Esempio n. 2
0
def main_CensusTract():
	# Import air data
	airEmissions = read_airEmissions_CensusTract()
	#print airEmissions['n_5_1_fugitive_air_dioxin'].describe()

	# Import cancer data
	# allCancer = readIndivCancer_CensusTract()  
	allCancer = mergeCancer_Tract()
	#allCancer.to_csv('/Users/Steve/Github/ny_cancerAnalysis/data/NYSDOH_CancerMapping_Data_2005_2009/allCancer.csv')

	# Import smoking data
	smoking = readSmoking()   

	# Import census tract level population data
	acsTract = popData('tract')

	# Import cancer risk data
	cancerRisk = read_cancerRisk_CensusTract()

	# Join air emission data with cancer rates data
	data_merged = pd.merge(allCancer, airEmissions, how='left', left_on = 'geoid11', right_on = 'geoid')
	data_merged.fillna(0, inplace=True)  # To avoid losing those tracts in model with smoking and demographic data but no chemical releases 
	data_merged = data_merged.drop('geoid', 1)	
	data_merged['countyCode'] = data_merged['tractFIPS'].str[:3]
	data_merged = pd.merge(data_merged, smoking, left_on = 'countyCode', right_on = 'cCode')
	data_merged = pd.merge(data_merged, acsTract, left_on = 'geoid11', right_on = 'Geo_FIPS')
	data_merged = pd.merge(data_merged, cancerRisk, left_on = 'Geo_FIPS', right_on = 'GEOID')

	# Produce correlation table
	correlation_table = data_merged.corr()
	correlation_table.to_csv('data/CorrelationTable/censusTract_correlationTable.csv')

	cancer_list = ['observed_Bladder_Per100k', 'observed_Bone_Per100k',	'observed_Brain_Per100k', 
	'observed_Breast_Per100k', 'observed_Colorectal_Per100k','observed_Esophagus_Per100k', 'observed_Kidney_Per100k', 
	'observed_Larynx_Per100k', 'observed_Leukemia_Per100k',	'observed_Liver_Per100k', 'observed_Lung_Per100k',	
	'observed_Mesothelioma_Per100k', 'observed_NHL_Per100k', 'observed_Nasal_Per100k', 'observed_Oral_Per100k',	
	'observed_Other_Per100k', 'observed_Ovary_Per100k',	'observed_Pancreas_Per100k', 'observed_Prostate_Per100k',	
	'observed_Soft_Tissue_Per100k',	'observed_Stomach_Per100k',	'observed_Testis_Per100k', 'observed_Thyroid_Per100k',	
	'observed_Uterus_Per100k', 'observed_Total_Per100k']

	chemical_list = ['n_5_1_fugitive_air', 'n_5_2_stack_air', 'airTotal', 'n_5_1_fugitive_air_benzene',
	'n_5_2_stack_air_benzene', 'benzeneTotal', 'n_5_1_fugitive_air_toluene', 'n_5_2_stack_air_toluene', 
	'tolueneTotal', 'n_5_1_fugitive_air_ethylbenzene', 'n_5_2_stack_air_ethylbenzene', 'ethylbenzeneTotal',
	'n_5_1_fugitive_air_xylene', 'n_5_2_stack_air_xylene', 'xyleneTotal', 'n_5_1_fugitive_air_formaldehyde',
	'n_5_2_stack_air_formaldehyde', 'formaldehydeTotal', 'BTEX_fugitive', 'BTEX_stack', 'BTEX_total', 
	'n_5_1_fugitive_air_dioxin', 'n_5_2_stack_air_dioxin', 'dioxinTotal']

	for chemical in chemical_list:
		with open('data/Regression/'+chemical+'.csv', 'w') as f:
			# columns = ['Cancer', 'Coefficient', 'p-Value', 'Std. Error', 'Adj. R']
			# result_df = pd.DataFrame(index=columns)
			for cancer in cancer_list:
				mod = smf.ols(formula=cancer+' ~ '+chemical+' + \
					pctSmoking + pctElderly + income + higherEd + unemploy', data = data_merged).fit(cov_type='HC0')
				 
				result_df = pd.DataFrame({
					'Cancer': cancer,
					'Coefficient': mod.params.apply(lambda x: round(x, 3)),
		            'p-Value': mod.pvalues.apply(lambda x: round(x, 3)),
		            'Std. Error': mod.bse.astype(int),
		            'Adj. R': round(mod.rsquared_adj, 3)})

				# Coefficient for air emission feature is mod.params[1]
				
				if mod.params[1] > 1.0:
					print 'Cancer and pollutant combination with Coefficient > 1:'
					print mod.params[1], mod.pvalues[1], cancer, chemical  

				result_df.to_csv(f)

	# Test model
	mod = smf.ols(formula='observed_Esophagus_Per100k ~ n_5_1_fugitive_air_dioxin + \
	pctSmoking + pctElderly + income + higherEd + unemploy', data = data_merged).fit(cov_type='HC0')
	print mod.summary()
	# Correlation Table Heat Map
	hm(correlation_table)
	return data_merged