def separateByRegion(CDC_Data, state_label, has_abbrev):
    """Label each row of CDC_Data with a region derived from its state.

    Parameters:
        CDC_Data: pandas DataFrame containing a state column (mutated in place
            and also returned).
        state_label: name of the column holding the state value.
        has_abbrev: True when the state column already holds abbreviations;
            False when it holds full names that must be abbreviated first.

    Returns:
        The same DataFrame with a new 'region' column.

    Fixes vs. original: replaced the non-idiomatic `has_abbrev == False` test
    with `not has_abbrev`, removed the duplicated 'region' assignment that
    appeared identically in both branches, and dropped the redundant
    `lambda x: f(x)` wrappers around `apply`.
    """
    new_CDC_data = CDC_Data
    if not has_abbrev:
        # Convert full state names to their abbreviations first so that
        # categorization (which expects abbreviations) works uniformly below.
        new_CDC_data[state_label] = new_CDC_data[state_label].apply(abbreviate)
    # Both original branches performed exactly this assignment, so it is
    # hoisted out of the conditional.
    new_CDC_data['region'] = new_CDC_data[state_label].apply(categorization)
    return new_CDC_data
def separateByRegion(CDC_Data):
    """Attach a 'REGION' column computed from the 'STATE_ABBR' column.

    The input DataFrame is modified in place and returned for convenience.
    Region assignment is delegated to the module-level `categorization`
    helper, which maps a state abbreviation to its region label.
    """
    labeled = CDC_Data
    regions = labeled['STATE_ABBR'].apply(lambda abbr: categorization(abbr))
    labeled['REGION'] = regions
    return labeled
def separateByRegion(CDC_Data):
    """Abbreviate the 'Area' column in place, then derive a 'region' column.

    The 'Area' column is first rewritten from full state names to
    abbreviations via `abbreviate`; `categorization` then maps each
    abbreviation to its region. The (mutated) DataFrame is returned.
    """
    frame = CDC_Data
    # Rewrite full state names as abbreviations so categorization can run.
    frame['Area'] = frame['Area'].apply(lambda name: abbreviate(name))
    frame['region'] = frame['Area'].apply(lambda abbr: categorization(abbr))
    return frame
def binRate(CDC_Data):
    """Bin the z-scored 'AgeAdjustedRate' column into five named levels.

    Adds a 'Rate_categ' column to CDC_Data (mutated in place and returned)
    with one of: 'very high', 'high', 'medium', 'low', 'very low'.

    Fixes vs. original: the top bin was labeled 'very_high' (underscore)
    while the bottom bin was 'very low' (space), inconsistent both within
    this function and with the sibling binRate(CDC_Data, data_label)
    variant in this file; normalized to 'very high'. Also simplified the
    redundant compound range checks (each `elif` already implies the upper
    bound from the preceding branch).
    """
    new_CDC_data = CDC_Data

    def categorization(value):
        # Thresholds are z-score cutoffs: +/-1 and +/-2 standard deviations.
        if value > 2.0:
            return 'very high'
        elif value > 1.0:
            return 'high'
        elif value > -1.0:
            return 'medium'
        elif value > -2.0:
            return 'low'
        else:
            return 'very low'

    new_CDC_data['Rate_categ'] = new_CDC_data['AgeAdjustedRate'].apply(categorization)
    return new_CDC_data
def binRate(CDC_Data, data_label):
    """Bin a z-scored numeric column into five named levels.

    Parameters:
        CDC_Data: pandas DataFrame (mutated in place and returned).
        data_label: name of the numeric column to bin.

    Adds a new column named `<data_label>_bin` holding one of
    'very high', 'high', 'medium', 'low', 'very low', using cutoffs at
    +/-1 and +/-2.
    """
    binned = CDC_Data

    def categorization(value):
        # Guard-clause form: each earlier return already bounds the later
        # comparisons, so no compound ranges are needed.
        if value > 2.0:
            return 'very high'
        if value > 1.0:
            return 'high'
        if value > -1.0:
            return 'medium'
        if value > -2.0:
            return 'low'
        return 'very low'

    # Apply the binning and create the new column.
    binned[data_label + "_bin"] = binned[data_label].apply(categorization)
    return binned
def main(): print("main") # Read in data directly into pandas cleaned_CDC_Data = pd.read_csv('USCS_CancerTrends_OverTime_ByState.csv', sep=',', encoding='latin1') #normalized_CDC_Data = normalizeCDC(cleaned_CDC_Data, ['AgeAdjustedRate']) #z score normalization normalized_CDC_Data = cleaned_CDC_Data normalized_CDC_Data.dropna(inplace=True) normalized_CDC_Data = separateByRegion(normalized_CDC_Data) binned_CDC_Data = binRate(normalized_CDC_Data) #pprint(binned_CDC_Data) year_start = 1999 year_end = 2017 pprint(binned_CDC_Data.columns) p_val_df = pd.DataFrame() p_val_df["STATE"] = binned_CDC_Data["Area"].unique() state_avg_cancer = [] for state in binned_CDC_Data["Area"].unique(): state_avg_cancer.append(binned_CDC_Data.loc[ binned_CDC_Data['Area'] == state]['AgeAdjustedRate'].mean()) #pprint(state_avg_cancer) p_val_df["AgeAdjustedRate"] = state_avg_cancer p_val_df["region"] = p_val_df["STATE"].apply(lambda x: categorization(x)) pprint(p_val_df) makeHeatMap_pval(p_val_df, "") #---------------------------------------------------------- mean_list = [] color_counter = 0 colors = ["green", "blue", "red", "black", "purple"] for each in binned_CDC_Data['region'].unique(): for yr in range(year_start, year_end, 1): mean_list.append( (binned_CDC_Data.loc[binned_CDC_Data['region'] == each].loc[ binned_CDC_Data['Year'] == yr])['AgeAdjustedRate'].mean()) #print(mean_list) lineGraphByRegion(list(range(year_start, year_end)), mean_list, colors[color_counter], each) mean_list = [] color_counter += 1 #plt.clf() plt.show() plt.clf() #heat map of linear regression correlations between each regions makeHeatMap_corr(binned_CDC_Data)
def main():
    """Build a state-level network from merged cancer/chemical-release data.

    Pipeline: read 'merged_data2.csv', label timezone regions, keep the key
    columns, z-normalize the cancer and chemical columns per year, bin the
    z-scores, compute each state's modal bin, then emit a fully connected
    state graph with edge weights to 'network_df.csv' and
    'final_network.csv'.

    Side effects: reads/writes CSV files in the working directory and prints
    intermediate frames. Relies on project helpers defined elsewhere:
    separateByRegion, normalize_byQuestion, binRate, categorization,
    getWeights.

    Fix vs. original: the REGION column was assigned inside a
    `for state in network_df['STATE_ABBR']` loop that re-ran the identical
    whole-column `apply` once per state — O(n^2) redundant work with no
    per-iteration effect. The loop is removed; a single `apply` produces the
    same column.
    """
    print("main")
    # read in the merged data set with the per capita chemical release estimates
    merged_data = pd.read_csv('merged_data2.csv', sep=',', encoding='latin1')
    # create a copy of the dataframe and add the timezone region labels
    new_data = merged_data.copy()
    region_data = separateByRegion(new_data)
    # drop all extra columns except region, year, state, cancer and chemicals
    region_data = region_data.loc[:, region_data.columns.intersection([
        'YEAR', 'REGION', 'STATE_ABBR', 'AVG_REL_EST_TOTAL_PER_CAPITA',
        'AGE_ADJUSTED_CANCER_RATE'
    ])]
    # drop rows with empty values
    region_data.dropna(inplace=True)
    pprint(len(region_data))
    # save the cancer/chemical rates before normalization by z-score and
    # rename them so they can be concatenated back in afterwards
    cancer_series = region_data["AGE_ADJUSTED_CANCER_RATE"]
    cancer_series.rename("AGE_ADJUSTED_CANCER_RATE_ORIG", inplace=True)
    chemical_series = region_data["AVG_REL_EST_TOTAL_PER_CAPITA"]
    chemical_series.rename("AVG_REL_EST_TOTAL_PER_CAPITA_ORIG", inplace=True)
    # normalize the cancer and chemical release estimate values (per YEAR)
    normalized_data = normalize_byQuestion(region_data, 'YEAR',
                                           'AGE_ADJUSTED_CANCER_RATE')
    normalized_data = normalize_byQuestion(normalized_data, 'YEAR',
                                           'AVG_REL_EST_TOTAL_PER_CAPITA')
    # add back in the original un-normalized chemical release and cancer rate data
    normalized_data = pd.concat(
        [normalized_data, cancer_series, chemical_series], axis=1)
    # rename columns to represent the values in them: the normalized columns
    # get a _Z suffix, and the saved originals get their plain names back
    normalized_data.rename(columns={
        "AGE_ADJUSTED_CANCER_RATE": "AGE_ADJUSTED_CANCER_RATE_Z",
        "AVG_REL_EST_TOTAL_PER_CAPITA": "AVG_REL_EST_TOTAL_Z"
    }, inplace=True)
    normalized_data.rename(columns={
        "AGE_ADJUSTED_CANCER_RATE_ORIG": "AGE_ADJUSTED_CANCER_RATE",
        "AVG_REL_EST_TOTAL_PER_CAPITA_ORIG": "AVG_REL_EST_TOTAL_PER_CAPITA"
    }, inplace=True)
    #pprint(normalized_data)
    # create a dataframe for the network (one row per state)
    network_df = pd.DataFrame()
    # sort the data by state and then year
    normalized_data.sort_values(by=['STATE_ABBR', 'YEAR'], inplace=True)
    # bin the cancer rate and chemical release estimate by z-score and add
    # columns for that
    binned_data = binRate(normalized_data, 'AVG_REL_EST_TOTAL_Z')  # now columnname_bin
    binned_data = binRate(binned_data, 'AGE_ADJUSTED_CANCER_RATE_Z')  # now columnname_bin
    pprint(binned_data)
    # for the network dataframe, get the unique state values and create a column
    network_df['STATE_ABBR'] = normalized_data["STATE_ABBR"].unique(
    )  # alphabetical for states
    # add a column for the timezone region labels (single vectorized pass;
    # originally this was wrapped in a redundant per-state loop)
    network_df['REGION'] = network_df['STATE_ABBR'].apply(categorization)
    #pprint(network_df)
    # in the network dataframe, add columns for cancer and chemical level by
    # the most common bin value over the years for each state
    mode_chemical = []
    mode_cancer = []
    for state in network_df['STATE_ABBR'].unique():
        # .mode() can return several values; keep the first (lowest) one
        mode_chemical.append(
            normalized_data[normalized_data['STATE_ABBR'] ==
                            state]['AVG_REL_EST_TOTAL_Z_bin'].mode().iat[0])
        mode_cancer.append(
            normalized_data[normalized_data['STATE_ABBR'] == state]
            ['AGE_ADJUSTED_CANCER_RATE_Z_bin'].mode().iat[0])
    network_df['CHEMICAL_LEVEL'] = mode_chemical
    network_df['CANCER_LEVEL'] = mode_cancer
    network_df.to_csv("network_df.csv")
    #pprint(network_df)
    #--------------------------------------
    # create a final network dataframe which is essentially the edge list:
    # every unordered pair of state indices
    final_network = pd.DataFrame(data=list(
        combinations(network_df.index.tolist(), 2)),
                                 columns=['Src', 'Dst'])
    # map each numerical index to the state abbreviation it corresponds to
    states_df = pd.concat([network_df["STATE_ABBR"]], axis=1)
    states_dict = states_df.to_dict('index')
    new_states_dict = {
        position: row.get("STATE_ABBR")
        for position, row in enumerate(states_dict.values())
    }
    # get the list of weights for each edge
    final_weights = getWeights(final_network, network_df, normalized_data)
    #print(final_weights)
    # add a column for the edge weights
    final_network['Wght'] = final_weights
    # replace the state numbers with the state abbreviations
    final_network.replace(new_states_dict, inplace=True)
    pprint(final_network)
    # save the final network of edges and weights to a .csv file
    final_network.to_csv("final_network.csv", index=False)  # has full connectivity