def main(): """Go through list of diseases, find charities for each one, scrape web for further information, and build database.""" # Cycle through diseases disease_list = master_disease_list.return_diseases() for disease in disease_list: clean_disease_name = ' '.join(disease.lower().replace('\'s disease','').split()) print "\nProcessing data for " + clean_disease_name + "...\n" # TO DO: allow synonymous names (e.g. colon vs colorectal cancer) # Scrape data bbb_soups = scrape_bbb_urls(clean_disease_name) bbb_charities = clean_bbb_info(bbb_soups,clean_disease_name) charities_with_links = get_charity_links(bbb_charities) charities_with_social = scrape_social_media(charities_with_links) char_with_twitter = get_twitter_followers(charities_with_social) char_with_nav_link = get_char_nav_info(char_with_twitter) # Convert to Pandas and then to SQl panda_char = pd.DataFrame(char_with_nav_link) if len(panda_char) > 0: panda_clean = clean_features(panda_char) sqlrows = convert_to_sql(panda_clean,clean_disease_name) # Print message to check that code is working and summarize data print "\nCharities focused on " + clean_disease_name + ": " + str(len(sqlrows)) + "\n" print "Done."
def main(): """Set up SQL table with distributions of features for faster conversion of user input.""" # Set up SQL connection mysqlauth = pd.DataFrame.from_csv('/home/kristy/Documents/auth_codes/mysql_user.csv') sqluser = mysqlauth.username[0] sqlpass = mysqlauth.password[0] con = mdb.connect(user=sqluser, host="localhost", db="charity_data", password=sqlpass, charset='utf8') # Set up a dataframe and get the list of diseases. combined_panda = pd.DataFrame(dtype=float) disease_list = master_disease_list.return_diseases() with con: # Read in SQL data for each disease. for disease in disease_list: clean_disease_name = '_'.join(disease.lower().replace('\'s disease','').replace('\'s','').split()) pandadf = pd.read_sql("SELECT cn_overall, cn_financial, cn_acct_transp, year_incorporated, age, twitter_followers, \ percent_admin, percent_fund, percent_program, \ total_contributions, total_expenses, total_revenue, staff_size, board_size FROM " + str(clean_disease_name), con) if len(pandadf) > 0: combined_panda = pd.concat([combined_panda, pandadf], 0, ignore_index=True) # Missing values were encoding as -1 in SQL. We want them to be NaNs now. for idx in range(len(combined_panda)): if combined_panda['year_incorporated'][idx] == -1.: combined_panda[idx:(idx+1)]['age'] = np.nan combined_panda[idx:(idx+1)]['year_incorporated'] = np.nan for col in combined_panda.columns: if combined_panda[col][idx] == -1: combined_panda[idx:(idx+1)][col] = np.nan # Make data frame with distributions of each feature. distribution = pd.DataFrame(columns=combined_panda.columns, index=['p0','p17','p25','p50','p75','p83','p100'], dtype='float') # Store specified percentages of each variable. for col in distribution.columns: quantiles = combined_panda[col].describe(percentile_width=50.) distribution.loc['p0',col] = quantiles['min'] distribution.loc['p25',col] = quantiles['25%'] distribution.loc['p50',col] = quantiles['50%'] distribution.loc['p75',col] = quantiles['75%'] distribution.loc['p100',col] = quantiles['max'] quantiles = combined_panda[col].describe(percentile_width=66.7) distribution.loc['p17',col] = quantiles['16.6%'] distribution.loc['p83',col] = quantiles['83.4%'] distribution['distidx'] = distribution.index # Save distribution table db = MySQLdb.connect(read_default_file='/home/kristy/Documents/auth_codes/my.cnf') db.query("CREATE DATABASE IF NOT EXISTS charity_data;") db.query("USE charity_data;") db.query("DROP TABLE IF EXISTS distribution;") distribution.to_sql(name = 'distribution', con = db, flavor = 'mysql') db.close()