def main():
    """Go through list of diseases, find charities for each one, scrape web for further information, and build database."""

    # Cycle through diseases
    disease_list = master_disease_list.return_diseases()

    for disease in disease_list:
        clean_disease_name = ' '.join(disease.lower().replace('\'s disease','').split())
        print "\nProcessing data for " + clean_disease_name + "...\n"
        # TO DO: allow synonymous names (e.g. colon vs colorectal cancer)
        
        # Scrape data
        bbb_soups = scrape_bbb_urls(clean_disease_name)
        bbb_charities = clean_bbb_info(bbb_soups,clean_disease_name)
        charities_with_links = get_charity_links(bbb_charities)
        charities_with_social = scrape_social_media(charities_with_links)
        char_with_twitter = get_twitter_followers(charities_with_social)
        char_with_nav_link = get_char_nav_info(char_with_twitter)
        
        # Convert to Pandas and then to SQl
        panda_char = pd.DataFrame(char_with_nav_link)
        if len(panda_char) > 0:
            panda_clean = clean_features(panda_char)
            sqlrows = convert_to_sql(panda_clean,clean_disease_name)
            
            # Print message to check that code is working and summarize data
            print "\nCharities focused on " + clean_disease_name + ": " + str(len(sqlrows)) + "\n"
    print "Done."
def main():
    """Set up SQL table with distributions of features for faster conversion of user input."""

    # Set up SQL connection
    mysqlauth = pd.DataFrame.from_csv('/home/kristy/Documents/auth_codes/mysql_user.csv')
    sqluser = mysqlauth.username[0]
    sqlpass = mysqlauth.password[0]
    
    con = mdb.connect(user=sqluser, host="localhost", db="charity_data", password=sqlpass, charset='utf8')
    
    # Set up a dataframe and get the list of diseases.
    combined_panda = pd.DataFrame(dtype=float)
    disease_list = master_disease_list.return_diseases()

    with con:
        # Read in SQL data for each disease.
        for disease in disease_list:
            clean_disease_name = '_'.join(disease.lower().replace('\'s disease','').replace('\'s','').split())
            pandadf = pd.read_sql("SELECT cn_overall, cn_financial, cn_acct_transp, year_incorporated, age, twitter_followers, \
                percent_admin, percent_fund, percent_program, \
                total_contributions, total_expenses, total_revenue, staff_size, board_size FROM " + str(clean_disease_name), con)
            if len(pandadf) > 0:
                combined_panda = pd.concat([combined_panda, pandadf], 0, ignore_index=True)

    # Missing values were encoding as -1 in SQL. We want them to be NaNs now.
    for idx in range(len(combined_panda)):
        if combined_panda['year_incorporated'][idx] == -1.:
            combined_panda[idx:(idx+1)]['age'] = np.nan
            combined_panda[idx:(idx+1)]['year_incorporated'] = np.nan
        for col in combined_panda.columns:
            if combined_panda[col][idx] == -1:
                combined_panda[idx:(idx+1)][col] = np.nan

    # Make data frame with distributions of each feature.
    distribution = pd.DataFrame(columns=combined_panda.columns, index=['p0','p17','p25','p50','p75','p83','p100'], dtype='float')

    # Store specified percentages of each variable.
    for col in distribution.columns:
        quantiles = combined_panda[col].describe(percentile_width=50.)
        distribution.loc['p0',col] = quantiles['min']
        distribution.loc['p25',col] = quantiles['25%']
        distribution.loc['p50',col] = quantiles['50%']
        distribution.loc['p75',col] = quantiles['75%']
        distribution.loc['p100',col] = quantiles['max']
        
        quantiles = combined_panda[col].describe(percentile_width=66.7)
        distribution.loc['p17',col] = quantiles['16.6%']
        distribution.loc['p83',col] = quantiles['83.4%']
    distribution['distidx'] = distribution.index 

    # Save distribution table
    db = MySQLdb.connect(read_default_file='/home/kristy/Documents/auth_codes/my.cnf')
    db.query("CREATE DATABASE IF NOT EXISTS charity_data;")
    db.query("USE charity_data;")
    db.query("DROP TABLE IF EXISTS distribution;")
    distribution.to_sql(name = 'distribution', con = db, flavor = 'mysql')
    db.close()