# Cast the P0010001 column to integers, then show the first rows of the
# NAME / P0010001 columns ordered from the smallest P0010001 value upwards
# (A23 below describes P0010001 as the population).
# NOTE(review): DataFrame.sort() is the legacy spelling; it was deprecated in
# pandas 0.17 and later removed in favour of sort_values("P0010001").
df1.P0010001 = df1.P0010001.astype(int)
df1[["NAME", "P0010001"]].sort("P0010001", ascending=True).head()

# <markdowncell>

# **Q23**: Describe the output of the following:
#
# ```Python
# df1.P0010001 = df1.P0010001.astype(int)
# df1[['NAME','P0010001']].sort('P0010001', ascending=True).head()
# ```

# <markdowncell>

# **A23**:
# A DataFrame (with 5 rows and 2 columns (NAME, P0010001)) listing the 5 least populous states in ascending order by population.

# <codecell>

# Make the NAME column the row index (modifying df1 in place), then look up
# the row labelled "Nebraska".
# NOTE(review): the .ix indexer was deprecated in pandas 0.20 and removed in
# 1.0; the label-based equivalent on current pandas is df1.loc["Nebraska"].
df1.set_index("NAME", inplace=True)
df1.ix["Nebraska"]

# <markdowncell>

# **Q24**: After running:
#
# ```Python
#     df1.set_index('NAME', inplace=True)
# ```
#
# how would you access the Series for the state of Nebraska?
#
# 1. `df1['Nebraska']`
# 1. `df1[1]`
# 1. `df1.ix['Nebraska']`
# 1. `df1[df1['NAME'] == 'Nebraska']`

# <markdowncell>

# **A24**:
# <pre>
# 3
# </pre>

# <codecell>

# Count the entries in states.STATES.
len(states.STATES)

# <markdowncell>

# **Q25**. What is `len(states.STATES)`?

# <markdowncell>

# **A25**:
# <pre>
# 51
# </pre>

# <codecell>

# np.in1d builds a boolean mask that is True where df1.state matches the fips
# of an entry in states.STATES; the result is the number of rows that match.
len(df1[np.in1d(df1.state, [s.fips for s in states.STATES])])

# <markdowncell>

# **Q26**. What is
#
# ```Python
# len(df1[np.in1d(df1.state, [s.fips for s in states.STATES])])
# ```

# <markdowncell>

# **A26**:
# <pre>
# 51
# </pre>

# <markdowncell>

# In the next question, we will make use of the negation operator `~`. Take a look at a specific example:

# <codecell>

# Applying ~ to a boolean Series flips every element (True <-> False).
~Series([True, True, False, True])

# <codecell>

# Negate the membership mask to keep the rows whose state code is NOT the fips
# of an entry in states.STATES, then take the first index label of those rows.
list(df1[~np.in1d(df1.state, [s.fips for s in states.STATES])].index)[0]

# <markdowncell>

# **Q27**. What is
#
# ```Python
#     list(df1[~np.in1d(df1.state, [s.fips for s in states.STATES])].index)[0]
# ```

# <markdowncell>

# **A27**:
# <pre>
# Puerto Rico
# </pre>

# <markdowncell>

# Consider `pop1` and `pop2`:

# <codecell>

# pop1: sum of P0010001 over every row of df1.
# pop2: the same sum restricted to rows whose state code is the fips of an
#       entry in states.STATES.
# The difference is therefore the total for the rows outside states.STATES.
pop1 = df1["P0010001"].astype("int").sum()
pop2 = df1[np.in1d(df1.state, [s.fips for s in states.STATES])]["P0010001"].astype("int").sum()

pop1 - pop2

# <markdowncell>

# **Q28**. What does `pop1 - pop2` represent?

# <markdowncell>

# **A28**:
# The population of Puerto Rico in the 2010 Census.

# <headingcell level=1>

# Generator and range

# <codecell>

# range(1, 101) runs from 1 through 100 (the stop value is excluded), so this
# adds up the integers 1..100.
sum(range(1, 101))

# <markdowncell>

# **Q29**. Given that
#
#     range(10)
#
# is
#
#     [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
#
# How would you compute the sum of every integer from 1 to 100?
#
# 1. `sum(range(1, 101))`
# 1. `sum(range(100))`
# 1. `sum(range(1, 100))`
# 1. None of the above

# <markdowncell>

# **A29**:
# <pre>
# 1
# </pre>

# <codecell>

# itertools is a great library
# http://docs.python.org/2/library/itertools.html#itertools.count
# itertools.count(start=0, step=1):
# "Make an iterator that returns evenly spaced values starting with n." (n is the start value)

from itertools import islice, count

# count(0, 1) starts at 0 and advances by 1; each .next() call returns the
# next value of the iterator.
# NOTE(review): the print statement and the .next() method are Python 2 only;
# next(c) is the spelling that works on both Python 2.6+ and Python 3.
c = count(0, 1)
print c.next()
print c.next()

# <markdowncell>

# **Q30**. What output is produced from
#
# ```Python
# # itertools is a great library
# # http://docs.python.org/2/library/itertools.html#itertools.count
# # itertools.count(start=0, step=1):
# # "Make an iterator that returns evenly spaced values starting with n." (n is the start value)
#
# from itertools import islice, count
# c = count(0, 1)
# print c.next()
# print c.next()
# ```

# <markdowncell>

# **A30**:
# <pre>
# 0
# 1
# </pre>

# <codecell>

# np.arange(101) holds the integers 0..100; every element is doubled and the
# doubled values are then added up.
(2 * Series(np.arange(101))).sum()

# <markdowncell>

# **Q31**. Recalling that
#
#     1+2+3+...+100 = 5050
#
# what is:
#
# ```Python
# (2*Series(np.arange(101))).sum()
# ```

# <markdowncell>

# **A31**:
# <pre>
# 10100
# </pre>

# <headingcell level=1>

# Census Places

# <markdowncell>

# Consider the following generator that we used to query for census places.

# <codecell>

import pandas as pd
from pandas import DataFrame

import census
import settings
import us

from itertools import islice

# Census API client built from the key stored in the local settings module;
# the generator functions in this file send their queries through c.sf1.get().
c = census.Census(settings.CENSUS_KEY)


def places(variables="NAME"):
    """Generate census place records for every entry of ``us.states.STATES``.

    ``variables`` is handed unchanged to ``c.sf1.get``. One query is issued
    per state and every record it returns is yielded as-is.
    """
    for st in us.states.STATES:
        in_clause = "state:{s_fips}".format(s_fips=st.fips)
        query = {"for": "place:*", "in": in_clause}
        results = c.sf1.get(variables, geo=query)
        for record in results:
            yield record


# <markdowncell>

# Now we compute a DataFrame for the places: `places_df`

# <codecell>

# Drain the places() generator into a list (an islice stop of None imposes no
# limit), build a DataFrame from the records and cast P0010001 to integers.
r = list(islice(places("NAME,P0010001"), None))
places_df = DataFrame(r)
places_df.P0010001 = places_df.P0010001.astype("int")
def places(variables="NAME"):
    """Placeholder generator: accepts ``variables`` but yields nothing.

    Replace the body with your own code.
    """
    return
    yield  # never reached; its presence is what makes this a generator function


# <codecell>

# use this code to run your code
# I recommend replacing the None in islice with a small number to make sure you're on
# the right track

# Collect the records produced by places() (a stop of None means "no limit"),
# load them into a DataFrame and cast P0010001 to integers.
r = list(islice(places("NAME,P0010001"), None))
places_df = DataFrame(r)
places_df.P0010001 = places_df.P0010001.astype("int")

# FIPS = each row's "state" value followed by its "place" value
# (assumes both columns hold strings, so + concatenates -- TODO confirm)
places_df["FIPS"] = places_df.apply(lambda s: s["state"] + s["place"], axis=1)

# NOTE(review): Python 2 print statements
print "number of places", len(places_df)
print "total pop", places_df.P0010001.sum()
places_df.head()

# <codecell>

# if you've done this correctly, the following asserts should stop complaining

# expected sum of P0010001 over all rows of places_df
assert places_df.P0010001.sum() == 228457238
# number of places in 2010 Census
assert len(places_df) == 29261
def states(variables="NAME"):
    """Generate the state-level records whose code is in ``us.states.STATES``.

    A single ``state:*`` query for the year 2010 is sent through
    ``c.sf1.get`` with ``variables`` handed over unchanged. Records whose
    ``"state"`` value is not the fips of an entry in ``us.states.STATES``
    are dropped.
    """
    # need to filter out non-states
    known_fips = {st.fips for st in us.states.STATES}
    records = c.sf1.get(variables, geo={"for": "state:*"}, year=2010)
    for record in records:
        if record["state"] not in known_fips:
            continue
        yield record


# <codecell>

# make a dataframe from the total populations of states in the 2010 Census

# states() is a generator; DataFrame consumes the records it yields
df = DataFrame(states("NAME,P0010001"))
# cast P0010001 to integers so that it can be summed numerically
df.P0010001 = df.P0010001.astype("int")
df.head()

# <codecell>

# check that we have the right total population

# evaluates to True when the P0010001 values add up to 308745538; unlike an
# assert, a False result is only displayed, not raised
df.P0010001.sum() == 308745538

# <codecell>

# add a column with the first letter
# we'll be grouping states based on the first letter of the state NAME

# s[0] is the first character of each NAME value
df["first_letter"] = df.NAME.apply(lambda s: s[0])
df.head()
# <codecell>

# NOTE(review): diversity() is not defined in this part of the file; r is
# whatever an earlier cell left bound to that name.
df2 = diversity(r)

# <codecell>

# Order df2 by its entropy5 column, largest value first.
# NOTE(review): sort_index(by=...) is the legacy spelling; current pandas uses
# sort_values("entropy5", ascending=False).
df2.sort_index(by='entropy5',ascending=False)

# <codecell>

# Collect every record produced by msas() (not defined in this part of the
# file); an islice stop of None imposes no limit.
msas_list = list(islice(msas('NAME,P0010001'),None))

# <codecell>

len(msas_list)

# <codecell>

# NOTE: this rebinds df, replacing any DataFrame previously held under that name
df = DataFrame(msas_list)

# <codecell>

df.P0010001 = df.P0010001.astype('int')

# <codecell>

# total P0010001 for each value of the metropolitan/micropolitan area column
df.groupby('metropolitan statistical area/micropolitan statistical area').apply(lambda x:sum(x['P0010001']))

# <codecell>

type(r)
# Cast P0010001 to integers, then total it across counties_df
# (counties_df is created outside this part of the file).
counties_df.P0010001 = counties_df.P0010001.astype("int")
counties_df.P0010001.sum()

# <markdowncell>

# One reason for writing all the counties in the form of a Python generator is that you can easily control the number of counties we work with at any given time -- and then easily scale out to get all of them.

# <codecell>

# make a list of the first ten counties

from itertools import islice

# islice stops after 10 items, so only the first ten records are pulled from
# the counties2() generator (defined outside this part of the file)
list(islice(counties2(), 10))

# <headingcell level=1>

# Generator for Census Tracts

# <markdowncell>

# The following generator loops through all the states to get at the individual counties to then get at the census tracts.

# <codecell>


def tracts(variables="NAME"):
    """Generate census tract records for every county of every state.

    For each entry of ``us.states.STATES`` the counties are listed with one
    ``c.sf1.get("NAME", ...)`` query; one further query per county then
    fetches its tracts. ``variables`` is handed unchanged to the tract-level
    query and every record it returns is yielded as-is.

    One progress line (the state's fips followed by the state) is printed
    each time a new state is started.
    """
    for state in us.states.STATES:

        # handy to print out state to monitor progress; passing one
        # pre-formatted string keeps the output identical whether print is the
        # Python 2 statement or the Python 3 function
        print("{0} {1}".format(state.fips, state))
        counties_in_state = {"for": "county:*", "in": "state:{fips}".format(fips=state.fips)}

        for county in c.sf1.get("NAME", geo=counties_in_state):

            # print county['state'], county['NAME']
            tracts_in_county = {
                "for": "tract:*",
                "in": "state:{s_fips} county:{c_fips}".format(s_fips=state.fips, c_fips=county["county"]),
            }

            for tract in c.sf1.get(variables, geo=tracts_in_county):
                yield tract


# <codecell>

# Sample run: islice caps the tracts() generator at its first 10 records,
# which are loaded into a DataFrame with P0010001 cast to integers.
r = list(islice(tracts("NAME,P0010001"), 10))
tracts_df = DataFrame(r)
tracts_df.P0010001 = tracts_df.P0010001.astype("int")