def slide_17():
    """Demonstrate indicator (dummy) variable construction with pandas.

    Prints, in order:
      * dummies for a small ``key`` column, with and without a prefix;
      * the first rows of the MovieLens movies table read from
        ``MOVIELENSPATH``;
      * the sorted list of distinct genres and the first movie row joined
        with one ``Genre_*`` indicator column per genre;
      * dummies for ten random values binned with ``pd.cut``.

    The function returns nothing; its only outputs are the printed frames.
    """
    df = DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                    'data1': range(6)})
    print(pd.get_dummies(df['key']))

    # Prefixing the indicator columns keeps them identifiable after a join.
    dummies = pd.get_dummies(df['key'], prefix='key')
    print(dummies)
    df_with_dummy = df[['data1']].join(dummies)
    print(df_with_dummy)

    mnames = ['movie_id', 'title', 'genres']
    # A multi-character separator requires the python parsing engine.
    movies = pd.read_table(MOVIELENSPATH,
                           sep='::',
                           header=None,
                           engine='python',
                           names=mnames)
    print(movies[:10])

    # Each movie lists its genres as a '|'-separated string; collect the
    # distinct genre names across all movies.
    genre_iter = (set(x.split('|')) for x in movies.genres)
    genres = sorted(set.union(*genre_iter))
    print(genres)
    dummies = DataFrame(np.zeros((len(movies), len(genres))), columns=genres)

    # `dummies` has a default integer index, so the enumerate counter is a
    # valid row label.  `.loc` replaces the `.ix` indexer, which was
    # deprecated in pandas 0.20 and removed in pandas 1.0.
    for i, gen in enumerate(movies.genres):
        dummies.loc[i, gen.split('|')] = 1

    movies_windic = movies.join(dummies.add_prefix('Genre_'))
    print(movies_windic.iloc[0])

    values = np.random.rand(10)
    print(values)
    bins = [0, 0.2, 0.4, 0.6, 0.8, 1]

    # Discretize first, then turn each bin into an indicator column.
    print(pd.get_dummies(pd.cut(values, bins)))
def dummy02():
    """Build per-genre indicator columns for the MovieLens movies table.

    Reads ``movies.dat`` from a hard-coded local path, prints its first ten
    rows, the sorted list of distinct genres, and the first movie row joined
    with one ``Genre_*`` indicator column per genre.  Returns nothing.
    """
    mnames = ['movies_id', 'title', 'genres']
    # Backslashes are escaped explicitly so the path no longer depends on
    # Python leaving unrecognised escape sequences untouched; the resulting
    # string is identical to the original literal.
    movies_path = (u'D:\\study\\书籍\\python\\pydata-book-master'
                   u'\\pydata-book-master\\ch02\\movielens\\movies.dat')
    # A multi-character separator requires the python parsing engine;
    # naming it explicitly avoids the fallback ParserWarning.
    movies = pd.read_table(movies_path, sep='::', header=None,
                           engine='python', names=mnames)
    print(movies[:10])

    # Each movie lists its genres as a '|'-separated string.
    genre_iter = (set(x.split('|')) for x in movies.genres)
    genres = sorted(set.union(*genre_iter))
    print(genres)

    dummies = DataFrame(np.zeros((len(movies), len(genres))), columns=genres)
    # `dummies` has a default integer index, so the enumerate counter is a
    # valid row label.  `.loc` replaces the `.ix` indexer, which was
    # deprecated in pandas 0.20 and removed in pandas 1.0.
    for i, gen in enumerate(movies.genres):
        dummies.loc[i, gen.split('|')] = 1

    movies_windic = movies.join(dummies.add_prefix('Genre_'))
    print(movies_windic.iloc[0])
# Build an all-zero indicator frame with one row per record in `data` and
# one column per distinct category code.
all_codes = get_code(all_cats)
code_index = pd.Index(np.unique(all_codes))
dummy_frame = DataFrame(np.zeros((len(data), len(code_index))),
                        index=data.index, columns=code_index)

# <codecell>

# Preview the first six indicator columns.  `.iloc` replaces the removed
# `.ix` indexer; this assumes the code columns are non-integer labels, in
# which case `.ix` sliced them by position — TODO confirm against get_code.
dummy_frame.iloc[:, :6]

# <codecell>

# Flag every code listed in each record's CATEGORY string.  `row` comes from
# data.index, so label-based `.loc` is the equivalent of the old `.ix` call.
for row, cat in zip(data.index, data.CATEGORY):
    codes = get_code(to_cat_list(cat))
    dummy_frame.loc[row, codes] = 1

data = data.join(dummy_frame.add_prefix('category_'))

# <codecell>

data.CATEGORY.isnull().value_counts()

# <codecell>

from mpl_toolkits.basemap import Basemap
import matplotlib.pyplot as plt

def basic_haiti_map(ax=None, lllat=17.25, urlat=20.25,
                    lllon=-75, urlon=-71):
    # create polar stereographic Basemap instance.
    m = Basemap(ax=ax, projection='stere',
                lon_0=(urlon + lllon) / 2,
Example #4
0
print(pd.get_dummies(df['key']))

# Add a prefix to the indicator columns so they are easy to merge back.
dummies = pd.get_dummies(df['key'], prefix='key')
df_with_dummy = df[['data1']].join(dummies)
print(df_with_dummy)

# A single row may belong to several categories at once.
mnames = ['movie_id', 'title', 'genres']
# A multi-character separator requires the python parsing engine; naming it
# explicitly avoids the fallback ParserWarning.
movies = pd.read_table('movies.dat', sep='::', header=None, engine='python',
                       names=mnames)
print(movies[:10])
# Split each '|'-separated genre string into a set of genre names.
genre_iter = (set(x.split('|')) for x in movies.genres)
# Extract the distinct genre names.
genres = sorted(set.union(*genre_iter))
# NOTE: the generator was consumed by set.union above, so this prints only
# the generator object's repr, not the genre sets.
print(genre_iter)
print(genres)

# Build an all-zero DataFrame with one column per genre.
dummies = DataFrame(np.zeros((len(movies), len(genres))), columns=genres)
# `dummies` has a default integer index, so the enumerate counter is a valid
# row label.  `.loc` replaces the `.ix` indexer, which was deprecated in
# pandas 0.20 and removed in pandas 1.0.
for i, gen in enumerate(movies.genres):
    dummies.loc[i, gen.split('|')] = 1
movies_windic = movies.join(dummies.add_prefix('Genre_'))
print(movies_windic.iloc[0])

# Combine get_dummies with a discretization function such as cut.
values = np.random.rand(10)
print(values)
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]
print(pd.get_dummies(pd.cut(values, bins)))
Example #5
0
# Read data from standard input on the command line
# NOTE(review): the "U" (universal newlines) mode flag was removed from
# open()/fdopen() in Python 3.11 — confirm the target interpreter before
# running this on a modern Python.
sys.stdin = os.fdopen(sys.stdin.fileno(), "rU")
data = pd.read_csv(sys.stdin)

# Restrict to data in Haiti with categories
data = data[
    (data.LATITUDE > 18)
    & (data.LATITUDE < 20)
    & (data.LONGITUDE > -75)
    & (data.LONGITUDE < -70)
    & data.CATEGORY.notnull()
]

# Extract categorizations
all_cats = get_all_categories(data.CATEGORY)

# Add indicator columns for categories: start from an all-zero frame with
# one row per record and one column per distinct category code.
all_codes = get_code(all_cats)
code_index = pd.Index(np.unique(all_codes))
dummy_frame = DataFrame(
    np.zeros((len(data), len(code_index))),
    index=data.index,
    columns=code_index,
)

# `row` comes from data.index, so label-based `.loc` is the equivalent of the
# `.ix` indexer used previously (deprecated in pandas 0.20, removed in 1.0).
for row, cat in zip(data.index, data.CATEGORY):
    codes = get_code(to_cat_list(cat))
    dummy_frame.loc[row, codes] = 1

data = data.join(dummy_frame.add_prefix("category_"))

# Write data to standard output
data.to_csv(sys.stdout)
def slide_14():
    """Explore the Haiti crisis-report CSV and map reports by category.

    Steps performed:
      1. Read ``HAICHICSVPATH`` and print the frame, a few columns, the
         first categories and ``describe()`` output.
      2. Keep only rows inside the latitude/longitude window used below
         whose CATEGORY is not null.
      3. Parse the comma-separated CATEGORY strings, build a code -> name
         mapping, and join one ``category_<code>`` indicator column per
         distinct code onto the data.
      4. Draw a 2x2 grid of Basemap maps, one per selected code, plotting
         the locations of the reports flagged with that code.

    Returns nothing; output is printed text and matplotlib figures.
    """
    data = pd.read_csv(HAICHICSVPATH)
    print(data)

    print(data[['INCIDENT DATE', 'LATITUDE', 'LONGITUDE']][:10])

    print('データのカテゴリ')
    print(data['CATEGORY'][:6])

    print('データの詳細')
    print(data.describe())
    print('外れたところのデータと欠損値を外す')
    data = data[(data.LATITUDE > 18) & (data.LATITUDE < 20) &
                (data.LONGITUDE > -75) & (data.LONGITUDE < -70)
                & (data.CATEGORY.notnull())]

    def to_cat_list(catstr):
        """Split a comma-separated category string into non-empty items."""
        stripped = (x.strip() for x in catstr.split(','))
        return [x for x in stripped if x]

    def get_all_categories(cat_series):
        """Return the sorted distinct categories found across the series."""
        cat_sets = (set(to_cat_list(x)) for x in cat_series)
        return sorted(set.union(*cat_sets))

    def get_english(cat):
        """Split 'code. name' into (code, name).

        When the name contains '|', the part after ' | ' is used.
        """
        code, names = cat.split('.')
        if '|' in names:
            names = names.split(' | ')[1]
        return code, names.strip()

    all_cats = get_all_categories(data.CATEGORY)
    english_mapping = dict(get_english(x) for x in all_cats)

    print(english_mapping['2a'])
    print(english_mapping['6c'])

    def get_code(seq):
        """Return the code (text before the first '.') of each item."""
        return [x.split('.')[0] for x in seq if x]

    # All-zero indicator frame: one row per record, one column per code.
    all_codes = get_code(all_cats)
    code_index = pd.Index(np.unique(all_codes))
    dummy_frame = DataFrame(np.zeros((len(data), len(code_index))),
                            index=data.index, columns=code_index)
    # The code columns are strings, so the old `.ix[:, :6]` sliced by
    # position; `.iloc` states that explicitly.  `.ix` was deprecated in
    # pandas 0.20 and removed in pandas 1.0.
    print(dummy_frame.iloc[:, :6])

    print(data.index)
    # `row` comes from data.index, so this is a label-based assignment.
    for row, cat in zip(data.index, data.CATEGORY):
        codes = get_code(to_cat_list(cat))
        dummy_frame.loc[row, codes] = 1
    data = data.join(dummy_frame.add_prefix('category_'))
    # Column labels are strings, so this slice is positional as well.
    print(data.iloc[:, 10:15])

    from mpl_toolkits.basemap import Basemap

    def basic_haiti_map(ax=None, lllat=17.25, urlat=20.25,
                        lllon=-75, urlon=-71):
        """Create a stereographic Basemap bounded by the given corners.

        Coastlines, states and countries are drawn before the map is
        returned.
        """
        # Dividing by 2.0 forces true division, so integer bounds are not
        # floor-divided under Python 2 when computing the map centre.
        m = Basemap(ax=ax,
                    projection='stere',
                    lon_0=(urlon + lllon) / 2.0,
                    lat_0=(urlat + lllat) / 2.0,
                    llcrnrlat=lllat,
                    urcrnrlat=urlat,
                    llcrnrlon=lllon,
                    urcrnrlon=urlon,
                    resolution='f')
        m.drawcoastlines()
        m.drawstates()
        m.drawcountries()
        return m

    fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 10))
    fig.subplots_adjust(hspace=0.05, wspace=0.05)
    to_plot = ['2a', '1', '3c', '7a']
    lllat = 17.25
    urlat = 20.25
    lllon = -75
    urlon = -71

    for code, ax in zip(to_plot, axes.flat):
        m = basic_haiti_map(ax,
                            lllat=lllat,
                            urlat=urlat,
                            lllon=lllon,
                            urlon=urlon)
        cat_data = data[data['category_%s' % code] == 1]

        # Project longitude/latitude into the map's plotting coordinates.
        x, y = m(cat_data.LONGITUDE.values, cat_data.LATITUDE.values)
        m.plot(x, y, 'k.', alpha=0.5)
        ax.set_title('%s: %s' % (code, english_mapping[code]))

    # `m` is the map created in the final loop iteration, so the road
    # shapefile is loaded for that last map only.
    m.readshapefile(SHAPEFILEPATH, 'roads')