-
Notifications
You must be signed in to change notification settings - Fork 0
/
est_cost.py
100 lines (82 loc) · 2.56 KB
/
est_cost.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
from matplotlib import use; use('Agg')
import numpy as np
import pandas as pd
import matplotlib.pyplot as pl
from sklearn.mixture import GMM
from sklearn.neighbors import KernelDensity
def fit_gmm(costs, k=3, min_samps=50):
    """
    Fit and return a scikit GMM model.

    costs     : array-like of cost values.  A 1-D input is reshaped to a
                single-feature column, (n_samples, 1), before fitting.
    k         : number of mixture components handed to `GMM`.
    min_samps : minimum number of values `costs` must contain.

    Returns whatever `GMM(k).fit(costs)` returns.

    Raises ValueError (a subclass of the `Exception` raised previously)
    when `costs` holds fewer than `min_samps` values.
    """
    # Accept lists / Series as well as ndarrays; `.size` only exists on the
    # latter.
    costs = np.asarray(costs)
    if costs.size < min_samps:
        raise ValueError('Not enough samps')

    # scikit-learn estimators expect (n_samples, n_features).
    if costs.ndim == 1:
        costs = costs.reshape(-1, 1)

    gmm = GMM(k)
    return gmm.fit(costs)
def fit_kde(costs, frac_std):
    """
    Fit a KDE to the costs, use a gaussian kernel and a bandwidth that is the
    specified fraction of the std.

    costs    : array-like of cost values.  A 1-D input is reshaped to a
               single-feature column, (n_samples, 1), before fitting.
    frac_std : multiplier applied to `np.std(costs)` to get the bandwidth.

    Returns whatever `KernelDensity(...).fit(costs)` returns.
    """
    costs = np.asarray(costs, dtype=float)

    # KernelDensity.fit expects (n_samples, n_features); a flat vector of
    # costs is n_samples observations of one feature, not one observation of
    # n features.
    if costs.ndim == 1:
        costs = costs.reshape(-1, 1)

    bw = frac_std * np.std(costs)
    # Kernel named explicitly so the code matches the docstring's promise.
    kde = KernelDensity(kernel='gaussian', bandwidth=bw)
    return kde.fit(costs)
def select_conditionals(df, features):
    """
    Return the costs associated with the selected features.

    df       : DataFrame with a 'Total Charges' column plus one column for
               every key in `features`.
    features : dict mapping column name -> required value.

    Returns a 1-D array holding the 'Total Charges' of every row whose
    columns equal all of the requested values.  An empty `features` dict
    selects every row.
    """
    costs = np.array(df['Total Charges'])

    # Start from "every row matches" and narrow with each condition; an
    # all-False start can never become True under AND.
    mask = np.ones(df.shape[0], dtype=bool)

    for key, value in features.items():
        # Column is cast to the type of the requested value before the
        # comparison, as in the original code.
        column = np.array(df[key], type(value))
        mask &= (column == value)

    return costs[mask]
def plot_gmm(train, test, gmm):
    """
    Plot the train and test data and the empirical estimate.

    Unfinished: only the shared plotting range is computed, then
    NotImplementedError is raised.  Note that a later definition in this
    file reuses the name `plot_gmm`, so this version is replaced once the
    module has finished loading.
    """
    rng = [0, 0]
    rng[0] = 0.95 * np.minimum(train.min(), test.min())
    # Upper bound must cover both data sets (previously `train` was used
    # twice and `test` ignored).
    rng[1] = 1.05 * np.maximum(train.max(), test.max())

    # Explicit raise instead of `assert 0`: asserts vanish under `python -O`,
    # which would make this stub silently return None.
    raise NotImplementedError('plot_gmm is not implemented')
def plot_gmm(train, test, kde, nsamps, fs=5, filename='./plots/kde.png'):
    """
    Plot the train and test data and the empirical estimate.

    train, test : arrays of cost values; each gets its own histogram panel
                  (train on the left, test on the right).
    kde         : fitted estimator providing `sample` and `score_samples`.
                  Assumed to be fit on single-feature data -- TODO confirm
                  against the caller.
    nsamps      : number of points drawn from `kde` to trace the curve.
    fs          : panel height in inches; the figure is (2 * fs, fs).
    filename    : path the figure is saved to.

    NOTE: despite the name this takes a KDE, and it reuses the name of the
    three-argument `plot_gmm` defined earlier in the file.
    """
    rng = [0, 0]
    rng[0] = 0.95 * np.minimum(train.min(), test.min())
    # Upper bound must cover both data sets.
    rng[1] = 1.05 * np.maximum(train.max(), test.max())

    # Draw points from the KDE and sort them so the curve is traced left to
    # right rather than zig-zagging through random draws.
    samps = np.sort(np.ravel(kde.sample(nsamps)))
    # `score_samples` yields one log-density per point (`score` collapses
    # everything to a single total); exponentiate so the curve is on the
    # same scale as the normalised histograms.
    dens = np.exp(kde.score_samples(samps.reshape(-1, 1)))

    fig = pl.figure(figsize=(2 * fs, fs))
    for panel, data in ((121, train), (122, test)):
        pl.subplot(panel)
        # Roughly ten points per bin, capped at 100 bins, never fewer than 1.
        nbins = int(max(1, min(100, data.size // 10)))
        # NOTE(review): `normed` is kept for the matplotlib vintage this file
        # targets; newer releases spell it `density=True`.
        pl.hist(data, bins=nbins, normed=True, color='k', alpha=0.4)
        pl.plot(samps, dens, 'r', lw=2)
        pl.xlim(rng)

    fig.savefig(filename)
    # Release the figure so repeated calls do not accumulate open figures.
    pl.close(fig)
def get_rand_features():
    """
    Placeholder; not implemented yet.

    Raises NotImplementedError on every call.
    """
    # Explicit raise instead of `assert 0`: asserts vanish under `python -O`,
    # which would make this stub silently return None.
    raise NotImplementedError('get_rand_features is not implemented')
if __name__ == '__main__':
    # Fixed seed so the randomly chosen row is reproducible.
    seed = 234
    np.random.seed(seed)

    df = pd.read_csv('./data/sparcs_cleaned.csv')
    keys = list(df.columns.values)

    # Pick one row at random; `df.shape[0]` is already the row count, so it
    # must not be wrapped in len().
    ind = np.random.randint(df.shape[0])

    # Use that row's values as the feature set.  `.iloc` makes the lookup
    # positional regardless of the frame's index labels.
    features = {}
    for k in keys:
        features[k] = df[k].iloc[ind]

    # Drop the saved-index column if the CSV has one, and the target column.
    features.pop('Unnamed: 0', None)
    features.pop('Total Charges')