Example #1
import math

import numpy as np
import pandas as pd
from pandas import DataFrame
from scipy import spatial


def gonzales(data, k):
    # Gonzalez farthest-point heuristic for k centers.
    # Turn the numpy array into a DataFrame, using the id column as the index.
    points_list = DataFrame(data[:, 1:], index=data[:, 0])
    # add two columns for each point's assigned center and its distance to it
    points_list["distance"] = np.nan
    points_list["center"] = np.nan
    distance_column_index = points_list.columns.get_loc("distance")

    # take the first point as the initial center
    # (a random point also works: points_list.sample(n=1))
    center0 = points_list.head(1)
    centers_list = DataFrame(center0.drop(["distance", "center"], axis=1))
    centers_list["color"] = "r"
    colors = "bgcmykw"

    # loop k times; the extra center appended on the last cycle is dropped below
    for k_cycle in range(1, k + 1):
        # track the point farthest from its assigned center: it becomes the next center
        max_distance = 0
        next_cluster = np.nan
        # assign every point to its closest center
        for indexp, p in points_list.iterrows():
            min_cluster_distance = math.inf
            closest_cluster = None
            for indexc, center in centers_list.iterrows():
                dis = spatial.distance.euclidean(center[[0, 1]].to_numpy(), p[[0, 1]].to_numpy())
                if dis < min_cluster_distance:
                    min_cluster_distance = dis
                    closest_cluster = indexc
            # iterrows() yields copies, so write back through the DataFrame;
            # assigning to p would be silently lost
            points_list.at[indexp, "distance"] = min_cluster_distance
            points_list.at[indexp, "center"] = closest_cluster
            if min_cluster_distance > max_distance:
                max_distance = min_cluster_distance
                next_cluster = indexp

        # DataFrame.append was removed in pandas 2.0; concat the new center's coordinate columns
        new_center = points_list.loc[[next_cluster]].iloc[:, :distance_column_index]
        centers_list = pd.concat([centers_list, new_center])
        centers_list.at[next_cluster, "color"] = colors[k_cycle % len(colors)]
    # k + 1 centers were collected (1 initial + k appended); drop the last one
    centers_list.drop(centers_list.tail(1).index, inplace=True)
    centers_list.drop(["color"], axis=1, inplace=True)
    return centers_list[[0, 1]].to_numpy()
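# Usage sketch with made-up demo data (an assumption, not from the source):
# column 0 of `data` holds an id, the remaining columns the 2-D coordinates.
rng = np.random.default_rng(0)
demo = np.column_stack([np.arange(20), rng.random((20, 2))])
print(gonzales(demo, k=3))  # k x 2 array of chosen center coordinates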
Example #2
import numpy as np
import pandas as pd
from pandas import DataFrame

# create a DataFrame
smp = {
    "state": ["Ohio", "Ohio", "Ohio", "Nevada", "Nevada"],
    "year": [2000, 2001, 2002, 2001, 2002],
    "pop": [1.5, 1.6, 1.7, 3.5, 4.3],
}
frame = DataFrame(smp)

# accessing DataFrame elements (R equivalents in the trailing comments)
frame.year  # frame$year
frame["year"]  # frame$year
frame.head()  # head
frame.tail()  # tail
frame2 = DataFrame(smp, index=["one", "two", "three", "four", "five"])  # add an index
frame2.loc["one"]  # .ix was removed from pandas; use .loc for label-based access
frame2.describe()  # summary
print(frame2.describe())

# reading data from files
data = pd.read_csv("stock_px.csv")
print(data)
xlsx_file = pd.ExcelFile("stock_px.xlsx")  # requires openpyxl; .xls files also work
xlsx_file.sheet_names
data = xlsx_file.parse("stock_px")
print(data)

# reading data from the web -> http://docs.scipy.org/doc/numpy/reference/generated/numpy.DataSource.html
ds = np.DataSource(None)
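# Sketch: DataSource.open works on local paths and URLs alike; with
# destpath=None the download is cached in a temporary directory
# (the URL below is a placeholder, not from the source).
with ds.open("http://example.com/stock_px.csv") as f:
    remote = pd.read_csv(f)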
Example #3
# `codes` is the list of FRED series identifiers and `data` the matching list
# of DataFrames, each with DATE and VALUE columns (assumed from context)
time_series = {}
for code, d in zip(codes, data):
    d.index = d.DATE
    time_series[code] = d.VALUE
merged_data = DataFrame(time_series)
# series of unequal length: the DataFrame constructor aligns them on the union
# of their indexes and fills the gaps with NaN
print(merged_data)
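# A minimal standalone illustration of that alignment (assumes pandas as pd):
# the shorter series gets NaN where its index lacks an entry.
demo = DataFrame({"a": pd.Series([1.0, 2.0], index=[0, 1]), "b": pd.Series([3.0], index=[1])})
print(demo)  # "b" is NaN at index 0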

term_premium = merged_data["GS10"] - merged_data["GS1"]
term_premium.name = "Term"
merged_data = merged_data.join(term_premium, how="outer")
default_premium = merged_data["BAA"] - merged_data["AAA"]
default_premium.name = "Default"
merged_data = merged_data.join(default_premium, how="outer")
merged_data = merged_data.drop(["AAA", "BAA", "GS10", "GS1"], axis=1)
print(merged_data.tail())

quarterly = merged_data.dropna()
print(quarterly.tail())

growth_rates_selector = ["GDPC1", "INDPRO", "CPILFESL"]
growth_rates = quarterly[growth_rates_selector].pct_change()
final = quarterly.drop(growth_rates_selector, axis=1).join(growth_rates)

new_names = {"GDPC1": "GDP_growth", "INDPRO": "IP_growth", "CPILFESL": "Inflation", "UNRATE": "Unemp_rate"}
final = final.rename(columns=new_names).dropna()
final.to_hdf("FRED_data.h5", "FRED", complevel=6, complib="zlib")
final.to_excel("FRED_data.xlsx")

ax = final[["GDP_growth", "IP_growth", "Unemp_rate"]].plot(subplots=True)
fig = ax[0].get_figure()
Example #4
# `myDict` is a plain Python dict assumed from the surrounding context
dictSeries = Series(myDict)
dictSeries

# Data Structure: creating a DataFrame from a dictionary
empDict = {'id': [1, 2, 3, 4], 'name': ['Mark', 'Ian', 'Sam', 'Rich'], 'isManager': [False, True, False, True]}
empDf = DataFrame(empDict)

# Access rows and columns
empDf.name
empDf.name[2]
empDf[empDf.isManager == False]
empDf.head()
empDf.tail()
empDf.iloc[2]

# Append a row: DataFrame.append was removed in pandas 2.0, so build a
# one-row frame and concat (assumes pandas is imported as pd)
newEmp = Series({'id': 5, 'isManager': False, 'name': 'Derek', 'deptId': 2})
empDf = pd.concat([empDf, DataFrame([newEmp])], ignore_index=True)
empDf

#Deleting a column
empDf['dummy']=1
empDf
del empDf['dummy']
empDf

# Deleting a row (e.g. the last one)
empDf.drop(empDf.index[-1])

site_df.tail(2)


# In[21]:

t1["special_site"] = 0
t2["special_site"] = 0
test["special_site"] = 0
t1["special_site"][t1["site_category"] == "dedf689d"] = 1
t2["special_site"][t2["site_category"] == "dedf689d"] = 1
test["special_site"][test["site_category"] == "dedf689d"] = 1


# In[22]:

print sum(t1["special_site"]), sum(t2["special_site"]), sum(test["special_site"])


# In[23]:

feature_cols = ["special_site"]


# In[24]:

validate(feature_cols)


# In[25]:

sorted_plot("C1", train, 1)


# In[32]:

c1 = train["C1"].unique()
avg_c1 = np.array([])
for i in c1:
    avg_c1 = np.append(avg_c1, np.mean(train["click"][train["C1"] == i]))
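# The same per-category click average in one groupby call (equivalent sketch;
# note groupby sorts by category value, unlike the unique() ordering above):
avg_c1_alt = train.groupby("C1")["click"].mean()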


# In[33]:

df_c1 = DataFrame({"c1": c1, "avg_click": avg_c1})


# In[34]:

plt.plot(df_c1["avg_click"])


# In[35]:

df_c1 = df_c1.sort_values(by="avg_click")  # DataFrame.sort was removed from pandas; use sort_values
plt.plot(range(len(df_c1)), df_c1["avg_click"], "bo", range(len(df_c1)), df_c1["avg_click"], "k")


# In[36]:

df_c1


# In[37]:

t1["C1"][0]


# In[40]:

l1 = [1001, 1007]
l2 = [1010, 1008]
l3 = [1005, 1012]
t1["c1_1"] = 0
t2["c1_1"] = 0
test["c1_1"] = 0
t1["c1_2"] = 0
t2["c1_2"] = 0
test["c1_2"] = 0
t1["c1_3"] = 0
t2["c1_3"] = 0
test["c1_3"] = 0
for k in l1:
    t1["c1_1"][t1["C1"] == k] = 1
    t2["c1_1"][t2["C1"] == k] = 1
    test["c1_1"][test["C1"] == k] = 1
for k in l2:
    t1["c1_2"][t1["C1"] == k] = 1
    t2["c1_2"][t2["C1"] == k] = 1
    test["c1_2"][test["C1"] == k] = 1
for k in l3:
    t1["c1_3"][t1["C1"] == k] = 1
    t2["c1_3"][t2["C1"] == k] = 1
    test["c1_3"][test["C1"] == k] = 1


# In[41]:

l = ["c1_1", "c1_2", "c1_3"]


# In[42]:

validation_check(feature_cols, l)


# In[49]:

print sum(test["c1_1"]), sum(test["c1_2"]), sum(test["c1_3"])


# In[45]:

feature_cols


# In[50]:

# Now let's try encoding the C1 variable with 6 dummy variables
# (a pandas get_dummies sketch follows the loop below)


# In[52]:

c1 = t1["C1"].unique()
for i in range(1, 7, 1):
    pair = "c1", format(i)
    str1 = "".join(pair)
    t1[str1] = 0
    t2[str1] = 0
    test[str1] = 0
    t1[str1][t1["C1"] == c1[i - 1]] = 1
    t2[str1][t2["C1"] == c1[i - 1]] = 1
    test[str1][test["C1"] == c1[i - 1]] = 1


# In[54]:

print sum(test["c11"]), sum(test["c12"]), sum(test["c13"]), sum(test["c14"]), sum(test["c15"]), sum(test["c16"])


# In[55]:

test["C1"].value_counts()


# In[56]:

train["C1"].value_counts()


# In[57]:

# 1008 won't make a difference for the test data


# In[58]:

c1


# In[61]:

c1 = train["C1"].value_counts()
c1 = c1.index
c1


# In[62]:

for i in range(1, 7, 1):
    str1 = "c1" + str(i)
    t1[str1] = 0
    t2[str1] = 0
    test[str1] = 0
    t1.loc[t1["C1"] == c1[i - 1], str1] = 1
    t2.loc[t2["C1"] == c1[i - 1], str1] = 1
    test.loc[test["C1"] == c1[i - 1], str1] = 1


# In[63]:

print sum(test["c11"]), sum(test["c12"]), sum(test["c13"]), sum(test["c14"]), sum(test["c15"]), sum(test["c16"])


# In[64]:

l_c1 = ["c11", "c12", "c13", "c14", "c15", "c16"]


# In[65]:

validation_check(feature_cols, l_c1)


# In[66]:

feature_cols


# In[68]:

validate(l_c1)


### LEFT C1

# In[69]:

t1.columns


# In[71]:

train["banner_pos"].value_counts()


# In[72]:

test["banner_pos"].value_counts()


###### There is no banner position called 5 in the test data

# In[115]:

sorted_plot("banner_pos", train, 1)


# In[107]:

t1["banner_high"] = 0
t2["banner_high"] = 0
test["banner_high"] = 0
t1["banner_high"][t1["banner_pos"] == 3] = 1
t2["banner_high"][t2["banner_pos"] == 3] = 1
test["banner_high"][test["banner_pos"] == 3] = 1


# In[108]:

l_banner_high = ["banner_high"]


# In[109]:

validate(l_banner_high)


# In[110]:

validation_check(feature_cols, l_banner_high)


# In[111]:

# Making banner bins


# In[112]:

test["banner_pos"].value_counts()


# In[114]:

train["banner_pos"].value_counts()


# In[116]:

l_banner_low = ["banner_low"]


# In[118]:

t1["banner_low"] = 0
t2["banner_low"] = 0
test["banner_low"] = 0
t1["banner_low"][(t1["banner_pos"] == 1) | (t1["banner_pos"] == 0)] = 1
t2["banner_low"][(t2["banner_pos"] == 1) | (t2["banner_pos"] == 0)] = 1
test["banner_low"][(test["banner_pos"] == 1) | (test["banner_pos"] == 0)] = 1


# In[123]:

validation_check(feature_cols, (l_banner_low + l_banner_high))


# In[124]:

t1.columns


# In[125]:

sorted_plot("h", t1, 1)


# In[126]:

# all are between 1 and 2, not much difference; the hour could be encoded as a
# single linear (ordinal) variable by rank-transforming its values


# In[127]:

h_ascend = [19, 6, 9, 1, 14, 12, 15, 13, 11, 17, 10, 7, 4, 0, 8, 2, 21, 3, 5, 16, 18, 20, 23, 22]


# In[128]:

len(h_ascend)


# In[ ]:


# In[154]:

h_df = DataFrame({"rank": range(1, 25, 1), "h": h_ascend})
A_df = h_dict[h_dict["h"] == 14]
A_df.iloc[0]["rank"]


# In[155]:

h_df = DataFrame({"rank": range(1, 25, 1), "h": h_ascend})
t1["h_rank"] = 0
t2["h_rank"] = 0
test["h_rank"] = 0
for i in range(len(t1)):
    A_df = h_df[h_df["h"] == t1["h"][i]]
    t1.loc[i, "h_rank"] = A_df.iloc[0]["rank"]


# In[156]:

h_df = DataFrame({"rank": range(1, 25, 1), "h": h_ascend})
for i in range(len(t2)):
    A_df = h_df[h_df["h"] == t2["h"][i]]
    t2.loc[i, "h_rank"] = A_df.iloc[0]["rank"]


# In[157]:

for i in range(len(test)):
    A_df = h_df[h_df["h"] == test["h"][i]]
    test.loc[i, "h_rank"] = A_df.iloc[0]["rank"]


# In[162]:

validate(["h_rank"] + feature_cols)


# In[160]:

print(feature_cols)


# In[163]:

validation_check(feature_cols, ["h_rank"])


# In[164]:

t2["click"].value_counts()


# In[176]:


def validate2(f):
    # fit on t1, score on t2: summed absolute error between the click label
    # and the predicted click probability
    model = LogisticRegression()
    model = model.fit(t1[f], t1["click"])
    probs = model.predict_proba(t2[f])
    predictY = DataFrame(probs)
    p = predictY[1]  # probability of the positive class
    s = 0
    for i in range(len(t2)):
        s = s + abs((t2["click"][i]) - p[i])
    print(s)
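# The loop in validate2 is an unnormalized mean absolute error; a one-call
# sketch of the same metric using scikit-learn:
from sklearn.metrics import mean_absolute_error


def validate2_mae(f):
    model = LogisticRegression().fit(t1[f], t1["click"])
    p = model.predict_proba(t2[f])[:, 1]
    print(mean_absolute_error(t2["click"], p) * len(t2))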


# In[177]:

validate2(feature_cols)


# In[182]:

validate2(["banner_low"])


# In[183]:

validate2(["banner_high"])


# In[184]:

validate2(l_c1)


# In[185]:

c1


###### l_c1 seems to be a better estimator than feature_cols

# In[186]:

print(feature_cols)


# In[187]:

validate2(l_c1 + feature_cols)


# In[190]:

feature_cols = ["special_site"] + l_c1


# In[191]:

print(feature_cols)


# In[192]:

t1.columns


# In[201]:


def validation_check2(f, l):
    validate2(f)
    print("-----")
    validate2(l)
    print("-----")
    validate2(l + f)


# In[194]:

print(feature_cols)


# In[195]:

sorted_plot("device_conn_type", train, 1)


# In[196]:

conn = train["device_conn_type"].unique()
conn


# In[198]:

test["device_conn_type"].value_counts()


# In[199]:

for i in range(1, 4, 1):
    str1 = "conn_type" + str(i)
    t1[str1] = 0
    t2[str1] = 0
    test[str1] = 0
    t1.loc[t1["device_conn_type"] == conn[i - 1], str1] = 1
    t2.loc[t2["device_conn_type"] == conn[i - 1], str1] = 1
    test.loc[test["device_conn_type"] == conn[i - 1], str1] = 1


# In[202]:

validation_check2(feature_cols, ["conn_type1", "conn_type2", "conn_type3"])


# In[203]:

t1.columns


# In[206]:

test["C18"].value_counts()


# In[207]:

sorted_plot("C18", train, 1)


# In[208]:

c18 = train["C18"].unique()
for i in range(1, 4, 1):
    str1 = "c18_" + str(i)
    t1[str1] = 0
    t2[str1] = 0
    test[str1] = 0
    t1.loc[t1["C18"] == c18[i - 1], str1] = 1
    t2.loc[t2["C18"] == c18[i - 1], str1] = 1
    test.loc[test["C18"] == c18[i - 1], str1] = 1


# In[209]:

feature_cols = feature_cols + ["conn_type1", "conn_type2", "conn_type3"]


# In[210]:

validation_check2(feature_cols, ["c18_1", "c18_2", "c18_3"])


# In[211]:

feature_cols = feature_cols + ["c18_1", "c18_2", "c18_3"]


# In[212]:

print(feature_cols)


# In[213]:

t1.columns


# In[218]:

sorted_plot("weekday", t1, 1)


# In[226]:

test["C15"].value_counts()


# In[228]:

train["C15"].value_counts()


# In[230]:

sorted_plot("C15", train, 1)


# In[231]:

c15 = train["C15"].unique()
for i in range(1, 8, 1):
    pair = "c15_", format(i)
    str1 = "".join(pair)
    t1[str1] = 0
    t2[str1] = 0
    test[str1] = 0
    t1[str1][t1["C15"] == c15[i - 1]] = 1
    t2[str1][t2["C15"] == c15[i - 1]] = 1
    test[str1][test["C15"] == c15[i - 1]] = 1


# In[233]:

l15 = ["c15_" + str(i) for i in range(1, 8)]
l15


# In[234]:

validation_check2(feature_cols, l15)


# In[235]:

feature_cols = feature_cols + l15


# In[236]:

print(feature_cols)


# In[237]:

test["C16"].value_counts()


# In[240]:

train["C16"].value_counts()


# In[241]:

c16 = train["C16"].unique()
for i in range(1, 9, 1):
    pair = "c16_", format(i)
    str1 = "".join(pair)
    t1[str1] = 0
    t2[str1] = 0
    test[str1] = 0
    t1[str1][t1["C16"] == c16[i - 1]] = 1
    t2[str1][t2["C16"] == c16[i - 1]] = 1
    test[str1][test["C16"] == c16[i - 1]] = 1


# In[242]:

l = ["c16_" + str(i) for i in range(1, 9)]


# In[243]:

validation_check2(feature_cols, l)


# In[244]:

feature_cols = feature_cols + l


# In[245]:

print(feature_cols)


# In[251]:

c21 = train["C21"].value_counts()
c21 = c21[c21 > 30]
c21 = c21.index


# In[252]:

len(c21)


# In[253]:

avg_c21 = np.array([])
for i in range(len(c21)):
    avg_c21 = np.append(avg_c21, np.mean(train["click"][train["C21"] == c21[i]]))


# In[254]:

plt.plot(avg_c21)


# In[255]:

df_c21 = DataFrame({"c21": c21, "Avg": avg_c21})


# In[256]:

df_c21 = df_c21.sort_values(by="Avg")


# In[257]:

df_c21.head(2)


# In[258]:

plt.plot(df_c21["Avg"])


# In[259]:

print df_c21["Avg"].max(), df_c21["Avg"].min()


# In[261]:

df_c21.tail(2)
Example #6
df.to_csv("births1880.txt", index=False, header=False)

Location = r"births1880.txt"

df = read_csv(Location)

print df

print df.head()

df = read_csv(Location, header=None)

print df

print df.tail()

df = read_csv(Location, names=["Names", "Births"])

print df.head()

import os

os.remove(Location)

print df["Names"].unique()

for x in df["Names"].unique():
    print x

print df["Names"].describe()
def GetFollowsByCode_InFiles(filelist, code="SH600036"):
    global codemarket
    code = CodeName_process(code)
    print("code:", code)
    name, follows_list = GetFollows_InFiles(filelist, code)
    # the original was Python 2; `name` is assumed to be GBK-encoded bytes
    name = name.decode("gbk")
    print(name)
    csvfilename = get_stock_history_csv(code, name)
    print(csvfilename)
    if csvfilename == "":
        print("csv file not found. exit.")
        return
    follows_chg_list = GetFollows_ProcessList(follows_list, csvfilename)
    xdata = list(zip(*follows_chg_list))[0]  # first column: the dates, used as the index
    df = DataFrame(follows_chg_list, index=xdata, columns=["DATE", "CHG", "CHG_PCT", "PRICE", "VOLUME"])
    print(df.tail(20))
    CHG_mean = df.CHG.mean()
    print("CHG_mean", CHG_mean)

    fig = plt.figure(figsize=(16, 8.5))
    ax0 = fig.add_axes((0.1, 0.2, 0.8, 0.7))  # (left, bottom, width, height)

    # bar chart of the daily change on the left axis, price line on the right
    ax_left = df.CHG.plot(ax=ax0, kind="bar", alpha=0.5, align="center", linewidth=2)
    ax0.plot([CHG_mean for x in range(len(df))], "g--", linewidth=2)
    ax_left.set_ylabel("f")
    ax_right = df.PRICE.plot(ax=ax0, secondary_y=True, color="red", marker="v", linewidth=2, alpha=0.7)
    ax_right.set_ylabel("price")

    if codemarket == 0:
        # assumes stockinfo_cn.csv is GBK-encoded (the Python 2 original decoded it by hand)
        with open("stockinfo_cn.csv", encoding="gbk", newline="") as infofile:
            value_str = GetStockInfo_fromFile(csv.reader(infofile), code)
        plt.title(name + code + " v" + value_str)
    else:
        plt.title(name + code)
    plt.xlabel("Date")
    xticks, xticklabels = GetXticksList(xdata)  # renamed from `list`, which shadowed the builtin
    ax_left.set_xticks(xticks)
    ax_left.set_xticklabels([])  # the labels go on the volume panel below

    ax1 = fig.add_axes((0.1, 0.05, 0.8, 0.15), sharex=ax0)

    ax_volume = df.VOLUME.plot(ax=ax1, kind="bar", color="green", linewidth=1, alpha=0.7)
    ax_volume.set_xticklabels(xticklabels, fontsize="small")
    ax_volume.set_xticks(xticks)
    ax_volume.set_ylabel("volume")
    ax1.plot([df.VOLUME.mean() for x in range(len(df))], "g--", linewidth=2)

    if not savepng:
        plt.show()
    else:
        fig.savefig(save_fname)
from itertools import islice

# islice(..., None) keeps the whole list; pass an integer to truncate while experimenting
names_to_calc = list(islice(list(ambi_names_pt.T.index.levels[0]), None))

# per-year proportion of male babies for each ambiguous name
m = [(name_, ambi_names_pt[name_]['M'] / (ambi_names_pt[name_]['F'] + ambi_names_pt[name_]['M']))
     for name_ in names_to_calc]
p_m_instant = DataFrame(dict(m))
p_m_instant.tail()

# <codecell>

# similar calculation, except instead of looking at the proportions for a
# given year only, we look at the cumulative number of male/female babies for each name

m = [(name_, ambi_names_pt[name_]['M'].cumsum() /
      (ambi_names_pt[name_]['F'].cumsum() + ambi_names_pt[name_]['M'].cumsum()))
     for name_ in names_to_calc]
p_m_cum = DataFrame(dict(m))
p_m_cum.tail()

# <codecell>

p_m_cum['Donnie'].plot()

# <codecell>

# z-score normalization for petal length; the standard deviation 1.76 is
# quoted from the data description, and `m` (the mean) was left undefined in
# the original: the value below is an assumption based on the same description
m = 3.76


def zScoreNorm(num):
    return (num - m) / 1.76
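# A vectorized equivalent computed from the data itself rather than the quoted
# constants (sketch):
petal = iris_data["Petal Length"]
norm_alt = (petal - petal.mean()) / petal.std()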


iris_data_c = iris_data  # note: an alias, not a copy; use iris_data.copy() to keep the original
iris_data_c["Petal Length"] = iris_data_c["Petal Length"].apply(zScoreNorm)
norm_zscore_data = iris_data_c["Petal Length"]
print "norm_data"
print norm_zscore_data

print(iris_data)
print(iris_target)

iris_target["Species"] = iris_target["Species"].apply(flower_type)
print(iris_target.head())
print(iris_target.tail())


iris = pd.concat([iris_data, iris_target], axis=1)
print(iris)


# seaborn renamed `size` to `height` in 0.9, and sns.plt is no longer exposed;
# use matplotlib.pyplot directly (assumed imported as plt)
sns.pairplot(iris, hue="Species", height=2)
plt.show()


# factorplot was renamed to catplot in seaborn 0.9
sns.catplot(x="Petal Length", data=iris, hue="Species", height=8, kind="count")
plt.show()
Example #10
# In[195]:

# KM holds one (centroids, distortion) tuple per k, assuming kmeans is
# scipy.cluster.vq.kmeans and K is the range of cluster counts tried
KM = [kmeans(X, k) for k in K]
print(type(KM), len(KM))


# In[196]:

KM_df = DataFrame(KM)
print(KM_df.head(1))


# In[197]:

print(KM_df.tail(1))


# In[198]:

KM_df.shape


# In[199]:

KM_v1 = KM_df[0]
print(type(KM_v1))


# In[200]: