Example #1
0
def __diff(data: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of ``data`` with first-difference columns added.

    Adds ``dmasl``, ``dlat`` and ``dlon`` (the row-to-row differences of the
    ``masl``, ``lat`` and ``lon`` columns) and drops the first row, whose
    difference is undefined (NaN).

    FIX: work on a copy so the caller's DataFrame is no longer mutated as a
    side effect; the returned value is unchanged.
    """
    data = data.copy()
    data['dmasl'] = data['masl'].diff()
    data['dlat'] = data['lat'].diff()
    data['dlon'] = data['lon'].diff()
    # tail(-1) drops the first row (the one with NaN differences)
    return data.tail(-1)
Example #2
0
def gonzales(data, k):
    """
    Gonzalez greedy k-center clustering.

    Starting from the first point as the seed center, repeatedly picks the
    point farthest from its nearest existing center as the next center.

    Parameters
    ----------
    data : 2-D numpy array; column 0 is the point id, the remaining columns
        are coordinates (only the first two coordinate columns are used for
        the Euclidean distance).
    k : number of centers to return.

    Returns
    -------
    numpy array of shape (k, 2) with the coordinates of the chosen centers.

    FIX: the original used pandas APIs removed in modern versions
    (``as_matrix``, ``.ix``, ``set_value``, ``DataFrame.append``); they are
    replaced by ``to_numpy``, ``.loc``, ``.at`` and ``pd.concat`` with
    identical behaviour.
    """
    import pandas as pd  # local import: needed for pd.concat below

    # Points indexed by id; extra columns track per-point cluster bookkeeping.
    points_list = DataFrame(data[:, 1:], index=data[:, 0])
    points_list["distance"] = np.nan
    points_list["center"] = np.nan
    distance_column_index = points_list.columns.get_loc("distance")

    # Deterministically seed with the first point as center 0
    # (the original random sampling was already commented out).
    center0 = points_list.head(1)
    centers_list = DataFrame(center0.drop(['distance', 'center'], axis=1))
    centers_list['color'] = 'r'
    colors = "bgcmykw"

    for k_cycle in range(1, k + 1):
        # Track the point farthest from its closest center; it becomes the
        # next center.
        max_distance = 0
        next_cluster = np.nan
        for indexp, p in points_list.iterrows():
            min_cluster_distance = math.inf
            closest_cluster = None
            for indexc, center in centers_list.iterrows():
                dis = spatial.distance.euclidean(
                    center.loc[[0, 1]].to_numpy(dtype=float),
                    p.loc[[0, 1]].to_numpy(dtype=float))
                if dis < min_cluster_distance:
                    min_cluster_distance = dis
                    closest_cluster = indexc
            # NOTE: iterrows yields copies, so these writes never reached
            # points_list in the original either; kept for parity.
            p["distance"] = min_cluster_distance
            p["center"] = closest_cluster
            if min_cluster_distance > max_distance:
                max_distance = min_cluster_distance
                next_cluster = indexp

        # Append the chosen point (coordinate columns only) as a new center.
        new_center = points_list.loc[[next_cluster],
                                     points_list.columns[:distance_column_index]]
        centers_list = pd.concat([centers_list, new_center])
        centers_list.at[next_cluster, 'color'] = colors[k_cycle]

    # The loop ran k times on top of the seed center, so drop the last pick
    # to end up with exactly k centers (original behaviour).
    centers_list.drop(centers_list.tail(1).index, inplace=True)
    centers_list.drop(['color'], axis=1, inplace=True)
    return centers_list[[0, 1]].to_numpy()
Example #3
0
# Re-index each downloaded series by its DATE column and collect the VALUE
# series per series code (``codes`` and ``data`` are defined elsewhere).
time_series = {}
for code, d in zip(codes,data):
    d.index = d.DATE
    time_series[code] = d.VALUE
merged_data = DataFrame(time_series)
# Unequal length series
print(merged_data)

# Term premium: 10-year minus 1-year Treasury yield.
term_premium = merged_data['GS10'] - merged_data['GS1']
term_premium.name = 'Term'
merged_data = merged_data.join(term_premium,how='outer')
# Default premium: BAA minus AAA corporate bond yields.
default_premium = merged_data['BAA'] - merged_data['AAA']
default_premium.name = 'Default'
merged_data = merged_data.join(default_premium,how='outer')
# The raw yield series are no longer needed once the spreads are computed.
merged_data = merged_data.drop(['AAA','BAA','GS10','GS1'],axis=1)
print(merged_data.tail())

# Keep only dates where every series is observed (quarterly frequency).
quarterly = merged_data.dropna()
print(quarterly.tail())

# Convert the level series to period-over-period growth rates.
growth_rates_selector = ['GDPC1','INDPRO','CPILFESL']
growth_rates = quarterly[growth_rates_selector].pct_change()
final = quarterly.drop(growth_rates_selector, axis=1).join(growth_rates)

# Rename to human-readable column names; drop the NaN row pct_change creates.
new_names = {'GDPC1':'GDP_growth','INDPRO':'IP_growth','CPILFESL':'Inflation','UNRATE':'Unemp_rate'}
final = final.rename(columns = new_names ).dropna()
# Persist to HDF5 (requires PyTables) and Excel.
final.to_hdf('FRED_data.h5','FRED',complevel=6,complib='zlib')
final.to_excel('FRED_data.xlsx')

ax = final[['GDP_growth','IP_growth','Unemp_rate']].plot(subplots=True)
fig = ax[0].get_figure()
from pandas import Series, DataFrame

__author__ = 'wangjj'
__mtime__ = '20161010下午 11:04'

# FIX: converted Python-2 print statements to Python-3 print() calls,
# matching the py3-style snippets elsewhere in this file.

# Build a Series from a dict: keys become the index, values the data.
data = {'Scala': 2003, 'Java': 1995, 'Python': 1991, 'Go': 2009}
ser = Series(data)
print(ser)
# Membership tests on a Series look at the index, not the values.
print('C' in ser)
print('Go' in ser)
print(ser.values.mean())
print('----')
# Build a DataFrame from a dict of equal-length lists: keys become columns.
datas = {
    'name': [
        'Wangdachui',
        'Linling',
        'Niuyun'],
    'pay': [
        4000,
        5000,
        6000]}
dataFra = DataFrame(datas)
print(DataFrame(datas))
print('----')
# Column access: subscript and attribute forms are equivalent here.
print(dataFra['name'])
print('----')
print(dataFra.pay)
print('----')
print(dataFra.head(2))
print('----')
print(dataFra.tail(2))
Example #5
0
# Row/column selection cheat sheet for a DataFrame ``df`` defined elsewhere;
# bare expressions — results are displayed interactively, not stored.
# Note: .iloc is strictly integer position based
df.iloc[0:3]

# We can also select using the column name
df['Rev']
# A list of column names returns a sub-DataFrame.
df[['Rev','test']]

# df['ColumnName'][inclusive:exclusive]
df['Rev'][0:3]

df['col'][5:]

df[['col', 'test']][:3]

'''
There is also a handy function to select the top and bottom records of a dataframe
'''
# Select top N number of records (default = 5)
df.head()

# Select bottom N number of records (default = 5)
df.tail()








# Describe the iris dataset (``iris``, ``X`` and ``Y`` are defined elsewhere;
# presumably the sklearn iris bunch and its data/target arrays — TODO confirm).
print(iris.DESCR)

iris_data = DataFrame(X, columns=['Sepal Length', 'Sepal Width', 'Petal Length', 'Petal Width'])
iris_target = DataFrame(Y, columns=['Species'])

def flower(num):
    """Map a numeric iris class label to its species name.

    0 -> 'Setosa', 1 -> 'Vericolour', anything else -> 'Virginica'.
    """
    names = {0: 'Setosa', 1: 'Vericolour'}
    return names.get(num, 'Virginica')

# Replace the numeric class codes with species names.
iris_target['Species'] = iris_target['Species'].apply(flower)
print(iris_target.tail())

# Combine features and target into one frame for plotting.
iris = pd.concat([iris_data, iris_target], axis=1)

sns.pairplot(iris, hue='Species', size=2)
plt.show()

sns.factorplot('Petal Length', data=iris, hue='Species', size=10)
plt.show()

from sklearn.linear_model import LogisticRegression
# FIX: sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split

logreg = LogisticRegression()

# Hold out 40% of the data for testing, with a fixed seed for repeatability.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.4, random_state=3)
Example #7
0
# Series from a dictionary (``myDict`` is defined elsewhere in the file);
# the bare expression on the next line just displays it interactively.
dictSeries=Series(myDict)
dictSeries

#Creating a data frame from dictionary
import pandas as pd  # needed for pd.concat below

empDict = {'id': [1, 2, 3, 4],
           'name': ['Mark', 'Ian', 'Sam', 'Rich'],
           'isManager': [False, True, False, True]}

## Data Structure : Data Frame from a dictionary
# FIX: the original rebound empDict to {'id': [1, 2, 3, 4]} at this point,
# which made every later name/isManager access fail; build from the full dict.
empDf = DataFrame(empDict)

#Access rows and columns
empDf.name
empDf.name[2]
empDf[empDf.isManager == False]
empDf.head()
empDf.tail()
empDf.iloc[2,]

#Append a new row (the original comment said "column", but this adds a row)
# FIX: the original call had unbalanced parentheses (ignore_index was passed
# to Series and append was never closed) and discarded the result;
# DataFrame.append is also removed in pandas 2.x, so use pd.concat.
new_row = Series([5, False, 'Derek', 2],
                 index=['id', 'isManager', 'name', 'deptId'])
empDf = pd.concat([empDf, new_row.to_frame().T], ignore_index=True)
empDf

#Deleting a column
empDf['dummy'] = 1
empDf
del empDf['dummy']
empDf

#Deleting a row
Example #8
0
# Inspect a previously loaded response dict (``ddd`` is defined elsewhere);
# bare expressions — results are displayed interactively.
ddd["retMsg"]
data = ddd["data"]
len(data)
data[0]
data[300]
######################################################
# Load an order-book snapshot from a one-line JSON file.
orderbook_json_file = "test1.json"
fp = open(orderbook_json_file, 'r')
json_string = fp.readlines()[0]
fp.close()
# NOTE(review): ``str.decode`` is Python-2 only; under Python 3 this line
# raises AttributeError — confirm which interpreter this snippet targets.
orderbook_json_dict = json.loads(json_string.decode('unicode_escape'))
data_list = orderbook_json_dict["data"]
# Skip the first 300 entries before building the frame.
data_list = data_list[300:]
dataSet = DataFrame(data_list)
# dataSet.head(10)
dataSet.tail(10)
######################################################
######################################################
######################################################
######################################################
######################################################
######################################################
######################################################
######################################################
######################################################
######################################################
######################################################
######################################################
######################################################
######################################################
######################################################
# Quick pandas/numpy API reminders; bare expressions whose results are
# discarded.  NOTE(review): several calls below (drop(), np.argsort(),
# fillna()) raise without arguments — these read as cheat-sheet placeholders,
# not runnable code.
obj = Series([1,2,3])

obj.reindex()

data = DataFrame([[1,2,3],[4,5,6]])
data.drop()

np.argsort()

obj.rank()

obj.sort_values()


data.tail()

data.cov()

data.cov()

data.corr()

data.dropna()

data.loc


data.fillna()

data.unstack()
def zScoreNorm(num, center=None, scale=1.76):
    """
    Z-score-style normalization: (num - center) / scale.

    Generalized from the original, which hard-coded the module-level mean
    ``m`` and the divisor 1.76; both are now parameters with those values as
    defaults, so existing one-argument calls behave identically.
    """
    if center is None:
        center = m  # module-level mean, defined elsewhere in the file
    return (num - center) / scale

# FIX: converted Python-2 print statements to Python-3 print() calls.
# NOTE(review): iris_data_c merely aliases iris_data, so the apply below
# mutates the original frame as well — confirm that is intended.
iris_data_c = iris_data
iris_data_c['Petal Length'] = iris_data_c['Petal Length'].apply(zScoreNorm)
norm_zscore_data = iris_data_c['Petal Length']
print("norm_data")
print(norm_zscore_data)

print(iris_data)
print(iris_target)

# Map numeric species codes to names (flower_type is defined elsewhere).
iris_target['Species'] = iris_target['Species'].apply(flower_type)
print(iris_target.head())
print(iris_target.tail())


iris = pd.concat([iris_data, iris_target], axis=1)
print(iris)


# NOTE(review): sns.plt was removed from newer seaborn versions; these calls
# may need plain plt.show() — confirm the seaborn version in use.
sns.pairplot(iris, hue='Species', size=2)
sns.plt.show()


sns.factorplot('Petal Length', data=iris, hue='Species', size=8, kind='count')
sns.plt.show()


def GetFollowsByCode_InFiles(filelist, code = 'SH600036'):
    """
    Plot follower-count changes, price and volume history for one stock.

    Looks up the follower series for ``code`` in ``filelist``, merges it with
    the stock's price-history CSV, and draws a two-pane matplotlib figure
    (CHG bars plus price line on top, volume bars below), either shown on
    screen or saved to ``save_fname`` depending on the module-level
    ``savepng`` flag.

    NOTE(review): Python-2 style code (``print`` statements, subscripting the
    result of ``zip``); it also relies on module-level helpers and globals
    (CodeName_process, GetFollows_InFiles, get_stock_history_csv,
    GetFollows_ProcessList, GetXticksList, get_stockinfo_volume, savepng,
    save_fname) defined elsewhere in the file.
    """
    global codemarket
    global titleprefix
    # print filelist
    # Normalize the user-supplied code into canonical form.
    code = CodeName_process(code)
    print 'code:', code
    name, follows_list = GetFollows_InFiles(filelist, code)   
    #print follows_list[-5:-1]
    print name #.decode('gbk')
    pricehistory = get_stock_history_csv(code, name)
    if pricehistory == []:
        print 'Get pricehistory failed. Exit.'
        return
    # print 'follows_list:', follows_list
    # Align follower changes with the price history before framing.
    follows_chg_list = GetFollows_ProcessList(follows_list, pricehistory) 
    xdata = zip(*follows_chg_list)[0]   #get DataFrame from List
    df = DataFrame(follows_chg_list, index=xdata, columns=['DATE', 'CHG', 'CHG_PCT', 'PRICE', 'VOLUME'])
    # print df
    print df.tail(20)
    # print len(df)
    # print df.CHG.describe()
    CHG_mean = df.CHG.mean()
    print 'CHG_mean', CHG_mean
    # print [CHG_mean for x in range(10)]
    # return  #####
    # fig = plt.figure(figsize=(16,9))
    # fig, (ax0, ax1) = plt.subplots(nrows=2, figsize=(16,9))
    fig = plt.figure(figsize=(16,8.5))
    ax0 = fig.add_axes((0.1, 0.2, 0.8, 0.7))     #[left, bottom, width, height]
    
    # ax_left = ax0
    # Top pane: follower-change bars with a dashed mean line; price plotted
    # on the secondary (right) axis.
    ax_left = df.CHG.plot(ax=ax0, kind='bar', alpha=0.5, align='center', linewidth=2)
    ax0.plot([CHG_mean for x in range(len(df))], 'g--', linewidth=2)
    ax_left.set_ylabel('f')
    ax_right = df.PRICE.plot(ax=ax0, secondary_y=True, color='red', marker='v', linewidth=2, alpha=0.7)
    ax_right.set_ylabel('price')
    
    value_str = str(get_stockinfo_volume(code)[0])+u'亿'
    follow_str = str(df.CHG[-1])+'/'+ str(int(CHG_mean))
    plt.title(df.DATE[-1]+' '+titleprefix+' '+name+code+' v'+value_str+' F'+follow_str)
    plt.xlabel('Date')
    # print type(plt.xlim())
    # print type(xdata), xdata, xdata[0]
    # NOTE(review): the name `list` shadows the builtin from here on.
    list, listlabel = GetXticksList(xdata)
    ax_left.set_xticks(list)
    ax_left.set_xticklabels([]) #(listlabel, fontsize='small')
    # plt.legend()
    # fig.autofmt_xdate()
    # ax1.set_title('volume')
    # plt.subplot(223, axisbg='r')
    # Bottom pane shares the x axis with the top pane.
    ax1 = fig.add_axes((0.1, 0.05, 0.8, 0.15), sharex=ax0)
    
    ax_volume = df.VOLUME.plot(ax=ax1, kind='bar', color='green', linewidth=1, alpha=0.7)
    ax_volume.set_xticklabels([])
    ax_volume.set_xticklabels(listlabel, fontsize='small')
    ax_volume.set_xticks(list)
    ax_volume.set_ylabel('volume')
    ax1.plot([df.VOLUME.mean() for x in range(len(df))], 'g--', linewidth=2)
    
    # fig.subplots_adjust(bottom=0.8)
    # cbar_ax = fig.add_axes([0.85, 0.15, 0.05, 0.7])
    # fig.colorbar(im, cax=cbar_ax)
    
    if not savepng:
        plt.show()
    else:
        fig.savefig(save_fname) #, dpi=140)
Example #12
0
class Results:

    """
    A very simple database of results with a notification for new results.
    The new results are fed directly by the :class:`.StrategyBase`, outside of the
    :class:`.EventBus`.

    .. Note::

      Later on, maybe this will be a cool actual database which allows to
      persistently store past evaluations for a given problem.
      This would allow resuming and further a-posteriory analysis.
      In the meantime, this is a pandas DataFrame.
    """

    def __init__(self, strategy):
        self.logger = strategy.config.get_logger('RSLTS')
        self.strategy = strategy
        self.eventbus = strategy.eventbus
        self.problem = strategy.problem
        self.results = None  # lazily-created DataFrame; schema fixed on first add
        self._last_nb = 0  # number of results at the time of the last log line

    def add_results(self, new_results):
        """
        Add one single or a list of new @Result objects.
        Then, publish a ``new_result`` event.
        """
        from pandas import (DataFrame, MultiIndex, concat)
        if self.results is None:
            if len(new_results) == 0:
                return
            # Derive the column layout (x vector, fx, cv vector, metadata)
            # from the first result ever seen.
            r = new_results[0]
            midx_x = [('x', _) for _ in range(len(r.x))]
            len_cv_vec = 0 if r.cv_vec is None else len(r.cv_vec)
            midx_cv = [('cv', _) for _ in range(len_cv_vec)]
            midx = MultiIndex.from_tuples(
                midx_x + [('fx', 0)] +
                midx_cv + [('cv', 0), ('who', 0), ('error', 0)])
            self.results = DataFrame(columns=midx)

        assert all([isinstance(_, Result) for _ in new_results])
        # notification for all received results at once
        self.eventbus.publish("new_results", results=new_results)

        new_rows = []
        for r in new_results:
            new_rows.append(
                np.r_[r.x, r.fx,
                      [] if r.cv_vec is None else r.cv_vec,
                      [r.cv, r.who, r.error]])
        results_new = DataFrame(new_rows, columns=self.results.columns)
        # FIX: DataFrame.append was removed in pandas 2.x; concat is the
        # supported equivalent.
        self.results = concat([self.results, results_new], ignore_index=True)

        # FIX: integer division so this logs once per 100 results as intended;
        # with Python 3's true division the original fired on every add.
        if len(self.results) // 100 > self._last_nb // 100:
            self.info()
            self._last_nb = len(self.results)

    def info(self):
        """Log the current number of stored results and the last few rows."""
        self.logger.info("%d results in DB" % len(self))
        self.logger.debug("Dataframe Results:\n%s" % self.results.tail(3))

    def __iadd__(self, results):
        """Support ``results += [...]`` as shorthand for :meth:`add_results`."""
        self.add_results(results)
        return self

    def __len__(self):
        """Number of stored results (0 before the first add)."""
        return len(self.results) if self.results is not None else 0
Example #13
0
# 2.3 Create a time series
# pandas.date_range(start=None, end=None, periods=None, freq='D',
#   tz=None, normalize=False, name=None, closed=None, **kwargs)
# FIX: use the documented month-end alias 'M'; the lowercase 'm' alias is
# deprecated in modern pandas.
dates = pd.date_range('20180101', periods=12, freq='M')
print(dates)

# Seed so the random frame is reproducible.
np.random.seed(5)
df = pd.DataFrame(np.random.randn(12, 4), index=dates,
                  columns=list('ABCD'))
df

# First n rows (default n=5)
df.head()

# Last 3 rows
df.tail(3)

# Inspect the index, the column labels and the underlying values
print(df.index)

print(df.columns)

print(df.values)

# Transpose
# df.T

# Sort by index labels (rows: axis=0, columns: axis=1)
df.sort_index(axis=1, ascending=False)

# Sort by the values of a column
# Plot the male proportion over time for one name (prop_c_male defined
# elsewhere; ``ambi_names_pt`` below is presumably a name x sex pivot table
# of birth counts — TODO confirm against the notebook that produced this).
prop_c_male('Leslie').plot()

# <codecell>

# I couldn't figure out a way of iterating over the names rather than names/sex combo in
# a vectorized way.  

from itertools import islice

names_to_calc = list(islice(list(ambi_names_pt.T.index.levels[0]),None))

# Per-year male proportion M / (F + M) for each name.
m = [(name_, ambi_names_pt[name_]['M']/(ambi_names_pt[name_]['F'] + ambi_names_pt[name_]['M']))  \
     for name_ in names_to_calc]
p_m_instant = DataFrame(dict(m))
p_m_instant.tail()

# <codecell>

# similar calculation except instead of looking at the proportions for a given year only,
# we look at the cumulative number of male/female babies for given name

from itertools import islice

names_to_calc = list(islice(list(ambi_names_pt.T.index.levels[0]),None))

# Cumulative male proportion up to and including each year.
m = [(name_, ambi_names_pt[name_]['M'].cumsum()/(ambi_names_pt[name_]['F'].cumsum() + ambi_names_pt[name_]['M'].cumsum()))  \
     for name_ in names_to_calc]
p_m_cum = DataFrame(dict(m))
p_m_cum.tail()
Example #15
0
# FIX: converted Python-2 print statements to Python-3 print() calls.
# Persist the frame (``df`` defined earlier) without index/header, then
# re-read it several ways to show read_csv's header handling.
df.to_csv('births1880.txt', index=False, header=False)

Location = r'births1880.txt'

# Default: first data row is mistaken for the header.
df = read_csv(Location)

print(df)

print(df.head())

# header=None: synthesize integer column names instead.
df = read_csv(Location, header=None)

print(df)

print(df.tail())

# Explicit column names.
df = read_csv(Location, names=['Names', 'Births'])

print(df.head())

import os

# Clean up the temporary file.
os.remove(Location)

print(df['Names'].unique())

for x in df['Names'].unique():
    print(x)

print(df['Names'].describe())
Example #16
0
# In[195]:

# FIX: converted Python-2 print statements to Python-3 print() calls.
# Run k-means once per candidate k (kmeans, X and K are defined elsewhere).
KM = [kmeans(X, k) for k in K]
print(type(KM), len(KM))


# In[196]:

KM_df = DataFrame(KM)
print(KM_df.head(1))


# In[197]:

print(KM_df.tail(1))


# In[198]:

KM_df.shape


# In[199]:

# First column of the results frame.
KM_v1 = KM_df[0]
print(type(KM_v1))


# In[200]:
Example #17
0
import xlrd  # needed to read .xls files
import numpy as np
import sqlite3

# Build a small demo data frame from a dict of columns.
smp = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nebada', 'Nebada'],
       'year': [2000, 2001, 2002, 2001, 2002],
       'pop': [1.5, 1.6, 1.7, 3.5, 4.3]
       }
frame = DataFrame(smp)

# Element access on the data frame (R equivalents shown in comments).
frame.year  # frame$year
frame['year']  # frame$year
frame.head()  # head
frame.tail()  # tail
frame2 = DataFrame(
    smp, index=['one', 'two', 'three', 'four', 'five'])  # attach a label index
# FIX: .ix was removed from pandas; .loc is the label-based equivalent.
frame2.loc['one']
frame2.describe()  # summary
print(frame2.describe())

# Read data from local files.
data = pd.read_csv('stock_px.csv')
print(data)
xlsx_file = pd.ExcelFile('stock_px.xlsx')  # requires openpyxl; .xls also works
xlsx_file.sheet_names
data = xlsx_file.parse('stock_px')
print(data)

# Reading data from the web: see numpy.DataSource
# http://docs.scipy.org/doc/numpy/reference/generated/numpy.DataSource.html
# Mean click rate per site category (``train`` and ``site`` defined elsewhere).
avg_site = np.array([])
for i in site:
    avg_site = np.append(avg_site, np.mean(train["click"][train["site_category"] == i]))


# In[14]:

site_df = DataFrame({"site": site, "avg_click": avg_site})
# FIX: DataFrame.sort(columns=...) was removed from pandas; sort_values is
# the replacement.
site_df = site_df.sort_values(by="avg_click")
plt.plot(range(len(site_df)), site_df["avg_click"], "bo", range(len(site_df)), site_df["avg_click"], "k")


# In[17]:

site_df.tail(2)


# In[21]:

# Flag rows belonging to the special site category.
t1["special_site"] = 0
t2["special_site"] = 0
test["special_site"] = 0
# FIX: chained-indexing assignment is unreliable (SettingWithCopy); use .loc.
t1.loc[t1["site_category"] == "dedf689d", "special_site"] = 1
t2.loc[t2["site_category"] == "dedf689d", "special_site"] = 1
test.loc[test["site_category"] == "dedf689d", "special_site"] = 1


# In[22]:

# FIX: converted the Python-2 print statement to a Python-3 print() call.
print(sum(t1["special_site"]), sum(t2["special_site"]), sum(test["special_site"]))