Example #1
0
    def _distance_matrix_to_header(
            distance_matrix: pd.DataFrame) -> list:
        """Convert a distance matrix into a header of closest blm names.

        Args:
            distance_matrix: frame whose index holds the columns of the vector
                numeric data and whose columns hold the blm names; each cell
                contains the "distance" between the two.

        Returns:
            A plain list (not a DataFrame) with one entry per row of
            ``distance_matrix``: the label of the column that holds that
            row's smallest distance.
        """
        # idxmin(axis=1) yields, for every row, the column label of its minimum.
        closest_names = distance_matrix.idxmin(axis=1)
        return closest_names.tolist()
Example #2
0
def align_dfq(dfq: pd.DataFrame) -> pd.DataFrame:
    """Perform time (x axis) and frequency (y axis) shifting to align QMB pulses.

    Args:
        dfq: frame with one column per pulse. It must contain a ``'pulse0'``
            column, which serves as the reference pulse.
            NOTE(review): presumably the index is the time axis and the
            values are frequencies -- confirm against the caller.

    Returns:
        A new frame, indexed by the shifted time axis, with one shifted
        column for every column of ``dfq``.
    """

    # Index label at which the reference pulse reaches its minimum.
    tmin = dfq['pulse0'].idxmin()
    # Per-column offset between the reference minimum and that column's minimum.
    tshift = tmin - dfq.idxmin()
    # Column with the smallest offset; its offset is applied to the whole axis.
    refPulse = tshift.idxmin()
    taxis = dfq.index + tshift[refPulse]
    # NOTE(review): looks like leftover debug output -- confirm it is wanted.
    print(refPulse, taxis)
    dfqShift = pd.DataFrame(dfq['pulse0'])
    dfqShift.index = list(taxis)
    # NOTE(review): `[0]` is a label lookup when the index is numeric and a
    # (deprecated) positional lookup otherwise -- confirm which is intended.
    fref = dfq['pulse0'][0]

    for pul in dfq.columns:
        # Offset that makes this pulse's `[0]` value equal the reference's.
        fshift = fref - dfq[pul][0]
        # NOTE(review): the new Series carries a default 0..n-1 index, so this
        # assignment aligns it by label against the shifted time axis --
        # confirm that is intended.
        dfqShift[pul] = pd.Series(dfq[pul].values + fshift)
        # Shift the column by minus the index label of this pulse's minimum
        # (the label is used as a number of periods).
        dfqShift[pul] = dfqShift[pul].shift(-dfq[pul].idxmin())
    return dfqShift
import pandas as pd

arr = np.array([[1, 2, np.nan], [np.nan, 3, 4]])

dframe1 = DataFrame(arr, index=['A', 'B'], columns=['One', 'Two', 'Three'])
dframe1

# Sum method
dframe1.sum()  # ignores null values (treats them as 0s)
dframe1.sum(axis=1)  # sum across rows

# Min method
dframe1.min()  # finds the minimum value in each column
dframe1.min(axis=1)  # minimum value of each row

dframe1.idxmin()  # Find the index of minimum value column

# Max method
dframe1.max()
dframe1.idxmax()

# Cumulative sum
dframe1.cumsum()  # accumulates along each columns values

# Describe method
dframe1.describe()  # summary statistics of dataframe (by columns)

# correlation and covariance
import pandas.io.data as pdweb
# import pandas_datareader.data as pdweb
import datetime
Example #4
0
ser2.rank()
# rank() reports each item's rank, i.e. its position once the series is sorted

##### Summary
arr = np.array([[1,2,np.nan],[np.nan,3,4]])
arr
df1 = DataFrame(arr,index = ['a','b'],columns = ['one','two','three'])
df1

df1.sum() #default axis is 0, and Pandas will ignore the nan values
df1.sum(axis=1) #by row

df1
df1.min()
df1.min(axis=1)
df1.idxmin()
df1.idxmin(axis=1)

df1.cumsum() # accumulation sum

### unique() and value_counts() methods for factor variables
ser1 = Series(['w','w','x','y','z','w','x','y','x','a'])
ser1
ser1.unique()
ser1.value_counts()

###describe method
df1.describe() # similar to the summary() method in R will provide summary stat.

### covariance matrices and some visualization
import pandas.io.data as pdweb
Example #5
0

df2['Chinese'] = df2['Chinese'].astype(np.int64)
df = df2.apply(plus, axis=1, args=[1, 2])
print(df)

# Common statistics functions
print('-' * 35 + '常用的统计函数' + '-' * 35)
# count() counts the entries; missing values are not counted
print(df2.count())
print(df2.count(axis=1))

# describe(): prints several summary statistics at once
print(df2.describe())
print(df2.min())
print(score.idxmin())
print(score.std())
print(df2.describe(percentiles=[0.9]))

# Merging data tables
print('-' * 35 + '数据表合并' + '-' * 35)
df1 = DataFrame({
    'name': ['ZhangFei', 'GuanYu', 'a', 'b', 'c'],
    'data': range(5),
    'data1': range(0, 9, 2)
})
df2 = DataFrame({
    'name': ['ZhangFei', 'GuanYu', 'A', 'B', 'C'],
    'data': range(5),
    'data2': range(5)
})
Example #6
0
######################################################################
# SUMMARY STATISTICS
######################################################################

# Will return the sum for each colm, ignores NaN
dframe.sum()

# Will return sum of Rows
dframe.sum(axis=1)

# Min/max val for each col
dframe.min()

# Min/max val index for each col
dframe.idxmin()

# Cumulation sum
dframe.cumsum()

# describe() creates summary statistics for each column
# (the original call was misspelled `descirbe` and raised AttributeError)
dframe.describe()  # count, mean, std, min, ...

# Covariance and Correlation
import pandas.io.data as pdweb
import datetime


# Getting the stock data from the internet and displaying the first 5 sets
prices = pdweb.get_data_yahoo(['CVX', 'XOM', 'BP'],start=datetime.datetime(2010,1,1),
	end=datetime.datetime(2013,1,1))['Adj Close']
from pandas import DataFrame

# Three observations (rows) of three measured attributes (columns).
data2 = {
    'Speed': [101, 109, 106],
    'Temp': [34, 23, 42],
    'Humidity': [45, 23, 58]
}
frame2 = DataFrame(data2)

print(frame2)

print(frame2.sum())  # Sum of each attribute (one total per column)

print(frame2.sum(axis=1))  # Sum of each tuple (one total per row)

print(frame2.idxmax())  # Index of the tuple holding the max Speed, Temp and Humidity
print(frame2.idxmin())  # Index of the tuple holding the min of each attribute
print(frame2.max())  # Maximum recorded value of Speed, Temp and Humidity
Example #8
0
print(df.drop(1), '\n')  # drop the row labeled 1
print(df.dropna(), '\n')  # drop rows that contain NaN
print(df.dropna(how='any'), '\n')  # drop a row if any of its values is NaN
print(df.dropna(how='all'), '\n')  # drop a row only if all of its values are NaN
print(df.dropna(subset=['one']), '\n')  # drop rows that have NaN in the given column
print(df.fillna(0), '\n')  # fill NaN with 0 (to fill with the mean, see sklearn's SimpleImputer)

# Descriptive-statistics functions
print('**' * 10)
print(df.sum(), '\n')  # column-wise sum; NaN is skipped
print(df.sum(axis=0), '\n')

print(df.sum(axis=1), '\n')  # row-wise sum
print(df.mean(axis=1), '\n')  # row-wise mean
print(df.mean(axis=1, skipna=True), '\n')  # NaN values are skipped (the default)
print(df.mean(axis=1, skipna=False), '\n')  # result is NaN for rows that contain NaN

print(df.mean(axis=0, skipna=True), '\n')  # column-wise mean, computed even if NaN is present
print(df.mean(axis=0, skipna=False), '\n')  # result is NaN for columns that contain NaN

print(df.max(), '\n')
print(df.max(axis=0), '\n')  # largest value in each column
print(df.idxmax(), '\n')
print(df.idxmin(), '\n')

print(df.describe(), '\n')  # summary statistics
print(df.info(), '\n')  # DataFrame structure

words = Series(['봄', '여름'])
print(words.describe(), '\n')
Example #9
0
obj.rank(ascending=False)
obj.rank()
obj.rank(method='first')

frame3 = DataFrame({
    'b': [4, 7, 3, 2],
    'a': [4, 9, 2, 5],
    'c': [5, 3, 7, np.nan]
})
frame3.rank(axis=1)
frame3.sum()
frame3.mean()

frame3.sum(skipna=False)
frame3.idxmax()  # 최대치가 있는 인댁스값
frame3.idxmin()

obj = Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
# Distinct values in order of first appearance.
# (The name `ubiques` is kept as-is because later code may refer to it.)
ubiques = obj.unique()
# Frequency of each value, most common first. The original line lacked the
# call parentheses and merely referenced the bound method.
obj.value_counts()
# Same counts without sorting by frequency.
obj.value_counts(sort=False)

mask = obj.isin(['b', 'c'])
obj[mask]  # True인 값만 출력

obj[obj.isin(['b', 'c'])]
frame4 = DataFrame({
    'X': ['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'],
    'Y': ['f', 'g', 'd', 'g', 'h', 'e', 'd', 'h', 'f'],
    'Z': ['a', 'e', 'd', 'g', 'd', 'e', 'q', 'b', 'c']
})
import numpy as np
import pandas as pd

from pandas import Series, DataFrame

arr = np.array([[1, 2, np.nan], [np.nan, 3, 4]])
df1 = DataFrame(arr, index=['A', 'B'], columns=['One', 'Two', 'Three'])

# sum of columns
print(df1)
print(df1.sum())  # sum by column
print(df1.sum(axis=1))  # sum by row

# min/max
print(df1.min())  # return the min value for each column
print(df1.min(axis=1))  # return the min value for each row
print(df1.idxmin())  # return the index column instead

# stats
print(df1.describe())
Summarizing and Computing Descriptive Statistics
"""
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
df = DataFrame([[1.4, np.nan],
                [7.1, -4.5],
                [np.nan, np.nan],
                [0.75, -1.3]],
                index=['a', 'b', 'c', 'd'],
                columns=['one', 'two'])
df.sum() 
df.sum(axis=1) #sum along axis=1, columns 
df.mean(axis=1, skipna=False)
df.idxmax()
df.idxmin(axis=1)              
df.cumsum()
df.describe()
obj = Series(['a', 'a', 'b', 'c'] * 4)
obj.describe()
df['three'] = ['a','b','c','a']
df.describe()
df['three'].describe()

"""
Correlation and Covariance
"""
import pandas.io.data as web
all_data = {}
for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']:
    all_data[ticker] = web.get_data_yahoo(ticker, '1/1/2000', '1/1/2010')
class KMeansPlusPlus:
    """K-means clustering with k-means++ style seeding over a pandas DataFrame.

    The distance between a row and a center is the number of coordinates in
    which they differ (see ``get_dist_single``). Centers are recomputed as
    per-cluster column means truncated to integers.

    Typical use: construct, call ``cluster()``, then read ``clusters``
    (cluster id per row), ``centers`` and ``distance_matrix``.
    """

    # NOTE(review): these two values were hard-coded in the empty-cluster
    # fallback of compute_new_centers. They look tuned to one particular data
    # set (presumably with k == 3) -- confirm before relying on the fallback.
    _FALLBACK_DISTANCE_SUM = 9
    _FALLBACK_CLUSTER = 2

    def __init__(self, data_frame, k, columns=None, max_iterations=None,
                 appended_column_name=None):
        """Validate the arguments and set up empty clustering state.

        Args:
            data_frame: non-empty pandas DataFrame holding the points (m rows).
            k: number of clusters; a positive integer.
            columns: optional iterable of column names to cluster on. Each
                must exist in ``data_frame``, be numeric and contain no NaN.
                Defaults to all columns (which are then not validated).
            max_iterations: optional positive cap on refinement iterations.
            appended_column_name: if given, ``cluster()`` stores the cluster
                ids in ``data_frame`` under this column name.

        Raises:
            Exception: if any argument fails validation.
        """
        if not isinstance(data_frame, DataFrame):
            raise Exception("data_frame argument is not a pandas DataFrame")
        elif data_frame.empty:
            raise Exception("The given data frame is empty")

        if max_iterations is not None and max_iterations <= 0:
            raise Exception("max_iterations must be positive!")

        if not isinstance(k, Integral) or k <= 0:
            raise Exception("The value of k must be a positive integer")

        self.data_frame = data_frame  # m x n
        self.numRows = data_frame.shape[0]  # m
        # k x n, the i,j entry being the jth coordinate of center i
        self.centers = None
        # Row positions picked as initial centers (see populate_initial_centers)
        self.index = []
        # m x k, the i,j entry is the distance from point i to center j
        # (where i and j start at 0)
        self.distance_matrix = None

        # Series of length m, consisting of integers 0,1,...,k-1
        self.clusters = None

        # To keep track of clusters in the previous iteration
        self.previous_clusters = None

        self.max_iterations = max_iterations
        self.appended_column_name = appended_column_name
        self.k = k

        if columns is None:
            self.columns = data_frame.columns
        else:
            for col in columns:
                if col not in data_frame.columns:
                    raise Exception(
                        "Column '%s' not found in the given DataFrame" % col)
                if not self._is_numeric(col):
                    raise Exception(
                        "The column '%s' is either not numeric or contains NaN values" % col)
            self.columns = columns

    def _is_numeric(self, col):
        """Return True when column ``col`` has a numeric dtype and no NaN."""
        series = self.data_frame[col]
        try:
            numeric = np.issubdtype(series.dtype, np.number)
        except TypeError:
            # pandas extension dtypes are not understood by numpy.
            numeric = False
        return bool(numeric) and not series.isnull().any()

    def populate_initial_centers(self):
        """Pick ``k`` initial centers and store them in ``self.centers``.

        The first center is a uniformly random row. Every further center is
        a row drawn with probability proportional to its distance from the
        nearest center chosen so far. Only ``self.columns`` take part.
        """
        data = self.data_frame[self.columns]
        rows = [self._grab_random_point()]

        while len(rows) < self.k:
            if len(rows) == 1:
                distances = self.get_dist_point(data, rows[0])
            else:
                distances = self.get_dist_list(data, rows)

            # get_dist_point returns an (m, 1) column while get_dist_list
            # returns a flat vector; flatten so random.choices always receives
            # one scalar weight per row.
            weights = np.asarray(distances, dtype=float).ravel()
            total = weights.sum()
            # When every row coincides with a chosen center all weights are
            # zero; draw uniformly instead of dividing by zero.
            weights = (weights / total).tolist() if total > 0 else None
            index = random.choices(range(self.numRows), weights=weights, k=1)
            # Appended as a one-element list, exactly as before, so the
            # contents of ``self.index`` do not change for existing callers.
            self.index.append(index)
            rows.append(data.iloc[index[0], :].values)
        self.centers = DataFrame(rows, columns=self.columns)

    def _grab_random_point(self):
        """Return the values of one uniformly random row; record its position."""
        # np.random.random_integers no longer exists in NumPy. randint's upper
        # bound is exclusive, so this still draws from 0 .. numRows - 1.
        index = np.random.randint(0, self.numRows)
        self.index.append(index)
        return self.data_frame[self.columns].iloc[index, :].values

    def get_dist_single(self, x, y):
        """Return the number of coordinates in which ``x`` and ``y`` differ.

        Args:
            x: a sample (array-like supporting subtraction).
            y: the centroid of a cluster, same length as ``x``.
        """
        return np.sum(np.array([x - y]) != 0)

    def get_dist_point(self, X, Y):
        """Return a ``(len(X), 1)`` array of mismatch counts between X's rows and Y.

        Args:
            X: DataFrame (or 2-D array-like) of samples, one per row.
            Y: a single point with one value per column of ``X``.
        """
        points = np.asarray(X)
        mismatches = (points - np.asarray(Y)) != 0
        return mismatches.sum(axis=1).reshape((len(points), 1))

    def get_dist_list(self, X, Y):
        """Return, for each row of X, its smallest mismatch count to any point in Y.

        Args:
            X: DataFrame (or 2-D array-like) of samples, one per row.
            Y: non-empty iterable of points.

        Returns:
            A flat array of length ``len(X)``.
        """
        per_point = [self.get_dist_point(X, point) for point in Y]
        return np.concatenate(per_point, axis=1).min(axis=1)

    def compute_distances(self):
        """Fill ``distance_matrix`` (m x k) with row-to-center mismatch counts.

        Raises:
            Exception: if the centers have not been initialized.
        """
        if self.centers is None:
            raise Exception("Centers not initialized!")

        # Compare only the clustered columns, position by position.
        points = self.data_frame[self.columns].values
        per_center = [
            ((points - self.centers.iloc[i, :].values) != 0).sum(axis=1)
            for i in range(self.k)
        ]
        # Stored as floats, as the original implementation did.
        self.distance_matrix = DataFrame(
            np.column_stack(per_center).astype(float),
            columns=list(range(self.k)))

    def get_clusters(self):
        """Assign every row to its closest center (ties go to the lowest id).

        Raises:
            Exception: if ``compute_distances`` has not been run yet.
        """
        if self.distance_matrix is None:
            raise Exception(
                "Must compute distances before closest centers can be calculated")

        # distance_matrix has a default 0..m-1 index, so re-index the result
        # with the data frame's own index.
        self.clusters = Series(self.distance_matrix.idxmin(axis=1).values,
                               index=self.data_frame.index)

    def _mean_centers(self):
        """Return a k x n frame of per-cluster column means.

        A cluster without members yields a row of NaN.
        """
        data = self.data_frame[self.columns]
        labels = self.clusters.values
        means = [data[labels == i].mean().values for i in range(self.k)]
        return DataFrame(np.array(means, dtype=float), columns=self.columns)

    def compute_new_centers(self):
        """Replace ``centers`` with the integer-truncated mean of each cluster.

        If a cluster is empty its mean is NaN and the integer cast fails. In
        that case some rows are moved into a fixed cluster and the centers
        are recomputed once (see the NOTE on the class constants).

        Raises:
            Exception: if centers or clusters are not available yet.
            ValueError: if a cluster is still empty after the fallback.
        """
        if self.centers is None:
            raise Exception("Centers not initialized!")

        if self.clusters is None:
            raise Exception("Clusters not computed!")

        try:
            self.centers = self._mean_centers().astype(int)
        except ValueError:
            # Row positions whose distances to all centers add up to the
            # fallback value.
            row_sums = self.distance_matrix.sum(axis=1)
            candidates = (
                row_sums == self._FALLBACK_DISTANCE_SUM).values.nonzero()[0]
            # Move a random tenth of them (drawn with replacement).
            moved = np.random.choice(candidates, int(len(candidates) / 10))
            # `moved` holds row positions, so assign positionally.
            self.clusters.iloc[moved] = self._FALLBACK_CLUSTER
            self.centers = self._mean_centers().astype(int)

    def cluster(self):
        """Run the clustering until assignments stabilize or the cap is hit.

        After the loop, the cluster ids are written to ``data_frame`` under
        ``appended_column_name`` when that name was given; failure to do so
        only emits warnings.
        """
        self.populate_initial_centers()
        self.compute_distances()
        self.get_clusters()

        counter = 0

        while True:
            counter += 1

            self.previous_clusters = self.clusters.copy()

            self.compute_new_centers()
            self.compute_distances()
            self.get_clusters()

            if self.max_iterations is not None and counter >= self.max_iterations:
                break
            if (self.clusters == self.previous_clusters).all():
                break

        if self.appended_column_name is not None:
            try:
                self.data_frame[self.appended_column_name] = self.clusters
            except Exception:
                warnings.warn(
                    "Unable to append a column named %s to your data." %
                    self.appended_column_name)
                warnings.warn(
                    "However, the clusters are available via the cluster attribute")
Example #13
0
__author__ = 'Executor'

import numpy as np
import pandas as pa
from pandas import Series, DataFrame


arr = np.array([[1, 2, np.nan], [np.nan, 3, 4]])
dframe1 = DataFrame(arr, index=['A', 'B'], columns=['One', 'Two', 'Three'])
print(dframe1.sum())
print(dframe1.sum(axis=1))
print(dframe1.min())
print(dframe1)
print(dframe1.idxmin())

print(dframe1)
print(dframe1.cumsum())

print(dframe1.describe())

from IPython.display import YouTubeVideo
YouTubeVideo('xGbpuFNR1ME')
YouTubeVideo('4EXNedimDMs')

''' stupid thing doesn't work!'''
# Every print below takes a single argument, so the lines behave the same
# under Python 2 and Python 3.

# Show the minimum value of each column
print("Minimun")
print(df1.min())

# Show the maximum value of each column
print("Maximum")
print(df1.max())

# idxmax/idxmin do not show the extreme values themselves; they show the
# index (row label) at which each column reaches its maximum / minimum
print("Max of index")
print(df1.idxmax())

print("Min of index")
print(df1.idxmin())

# Cumulative sums leave the first row as it is; every following row is
# added to the running total of the rows above it
print("Cumulitive Sum")
print(df1.cumsum())

# describe() summarises each column with:
# count, mean, standard deviation, minimum, percentiles (25%, 50%, 75%) and max
print("Describe Function")
print(df1.describe())

# This DataFrame holds random numbers in a 3x3 grid,
# with index 1, 2, 3 and columns A, B, C
df2 = DataFrame(randn(9).reshape(3, 3), index=[1, 2, 3], columns=list('ABC'))
print("New DataFrame")
Example #15
0
print(df.dropna(how='all'))       # drop rows whose values are all missing
print(df.dropna(subset=['1st']))# drop rows that have NaN in the column named '1st'
print(df.fillna(0))   # fill NaN with 0 (to fill with the mean, use sklearn's SimpleImputer)

'''
기술적 통계와 관련된 함수
axis=1은 행, axis=0은 열
'''
print(df.sum())            # column sums; NaN is skipped (same as df.sum(axis=0))
print(df.sum(axis=1))   # row sums; a row holding only NaN sums to 0
print(df.mean(axis=1)) # row means (same as df.mean(axis=1, skipna=True)); pass skipna=False to get NaN for rows that contain NaN

# Maximum
print(df.max())     # => axis=0
print(df.idxmax()) # index label of each column's maximum
print(df.idxmin()) # likewise for the minimum

# Summary statistics
print(df.describe())
'''
            1st       2nd
count  3.000000  2.000000
mean   2.966667 -2.750000
std    3.521837  2.474874
min    0.500000 -4.500000
25%    0.950000 -3.625000
50%    1.400000 -2.750000
75%    4.200000 -1.875000
max    7.000000 -1.000000
'''
# In[209]:

df.sum()

# In[220]:

df.sum(axis=1)

# In[221]:

df.mean(axis=1, skipna=False)

# In[223]:

df.idxmin()

# In[224]:

df.idxmax()

# In[226]:

df.cumsum()

# In[227]:

df.describe()

# In[229]:
def find_min(df: pd.DataFrame):
    """Return the first three values of the row that minimizes the third column.

    Args:
        df: frame with at least three columns and an index whose labels can
            be converted with ``int``.
            NOTE(review): the columns are presumably ordered (r, theta, E),
            as the local names suggest -- confirm against the caller.

    Returns:
        A tuple ``(r_min, theta_min, E_min)`` taken from the row in which the
        third column reaches its minimum.
    """
    # Columns and row values are addressed by position with .iloc. A plain
    # ``series[2]`` is a label lookup on integer labels and only a deprecated
    # positional fallback on any other labels.
    idx_min = int(df.iloc[:, 2].idxmin())
    min_row = df.loc[idx_min, :]
    r_min, theta_min, E_min = min_row.iloc[0], min_row.iloc[1], min_row.iloc[2]
    return r_min, theta_min, E_min
print(df1)

#sum operations

print(df1.sum())

#sum along the rows
print(df1.sum(axis=1))

print(df1.min())
print(df1.max())

print('----------')

print(df1.idxmax())
print(df1.idxmin())
print('----------')

print(df1.cumsum())
print('----------')

print(df1.describe())
print('----------')

df2 = DataFrame(randn(9).reshape(3, 3), index=[1, 2, 3], columns=list('ABC'))
print(df2)
print('----------')

plt.plot(df2)
plt.legend(df2.columns, loc="lower right")
plt.savefig("first graph in python")
import pandas as pd

arr = np.array([[1, 2, np.nan], [np.nan, 3, 4]])

dframe1 = DataFrame(arr, index=["A", "B"], columns=["One", "Two", "Three"])
dframe1

# Sum method
dframe1.sum()  # ignores null values (treats them as 0s)
dframe1.sum(axis=1)  # sum across rows

# Min method
dframe1.min()  # finds the minimum value in each column
dframe1.min(axis=1)  # minimum value of each row

dframe1.idxmin()  # Find the index of minimum value column

# Max method
dframe1.max()
dframe1.idxmax()

# Cumulative sum
dframe1.cumsum()  # accumulates along each columns values

# Describe method
dframe1.describe()  # summary statistics of dataframe (by columns)

# correlation and covariance
import pandas.io.data as pdweb

# import pandas_datareader.data as pdweb
Example #20
0
ser1 = Series(range(3), index=['C', 'A', 'B'])
ser1

ser1.sort_index()
ser1.sort_values()

ser2 = Series(randn(10))
ser2

arr = np.array([[1, 2, np.nan], [np.nan, 3, 4]])
arr
dframe1 = DataFrame(arr, index=['A', 'B'], columns=['one', 'two', 'three'])
dframe1
dframe1.sum(axis=1)
dframe1.min(axis=1)
dframe1.idxmin(axis=0)
dframe1
dframe1.cumsum()
dframe.cumsum()

prices = pdweb.get_data_yahoo(['CVX', 'XOM', 'BP'],
                              start=datetime.datetime(2010, 1, 1),
                              end=datetime.datetime(2013, 1, 1))['Adj Close']
prices.head()

valume = pdweb.get_data_yahoo(['CVX', 'XOM', 'BP'],
                              start=datetime.datetime(2010, 1, 1),
                              end=datetime.datetime(2013, 1, 1))['Volume']

valume.head()
rets = prices.pct_change()
Example #21
0
# ### Summary Statistics

# In[449]:

df = DataFrame(np.arange(16).reshape(4, 4),
               columns=list('ABCD'),
               index=list('PQRS'))
df.loc['P', 'D'] = np.nan
df.loc['R', 'A'] = np.nan
df

df.sum()  # col
df.sum(axis=1)  # row

df.min()  # col
df.idxmin()  # col

df.min(axis=1)  # row
df.idxmin(axis=1)  #row

df.cumsum()

# In[452]:

# describe

df = DataFrame(np.arange(16).reshape(4, 4),
               columns=list('ABCD'),
               index=list('PQRS'))
df.loc['P', 'D'] = np.nan
df.loc['R', 'A'] = np.nan
Example #22
0
import pandas as pd
import numpy as np

# datafile = 'D:/新建 Microsoft Office Excel 工作表.xlsx'
# data = pd.read_excel(datafile,header=None)
# min = (data-data.min())/(data.max()-data.min())
# zero = (data - data.mean())/data.std()
# float = data/10**np.ceil(np.log10(data.abs().max())) #小数定标规范化
# print("原始数据为:\n",data)
# print('--------------------')
# print('最小-最大规范化后的数据:\n',min)

from pandas import Series, DataFrame

df = DataFrame(np.random.randn(4, 3), index=list('abcd'), columns=['frist', 'second', 'third'])
print(df)
print(df.describe())
print(df.sum())
print(df.sum(axis=1))
print('-----------')
print(df.idxmax(), df.idxmin(), df.idxmin(axis=1))
print(df.cumsum())
print(df.var())
print(df.std())
print(df.pct_change())
print(df.cov())
print(df.corr())
Example #23
0
               index=['a', 'b', 'c', 'd'],
               columns=['one', 'two'])
print(df)

# sum(): adds up each column and returns the totals as a Series
print(df.sum())
print(df.sum(axis=1))  # returns the sum of each row

# NA values are left out of the calculation unless the whole row or column
# is NA; the skipna option can be used to stop NA values from being left out.
# skipna defaults to True
print(df.sum(axis=1, skipna=False))

# Methods such as idxmin and idxmax return indirect statistics, e.g. the
# index label at which the minimum or maximum value is found.
print(df.idxmax())
print(df.idxmin())

# Accumulation method: cumsum()
print(df.cumsum())

# unique(): collapses duplicated values into one
s1 = Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
print(s1.unique())

# value_counts(): counts occurrences of each value (frequencies); returns a Series
print(s1.value_counts())  # the result is printed in descending order

# isin(): tells, for each element of the Series, whether it is one of the given values
## returns booleans (True, False)
mask = s1.isin(['b', 'c'])
print(mask)
frame[0:3]
frame.loc['a':'d', 'STL':]
frame.iloc[0:3, 1:2]
frame['UMST'] = 4
frame.reindex(index=['c', 'e', 'a'], columns=['UM', 'Washu'])
frame[frame < 0] = np.nan
frame.isnull()
frame.dropna()
frame.dropna(axis=1)
um = frame['UM']
um[um.notnull()]
frame.fillna(method='ffill', axis=0, limit=1, inplace=False)
frame.fillna(method='ffill', axis=1, limit=1)
frame.mean()
frame.mean(axis=1, skipna=False)
frame.idxmin()
frame.idxmax(axis=1)
frame2 = DataFrame(
    {
        'Washu': np.random.randn(5),
        'UM': np.random.randn(5),
        'UMST': np.random.randn(5)
    },
    index=list('abcde'))
frame3 = DataFrame({
    'a': {
        'Washu': 1,
        'UM': 3
    },
    'b': {
        'Washu': 2,
Example #25
0
from pandas import DataFrame

# Three observations (rows) of three measured attributes (columns).
data = {
    'Speed': [101, 109, 106],
    'Temp': [34, 32, 45],
    'Humidity': [4500, 2300, 5800]
}
frame = DataFrame(data)
print(frame)

print(frame.sum())  # sum of each column

print(frame.sum(axis=1))  # sum of each row

print(frame.idxmax())  # for each column, the index label of the row holding its maximum

print(frame.idxmin())  # for each column, the index label of the row holding its minimum
Example #26
0
print(stu_result)

# We have just reviewed the basic DataFrame operations,
# and now have a miniature student report card to work with.

# Print the sum of each column
# (.ix was removed in pandas 1.0; .loc is the label-based equivalent)
stu_pure_result = stu_result.loc[:, ['math', 'physics']]
stu_pure_result = DataFrame(stu_pure_result)

print(stu_pure_result.sum())
print(stu_pure_result.sum(axis = 1))

# Index labels of the maximum / minimum of each column
print(stu_pure_result.idxmax())
print(stu_pure_result.idxmin())

# Cumulative values
print(stu_pure_result.cumsum())
print(stu_pure_result.cumprod())
print(stu_pure_result.cummax())
print(stu_pure_result.cummin())

# Produce a whole set of descriptive statistics in one call
print(stu_pure_result.describe())

# Series has a describe() method too
history = Series([97, 99, 89, 79], index=range(4))
print(history.describe())
# Note: describe() is most useful on numeric data; for other kinds of data
# its output carries little meaning
df1.sum()

#calculating the sum of individual rows

df1.sum(axis=1)
#here axis =1 represents the horizontal axis

#calculating the maximum values for each individual columns
#results will in the form of displayed index

df1.idxmax()

#similarly for minimum values for each individual columns

df1.idxmin()

#fundamental operations on DataFrames like addition,subtraction etc

dic2 = {
    "cse": [10, 13, 11],
    "maths": [11, 14, 17],
    "english": [5, 7, 9],
    "ece": [11, 13, 15]
}
df2 = DataFrame(dic2)

##adding df1+df2

df1 + df2
Example #28
0
#Dataframe
Fails = {
    'Cv-Folds': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'H1': errors[:, 0],
    'H2': errors[:, 1],
    'H3': errors[:, 2],
    'H4': errors[:, 3],
    'H5': errors[:, 4],
    'H6': errors[:, 5],
    'H7': errors[:, 6],
    'H8': errors[:, 7],
    'H9': errors[:, 8],
    'H10': errors[:, 9]
}
Errordf = DataFrame(Fails,
                    columns=[
                        'Cv-Folds', 'H1', 'H2', 'H3', 'H4', 'H5', 'H6', 'H7',
                        'H8', 'H9', 'H10'
                    ])

with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(Errordf)

#finding the minimum error, and the number of hidden units for that error.
minValues = Errordf.min()
minIndex = Errordf.idxmin(axis=0)

WubWub = {'hidden units': minIndex + 1, 'E-test': minValues}
endData = DataFrame(WubWub, columns=['hidden units', 'E-test'])
print(endData)
def main():
    """Print a tour of pandas summary statistics, correlation and value counts.

    Calculation and aggregation of summary statistics. Every ``print`` call
    takes a single argument, so the function runs unchanged under both
    Python 2 and Python 3.
    """

    # Summary of statistics
    # return is not ndarray
    df = DataFrame([[1.4, np.nan],
                    [7.1, -4.5],
                    [np.nan, np.nan],
                    [0.75, -1.3]],
                   index=list('abcd'),
                   columns=['one', 'two'])
    print(df)
    print(df.sum())
    print(df.sum(axis=1))
    print(df.mean(axis=1))  # exclude nan
    print(df.mean(axis=1, skipna=False))
    print(df.idxmin())
    print(df.idxmax())
    print(df.cumsum())
    print(df.describe())
    # values are not number
    obj = Series(list('aabc') * 4)
    print(obj.describe())

    methods = ['count', 'min', 'max',  # 'argmin', 'argmax',
               'quantile', 'median', 'mad', 'var', 'std',
               'skew', 'kurt', 'cummin', 'cummax', 'cumprod',
               'diff', 'pct_change']

    for method in methods:
        # Some reductions (e.g. ``mad``) no longer exist in newer pandas
        # releases; skip whatever the installed version does not provide.
        if not hasattr(df, method):
            continue
        print(u'「{0}」'.format(method))
        print(getattr(df, method)())
        print('')

    # Correlation and Covariance
    all_data = {}
    # The ticker list is intentionally empty, so nothing is downloaded and
    # the `if all_data:` section below is skipped.
    lst = []  # ['AAPL', 'IBM', 'MSFT'] #, 'GOOG']:
    for ticket in lst:  # , 'GOOG']:
        # IOError: after 3 tries, Yahoo! did not return a 200
        # for url 'http://ichart.finance.yahoo.com/table.csv?s=GOOG&a=0&b=1&c=2000&d=0&e=1&f=2010&g=d&ignore=.csv'
        # NOTE(review): pandas.io.data no longer ships with pandas -- this
        # call needs a replacement before the ticker list is re-enabled.
        all_data[ticket] = pd.io.data.get_data_yahoo(ticket, '1/1/2000', '1/1/2010')
    # dict.items() works on both Python 2 and 3 (iteritems() is Python 2 only).
    price = DataFrame({tic: data['Adj Close'] for tic, data in all_data.items()})
    volume = DataFrame({tic: data['Volume'] for tic, data in all_data.items()})
    if all_data:
        returns = price.pct_change()
        print(returns.tail())
        print('')
        print(returns.MSFT.corr(returns.IBM))
        print(returns.MSFT.cov(returns.IBM))
        print('')
        print(returns.corr())
        print(returns.cov())
        print('')
        print(returns.corrwith(returns.IBM))
        print(returns.corrwith(volume))

    # unique, frequency, belong
    print(' ')  # same output as the Python 2 statement `print '',''`
    obj = Series(list('cadaabbcc'))
    uniques = obj.unique()
    print(uniques)
    print(obj.value_counts())
    # Series.value_counts replaces the deprecated top-level pd.value_counts.
    print(Series(obj.values).value_counts(sort=False))
    mask = obj.isin(['b', 'c'])
    print(mask)
    print(obj[mask])

    data = DataFrame({
        'Qu1' : [1,3,4,3,4],
        'Qu2' : [2,3,1,2,3],
        'Qu3' : [1,5,2,4,4],
    })
    print(data)
    # Count the values of every column; values absent from a column become 0.
    print(data.apply(Series.value_counts).fillna(0))
Example #30
0
arr = np.array([[1,2,np.nan],[np.nan,3,4]])
dframe1 =  DataFrame(arr,columns=["One","Two","Three"],index=["A","B"] )
dframe1

#Let's see the sum() method in action
dframe1.sum()
#Notice how it ignores NaN values

#Notice how it ignores NaN values
dframe1.sum(axis=1)

#Can also grab min and max values of dataframe
dframe1.min()

#As well as there index
dframe1.idxmin()

dframe1.idxmax()

dframe1.max()

dframe1
#Can also do an accumulation sum
dframe1.cumsum()

#A very useful feature is describe, which provides summary statistics
describe=dframe1.describe()

# We can also get information on correlation and covariance

#For more info on correlation and covariance, check out the videos below!
Example #31
0
#Lecture 22 Summary Statistics
import numpy as np
import pandas as pd
import IPython
from pandas import Series, DataFrame
arr = np.array([[1, 2, np.nan], [np.nan, 3, 4]])
dframe1 = DataFrame(arr, index=['A', 'B'], columns=['one', 'two', 'three'])
print(dframe1)
print(dframe1.sum())  # sums values across each column
print(
    dframe1.sum(axis=0)
)  # axis=0 adds up the rows of each column (one total per column); use axis=1 for one total per row
print(dframe1.min())  # returns min value in column
print(dframe1.max())  # returns max value in column
print(dframe1.idxmin())  # returns index of the min value in column
print(
    dframe1.cumsum())  #accumulation row wise cumulative summin across column.
print(dframe1.describe()
      )  #summary statiscs for data frame . Min , Max , ount , percentile
# from IPython.display import YouTubeVideo
# YouTubeVideo('xGbpuFNR1ME')
# YouTubeVideo('4EXNedimDMs')
from pandas_datareader import data  #allow us to get some information from the web
import datetime  # Library for date input
import matplotlib.pyplot as plt
import seaborn as sns
#%matplotlib inline
prices = data.get_data_yahoo(
    ['CVX', 'XOM', 'BP'],
    start=datetime.datetime(2010, 1, 1),
    end=datetime.datetime(2013, 1,
A    3.0
B    7.0
dtype: float64
'''

# min
print(dframe1.min())
'''
One      1.0
Two      2.0
Three    4.0
dtype: float64
'''

# index of min value
print(dframe1.idxmin())
'''
One      A
Two      A
Three    B
dtype: object
'''

# acumulation
print(dframe1.cumsum())
'''
   One  Two  Three
A  1.0  2.0    NaN
B  NaN  5.0    4.0
'''
Example #33
0
df

pd.isnull(df)
df.isnull()

df.sum()
df.sum(axis = 1)
df.mean()
df.mean(skipna = False)
df.mean(axis = 1)
df.mean(axis = 1, skipna = False)

np.mean(df, axis = 1)

df.idxmax()  # index label of each column's maximum: the top scorer per subject
df.idxmin()  # index label of each column's minimum: the lowest scorer per subject

df.cumsum()  # cumulative sum down the rows
df.cumsum(axis = 1)  # cumulative sum across the columns

df['영어'].sum()
df['영어'].mean()
df['영어'].var()
df['영어'].std()
df['영어'].max()
df['영어'].min()

df.loc['홍길동'].sum()
df.loc['박찬호'].mean()
df.describe()
Example #34
0
frame2
frame2.sort_index(axis=1, ascending=False)

# One index label per value. The original nested list ['S', ['p', 'o', 'u']]
# supplied only two labels for four values, which pandas rejects.
series2 = Series([100, 200, 500, 50], index=['S', 'p', 'o', 'u'])
series2
series2.sort_values()

# Sort the rows by the 'Humidity' column (the method is sort_values, not sort_value)
frame2.sort_values(by='Humidity')

#check for duplicate
series.index.is_unique
#sum
frame2.sum()
frame2.sum(axis=1)
frame2.idxmax()
frame2.idxmin()
#removing nan
from pandas import Series
import  numpy as np
ser = Series([1,2,3,4,np.nan],index=['a','b','c','d','e'])
ser
ser = ser.dropna()
ser
frame2 = frame2.dropna()
#fillna value with 0
frame2.fillna(0)
frame2.fillna(100)
#loading data from file
import  pandas
data_frame = pandas.read_csv("PMTCT.csv")
data_frame
Example #35
-1
def descriptiveStatsDataFrame():
    df = DataFrame([[1.4, np.nan], [7, 5], [np.nan, np.nan], [7,10]], index=['a','b','c','d'], columns=['one','two'])
    print (df)
    print ('Column Sum: \n{}'.format(df.sum(axis=0)))
    print ('Row Sum: \n{}'.format(df.sum(axis=1)))
    print ('Do not skip NA: \n{}'.format(df.sum(axis=1, skipna=False)))
    print ('Index with min Value: \n{}'.format(df.idxmin()))
    print ('Summary Statistic: \n{}'.format(df.describe()))