def multi_index():
    """Print a hierarchically indexed frame, then basic selections on a flat one.

    The first frame holds random values under a two-level row index and a
    two-level column index; it is printed together with its index.  The
    second frame holds ``0..15`` and is used to show selecting a column by
    position, a row by label and a column by label.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    df = DataFrame(
        np.random.randn(16).reshape(4, 4),
        index=[['a', 'b', 'a', 'c'], [1, 2, 3, 2]],
        columns=[["Hot", "Cold", "Hot", "Cold"],
                 ["Good", "Bad", "Bad", "Good"]],
    )
    print(df)
    print(df.index)

    df2 = DataFrame(
        np.arange(16).reshape(4, 4),
        index=["1", '2', '3', '4'],
        columns=["A", "B", "C", "D"],
    )
    print(df2)
    # icol() and .ix no longer exist in pandas: use the explicit
    # position-based (.iloc) and label-based (.loc) accessors instead.
    print(df2.iloc[:, 2])
    print(df2.loc['3'])
    print(df2['C'])
def multi_index():
    """Print a hierarchically indexed frame, then basic selections on a flat one.

    The first frame holds random values under a two-level row index and a
    two-level column index; it is printed together with its index.  The
    second frame holds ``0..15`` and is used to show selecting a column by
    position, a row by label and a column by label.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    df = DataFrame(
        np.random.randn(16).reshape(4, 4),
        index=[['a', 'b', 'a', 'c'], [1, 2, 3, 2]],
        columns=[["Hot", "Cold", "Hot", "Cold"],
                 ["Good", "Bad", "Bad", "Good"]],
    )
    print(df)
    print(df.index)

    df2 = DataFrame(
        np.arange(16).reshape(4, 4),
        index=["1", '2', '3', '4'],
        columns=["A", "B", "C", "D"],
    )
    print(df2)
    # icol() and .ix no longer exist in pandas: use the explicit
    # position-based (.iloc) and label-based (.loc) accessors instead.
    print(df2.iloc[:, 2])
    print(df2.loc['3'])
    print(df2['C'])
Series([1, 2, 3]), Series(['John', 'Amy', 'Mark']), Series([True, False, True]) ]) DataFrame([[1, 2, 3], ['J', 'A', 'M'], [True, False, True]]) 'c' in frame2.index 'Washu' in frame2.columns 'Washu' in frame2 frame2.drop('d') frame2.drop('Washu', axis=1) frame2[frame2['Washu'] > 0] frame2 = frame2.fillna(0) frame2.ix[frame2.Washu > 0, frame2.ix['d'] > 0] frame2.xs('d') frame2.xs('UM', axis=1) frame2.icol(2) frame2.irow(4) frame2.add(frame3, fill_value=0) frame2.applymap(lambda x: '%.2f' % x) df = DataFrame(np.random.randn(4, 3), index=['a', 'a', 'b', 'b']) df.index.is_unique df.index.unique() df.ix['a'] frame2.describe() """ Unique values, value counts, membership Not unique indices """ obj = Series(list('cbdaabbcc')) obj.unique() s = obj.value_counts()
def looding():
    """Build the per-city traffic workbook from the daily KPI exports.

    The function takes no arguments and returns nothing; all work happens
    through files in the current working directory:

    * reads ``4G指标<date>.csv``, ``<date>wcdma.csv``, ``<date>gsm.xls`` and
      ``校园清单0802.xlsx`` (``<date>`` is the hard-coded ``data1`` below);
    * derives one traffic column per network, joins every network's rows
      with the cells named in the campus list, and pivots the results by
      city;
    * writes all summaries to ``result_alltest<date>.xlsx``, one sheet each.

    The elapsed time in minutes and a progress message are printed after
    each of the three phases (read, aggregate, write).
    """
    # Function-scope imports keep the block self-contained: default_timer
    # replaces time.clock() (removed in Python 3.8) and concat replaces
    # DataFrame.append (removed in pandas 2.0).
    from timeit import default_timer as timer
    from pandas import concat

    # Row order used for every per-city summary sheet.
    cities = [
        u'南宁', u'柳州', u'桂林', u'崇左', u'梧州', u'贵港', u'百色',
        u'北海', u'玉林', u'钦州', u'贺州', u'来宾', u'防城港', u'河池',
    ]

    # ---- phase 1: read the inputs ------------------------------------
    start = timer()
    data1 = '0814'
    dflte = read_csv(u'4G指标%s.csv' % data1, skiprows=[0], header=None,
                     encoding="gbk")
    dfwcdma = read_csv(u'%swcdma.csv' % data1, skiprows=[0], header=None,
                       encoding="gbk")
    dfgsm = read_excel(u'%sgsm.xls' % data1, skiprows=[0], header=None,
                       sheet_name='Sheet')
    dfnamelist = read_excel(u'校园清单0802.xlsx', skiprows=[0], header=None)
    end = timer()
    print(str((end - start) / 60) + 'mins')
    print(u'完成读取')

    # ---- phase 2: aggregate ------------------------------------------
    start = timer()
    # Derived columns.  The divisors rescale raw counters
    # (presumably bytes -> MB for LTE and bytes -> GB for GSM/WCDMA;
    # TODO confirm the units against the export format).
    dflte[11] = dflte[7].astype(float) + dflte[8].astype(float)
    dflte[11] = dflte[11] / (1024 * 1024)
    dflte[12] = dflte[2] * 256 + dflte[3]
    dfgsm[5] = dfgsm[5] / (1024 * 1024 * 1024)
    dfwcdma[19] = dfwcdma[10] + dfwcdma[11]
    dfwcdma[19] = dfwcdma[19] / (1024 * 1024 * 1024)

    # Campus cell lists, split by the network named in column 6.  The
    # frames were read with header=None, so columns carry integer labels
    # and the former .ix[:, [...]] lookups were label-based -> .loc.
    varssnamelte1 = dfnamelist[dfnamelist[6].str.contains(u'LTE')]
    varssnamelte = varssnamelte1.loc[:, [8]]
    varssnamewcdma = dfnamelist[dfnamelist[6].str.contains(u'WCDMA')]
    varssnamegsm = dfnamelist[dfnamelist[6].str.contains(u'GSM')]
    varssnamewcdmagsm1 = concat([varssnamewcdma, varssnamegsm],
                                ignore_index=True)
    varssnamewcdmagsm = varssnamewcdmagsm1.loc[:, [7, 8]]
    varssnamewcdmagsm.columns = ['LAC', 'CI']
    varssnamelte.columns = ['cellid']
    varssnamelte = varssnamelte.drop_duplicates(['cellid'])

    dflte.columns = [
        'city', 'changjia', 'ENODB ID', 'id', 'cellid1', 'data', 'time',
        'shang', 'xia', 'PRB', 'rrc', 'ltetraffic', 'cellid'
    ]
    dfgsm.columns = [
        'time', 'nodbname', 'LAC', 'CI', 'huawu', 'gsmtraffic', 'city', 'PRB'
    ]
    dfwcdma.columns = [
        '0', 'city', '2', '3', 'LAC', 'CI', '6', '7', '8', 'huawu', '10',
        '11', '12', '13', '14', '15', '16', '17', 'PRB', 'wcdmatraffic'
    ]

    # Keep only the rows whose cell appears in the campus list.
    dflteschool = merge(dflte, varssnamelte, how='inner', on='cellid')
    dfgsmschool = merge(dfgsm, varssnamewcdmagsm, how='inner',
                        on=['LAC', 'CI'])
    dfwcdmaschool = merge(dfwcdma, varssnamewcdmagsm, how='inner',
                          on=['LAC', 'CI'])

    # Zero-traffic campus cells for GSM and WCDMA.
    dfgsmschool_zero = dfgsmschool[dfgsmschool['huawu'] == 0]
    dfwcdmaschool_zero = dfwcdmaschool[dfwcdmaschool['huawu'] == 0]
    dfwcdmaschool_zero = dfwcdmaschool_zero[
        dfwcdmaschool_zero['wcdmatraffic'] == 0]

    # LTE cells whose traffic sums to zero over exactly 24 records.
    # NOTE(review): this pivot is built from dflte (all cells), not from
    # dflteschool, although the result is named and exported as a campus
    # list -- confirm that this is intended.
    pivot_tablelte1_zero = pivot_table(dflte, index=['cellid1', 'city'],
                                       values=['ltetraffic'],
                                       aggfunc=[np.sum, len])
    dflteschool_zero = pivot_tablelte1_zero[
        pivot_tablelte1_zero.iloc[:, 0] == 0]
    # .copy() so the column added below lands on an independent frame
    # rather than on a filtered view of the pivot table.
    dflteschool_zero = dflteschool_zero[
        dflteschool_zero.iloc[:, 1] == 24].copy()
    dflteschool_zero['ltetraffic'] = dflteschool_zero.iloc[:, 1] / 24
    dflteschool_zero.index.names = ['key1', 'city']

    # Per-city pivots: all cells, then campus cells only.
    stats = [np.sum, np.mean, len]
    pivot_tablelte = pivot_table(dflte, index=['city'],
                                 values=['ltetraffic', 'PRB'], aggfunc=stats)
    pivot_tablewcdma = pivot_table(dfwcdma, index=['city'],
                                   values=['wcdmatraffic', 'PRB'],
                                   aggfunc=stats)
    pivot_tablegsm = pivot_table(dfgsm, index=['city'],
                                 values=['gsmtraffic', 'PRB'], aggfunc=stats)
    pivot_tableschoollte = pivot_table(dflteschool, index=['city'],
                                       values=['ltetraffic', 'PRB'],
                                       aggfunc=stats)
    pivot_tableschoolwcdma = pivot_table(dfwcdmaschool, index=['city'],
                                         values=['wcdmatraffic', 'PRB'],
                                         aggfunc=stats)
    pivot_tableschoolgsm = pivot_table(dfgsmschool, index=['city'],
                                       values=['gsmtraffic', 'PRB'],
                                       aggfunc=stats)

    # Side-by-side summaries: second pivot column of each network, re-indexed
    # to the fixed city order, plus a total stored under the column label 4.
    tempcell = DataFrame([
        pivot_tablelte.iloc[:, 1],
        pivot_tablewcdma.iloc[:, 1],
        pivot_tablegsm.iloc[:, 1],
    ])
    tempcell = DataFrame(tempcell.T, index=cities)
    tempcell[4] = (tempcell.iloc[:, 0] + tempcell.iloc[:, 1]
                   + tempcell.iloc[:, 2])

    tempschool = DataFrame([
        pivot_tableschoollte.iloc[:, 1],
        pivot_tableschoolwcdma.iloc[:, 1],
        pivot_tableschoolgsm.iloc[:, 1],
    ])
    tempschool = DataFrame(tempschool.T, index=cities)
    tempschool[4] = (tempschool.iloc[:, 0] + tempschool.iloc[:, 1]
                     + tempschool.iloc[:, 2])

    # Third pivot column of each campus pivot; only the LTE one is rescaled.
    tempschoolprb = DataFrame([
        pivot_tableschoollte.iloc[:, 2] / (0.9 * 0.6 * 0.5 * 0.01),
        pivot_tableschoolwcdma.iloc[:, 2],
        pivot_tableschoolgsm.iloc[:, 2],
    ])
    tempschoolprb = DataFrame(tempschoolprb.T, index=cities)

    # sum(level=...) was removed from pandas; group on the index level.
    tempschool_zero = DataFrame(
        dflteschool_zero['ltetraffic'].groupby(level='city').sum(),
        index=cities)
    end = timer()
    print(str((end - start) / 60) + 'mins')
    print(u'完成统计')

    # ---- phase 3: write the workbook ---------------------------------
    start = timer()
    sheets = [
        (tempcell, u'按地市流量汇总'),
        (tempschool, u'按校园地市流量汇总'),
        (tempschoolprb, u'按校园地市资源利用率汇总'),
        (tempschool_zero, u'零话务小区汇总'),
        (pivot_tablelte, u'lte按地市透视图'),
        (pivot_tableschoollte, u'lte校园按地市透视图'),
        (dflteschool, u'lte校园按地市清单'),
        (pivot_tablegsm, u'gsm按地市透视图'),
        (pivot_tableschoolgsm, u'gsm校园按地市透视图'),
        (pivot_tablewcdma, u'wcdma按地市透视图'),
        (pivot_tableschoolwcdma, u'wcdma校园按地市透视图'),
        (dflteschool_zero, u'lte校园零话务小区清单'),
        (dfwcdmaschool_zero, u'wcdma校园零话务小区清单'),
        (dfgsmschool_zero, u'gsm校园零话务小区清单'),
    ]
    with ExcelWriter('result_alltest%s.xlsx' % data1) as writer:
        for table, sheet in sheets:
            # The former encoding="gbk" argument was dropped: to_excel never
            # used it for .xlsx output and current pandas rejects it.
            table.to_excel(writer, sheet_name=sheet)
    # Take the end time again; previously the stale value from phase 2 was
    # reused, so the reported write duration was wrong.
    end = timer()
    print(str((end - start) / 60) + 'mins')
    print(u'完成输出文档')
# Quick reference for selecting from a DataFrame `data` and a Series `ser`.
# irow/icol/iget_value and .ix were removed from pandas; the equivalent
# .iloc / .iat forms are used below.
data.w  # select column 'w' through attribute access; returns a Series
data[['w']]  # select column 'w'; returns a DataFrame
data[['w', 'z']]  # select columns 'w' and 'z'
data[0:2]  # rows 1 to 2; half-open interval (start included, end excluded)
data[1:2]  # the 2nd row (counting from 0) as a single-row slice;
# data[1] would raise an error
data.iloc[1:2]  # a third way to get the 2nd row; a DataFrame, same as data[1:2]
data['a':'b']  # slice by index label; returns a DataFrame that is **closed on
# both ends**, i.e. the end label is included
data.iloc[0]  # first row of data
data.iloc[:, 0]  # first column of data
data.head()  # first rows of data, five by default; use data.head(10) for ten
data.tail()  # last rows of data, five by default; use data.tail(10) for ten
ser.iat[0]  # first element of ser
ser.iat[-1]  # last element of ser; a Series whose index holds integer labels
# cannot use ser[-1] for this because it would be ambiguous
data.iloc[-1]  # last row of the DataFrame, returned as a Series
data.iloc[-1:]  # last row of the DataFrame, returned as a DataFrame
data.loc['a', ['w', 'x']]  # row 'a', columns 'w' and 'x'; for known row/column labels
data.iat[1, 1]  # second row, second column; for known row/column positions
index=[2,0,1] ) frame ''' 0 1 2 0 1 0 2 3 1 4 5 ''' frame.irow(0) # get row 0 (index==2), get values of all its cols ''' 0 0 1 1 Name: 2, dtype: int64 ''' frame.irow(2) # get row 2 (index==1), get vals of all its cols ''' 0 4 1 5 Name: 1, dtype: int64 ''' frame.icol(1) # in this case, ==frame.icol(-1) ''' 2 1 0 3 1 5 Name: 1, dtype: int64 '''
def main():
    """Walk through basic Series/DataFrame operations, printing each result.

    Sections, in order: reindexing, dropping entries, indexing and slicing,
    index-aligned arithmetic, arithmetic with fill values, frame/series
    broadcasting, function application, sorting, ranking, and indexes with
    duplicate labels.

    Accessors that no longer exist in pandas (``.ix``, ``icol``, ``irow``,
    ``Series.order``, ``sort_index(by=...)``) are replaced by ``.loc`` /
    ``.iloc`` / ``reindex`` / ``sort_values``.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    # reindex
    obj = Series(range(4), index="a b c d".split(" ")[::-1])
    print(obj)
    obj2 = obj.reindex("a b c d e".split(" "))
    print(obj2)
    # Change NaN
    print(obj.reindex("a b c d e".split(" "), fill_value=0))
    colors = ["blue", "purple", "yellow"]
    index = [0, 2, 4]
    obj3 = Series(colors, index=index)
    print(obj3.reindex(range(6)))
    print(obj3.reindex(range(6), method="ffill"))  # not found forward fill
    print(obj3.reindex(range(6), method="backfill"))  # bfill

    # DataFrame
    states = ["Ohio", "Texas", "California"]
    rows = "a b c d".split(" ")
    frame = DataFrame(np.arange(9).reshape((3, 3)),
                      index="a b c".split(" "),
                      columns=["Ohio", "Texas", "California"])
    print(frame)
    frame2 = frame.reindex(rows)
    print(frame2)
    states[0] = "Utah"
    states[1], states[0] = states[:2]
    print(frame.reindex(columns=states))
    # fill: forward-fill the rows first, then pick the columns.  Doing both
    # in one call applies the fill method to the unsorted column labels as
    # well, which current pandas rejects.
    print(frame.reindex(index=rows, method="ffill").reindex(columns=states))
    # .ix tolerated labels missing from the axis; reindex is the explicit
    # equivalent (missing labels yield NaN).
    print(frame.reindex(rows))
    print(frame.reindex(index=rows, columns=states))

    # Delete column
    print("")
    obj = Series(range(5), index="a b c d e".split(" "))
    new_obj = obj.drop("c")
    print(new_obj)
    print(obj)

    # Index reference
    print("")
    obj = Series(np.arange(4.0), index="a b c d".split(" "))
    print(obj["b"])
    print(obj.iloc[1])  # same element, by position
    print(obj.iloc[2:4])
    print(obj[["b", "a", "c"]])
    print(obj.iloc[[1, 3]])
    print(obj[obj < 2])
    # Slice with label
    print(obj["b":"c"])  # include 'c'
    obj["b":"c"] = 5
    print(obj)

    data = DataFrame(
        np.arange(16).reshape((4, 4)),
        index=["Ohio", "Colorado", "Utah", "New York"],
        columns=["one", "two", "three", "four"],
    )
    print(data)
    # column
    print(data["two"])
    print(data[["three", "one"]])
    # row
    print(data[:2])
    print(data[data["three"] > 5])
    # all values
    print(data < 5)
    data[data < 5] = 0
    print(data)
    # row and column
    print(data.loc[["Colorado"], ["two", "three"]])
    # rows by label, columns by position
    print(data.loc[["Colorado", "Utah"], data.columns[[3, 0, 1]]])
    # row
    print(data.iloc[2])
    # label row and column, return column
    print(data.loc[:"Utah", "two"])

    # xs
    # row
    print(data.xs("Utah"))
    print(data.xs("Utah", axis=0))
    # column
    print(data.xs("two", axis=1))
    # selection by integer position (formerly icol/irow)
    print(data.iloc[:, 1])
    print(data.iloc[1])

    # Union
    print("")
    s1 = Series([7.3, -2.5, 3.4, 1.5], index=["a", "c", "d", "e"])
    s2 = Series([-2.1, 3.6, -1.5, 4, 3.1], index=["a", "c", "e", "f", "g"])
    print(s1)
    print(s2)
    # index is union, but d, f, g are NaN
    print(s1 + s2)
    df1 = DataFrame(np.arange(9.0).reshape((3, 3)), columns=list("bcd"),
                    index=["Ohio", "Texas", "Colorado"])
    df2 = DataFrame(np.arange(12.0).reshape((4, 3)), columns=list("bde"),
                    index=["Utah", "Ohio", "Texas", "Oregon"])
    print(df1)
    print(df2)
    print(df1 + df2)

    # arithmetic method
    print("")
    df1 = DataFrame(np.arange(12.0).reshape((3, 4)), columns=list("abcd"))
    df2 = DataFrame(np.arange(20.0).reshape((4, 5)), columns=list("abcde"))
    print(df1)
    print(df2)
    print(df1.add(df2, fill_value=0))
    # reindex has fill_value argument
    # other arithmetic methods are sub/div/mul(ti)

    # Calculation in a DataFrame and Series
    print("")
    # subtract from each row (broadcast)
    arr = np.arange(12.0).reshape((3, 4))
    print(arr)
    print(arr[0])
    print(arr - arr[0])
    frame = DataFrame(np.arange(12.0).reshape((4, 3)), columns=list("bde"),
                      index=["Utah", "Ohio", "Texas", "Oregon"])
    series = frame.iloc[0]
    print(frame)
    print(series)
    print(frame - series)
    series2 = Series(range(3), index=list("bef"))
    print(frame + series2)
    series3 = frame["d"]
    series4 = frame.iloc[0]
    print(frame)
    print(series3)
    print(series4)
    print(frame.sub(series3, axis=0))
    print(frame.sub(series4, axis=1))

    # apply function and mapping
    print("")
    frame = DataFrame(np.arange(12.0).reshape((4, 3)), columns=list("bde"),
                      index=["Utah", "Ohio", "Texas", "Oregon"])
    print(frame)

    def value_range(values):
        return values.max() - values.min()

    print(frame.apply(value_range))
    print(frame.apply(value_range, axis=1))

    def min_and_max(values):
        return Series([values.min(), values.max()], index=["min", "max"])

    print(frame.apply(min_and_max))

    # Named so that it does not shadow the builtin format().
    def two_decimals(value):
        return "{0:.2f}".format(value)

    # Element-wise on a frame: map every column (DataFrame.applymap is
    # deprecated in current pandas).
    print(frame.apply(lambda column: column.map(two_decimals)))  # frame
    print(frame["e"].map(two_decimals))  # series

    # sort and rank
    print("")
    obj = Series(range(4), index=list("dabc"))
    print(obj)
    print(obj.sort_index())
    frame = DataFrame(np.arange(8).reshape((2, 4)), index=["three", "one"],
                      columns=list("dabc"))
    print(frame)
    print(frame.sort_index())
    print(frame.sort_index(axis=1))
    print(frame.sort_index(axis=1, ascending=False))

    # Sorting series by value (formerly Series.order)
    print("")
    obj = Series([4, 7, -3, 2])
    print(obj.sort_values())
    obj = Series([4, np.nan, 7, np.nan, -3, 2])
    print(obj.sort_values())
    print(obj.sort_values(ascending=False))

    # order by multi columns (formerly sort_index(by=...))
    print("")
    frame = DataFrame({"b": [4, 7, -3, 2], "a": [0, 1, 0, 1]})
    print(frame.sort_values(by=["a", "b"]))

    # rank
    print("")
    obj = Series([7, -5, 7, 4, 2, 0, 4])
    print(obj.rank())  # method is average
    print(obj.rank(method="first"))  # No Duplicates
    print(obj.rank(ascending=False, method="min"))
    print(obj.rank(ascending=False, method="max"))
    f1 = obj.to_frame("data")
    f2 = obj.rank().to_frame("rank")
    # merge by each index
    print(pd.merge(f1, f2, left_index=True, right_index=True))

    # Index of the axis with duplicate values
    print("")
    obj = Series(range(5), index=list("aaabc"))
    print(obj)
    print(obj.index.is_unique)
    print(obj["a"])
    print(obj["c"])
    df = DataFrame(np.arange(12.0).reshape((4, 3)), index=list("aabb"),
                   columns=list("ccd"))
    print(df)
    print(df.loc["b"])
    print(df["c"])
# Cell-by-cell tour of DataFrame selection.  irow/icol/get_value/iget_value
# and .ix were removed from pandas; .iloc/.loc/.at/.iat are used instead.
#%%
d[0]  # error
#%%
d['a']  # Series (column)
#%%
d[['a', 'c']]  # DataFrame (columns)
#%%
d[:5]  # DataFrame (rows)
#%%
d.iloc[:5]  # position-based (rows)
#%%
d1.loc[:5]  # label-based (rows)
#%%
d.iloc[0]  # Series
#%%
d.iloc[:, 0]  # Series
#%%
d.at['e', 'a']  # at[row_name, col_name]
#%% Accessing an element strictly by position
d.iat[0, 1]  # iat[irow, icol]
#%% Filtering with a condition
d[d > 5]
#%%
d[d.a > 5]
#%%
d[(d > 5) & (d % 3 == 0)]
#%% What conditional filtering is built on
d > 5  # DataFrame
#%%
class GradeBook(object):
    """Store the grades of a whole class in a pandas DataFrame.

    The method ``compute_total_grades`` computes the total grade of each
    student according to weights provided by the caller.
    """

    def __init__(self, grade_arr, student_ids, item_list, max_scores):
        """Build the grade book.

        Sets the following attributes:

        (1) ``self.raw_grades``, a DataFrame with
            - row labels given by ``student_ids``
            - column labels given by ``item_list``
            - values given by ``grade_arr``
        (2) ``self.total_grades``, set to None
        (3) ``self.letter_grades``, set to None
        (4) ``self.max_scores``, set to ``max_scores``

        Parameters
        ----------
        grade_arr : numpy array
            Grades, as returned by simulate_grades.
        student_ids : list
            Student ids.
        item_list : list
            Grade items (e.g. ['HW', 'M', 'F']).
        max_scores : list
            Maximum possible score for each grade item.

        Examples
        --------
        >>> a = GradeBook(array([[1,2],[3,4]]),['22','34'],['F','M'],[30, 50])
        >>> a.letter_grades is None
        True
        >>> a.total_grades is None
        True
        >>> a.raw_grades.shape == (2,2)
        True
        >>> int(a.raw_grades.iloc[0, 0])
        1
        >>> a.max_scores[0] == 30
        True
        """
        self.raw_grades = DataFrame(grade_arr, index=student_ids,
                                    columns=item_list)
        self.total_grades = None
        self.letter_grades = None
        self.max_scores = max_scores

    def compute_total_grades(self, item_weights=None, max_score=100):
        """Compute every student's total class grade.

        Each raw grade is first expressed as a fraction of the item's
        maximum score (``self.max_scores``); the fractions are combined as
        a weighted average and scaled to ``max_score``::

            total = max_score * sum_i(item_weights[i] * raw[i] / max_scores[i])

        The result is stored in the Series attribute ``self.total_grades``.

        Parameters
        ----------
        item_weights : list of floats summing up to one, optional
            Weight applied to each grade item, in column order
            (e.g. [0.3, 0.4, 0.3]).  When omitted, every item gets the
            same weight.
        max_score : float
            Maximal possible score for the total class grade.

        Returns
        -------
        out : Series
            Numerical summary of the total grade distribution, i.e. the
            output of ``self.total_grades.describe()``.

        Raises
        ------
        ValueError
            If the number of weights or of maximum scores differs from the
            number of grade items.

        Examples
        --------
        >>> a = GradeBook(array([[5,5],[1,1]]),['22','34'],['F','M'],[10, 10])
        >>> b = a.compute_total_grades([0.5, 0.5], 100)
        >>> len(b) == 5
        False
        >>> float(a.total_grades['22'])
        50.0
        >>> float(a.total_grades['34'])
        10.0
        """
        n_items = len(self.raw_grades.columns)
        if item_weights is None:
            # Equal weights; guard the division for a book without items.
            item_weights = [1.0 / n_items] * n_items if n_items else []
        weights = list(item_weights)
        max_scores = list(self.max_scores)
        if len(weights) != n_items or len(max_scores) != n_items:
            raise ValueError(
                "expected %d weights and %d maximum scores, got %d and %d"
                % (n_items, n_items, len(weights), len(max_scores)))

        # Column-wise broadcasting covers every grade item, including the
        # last one, which the former range(len(columns) - 1) loop skipped.
        fractions = self.raw_grades.div(max_scores, axis="columns")
        weighted = fractions.mul(weights, axis="columns")
        self.total_grades = weighted.sum(axis=1) * max_score
        return self.total_grades.describe()
# Load the targets and the test split for the current `day_time`.
# train_x is expected to be defined before this point.
train_y = pd.read_csv('../train_knn_1/train_y' + day_time + '.csv')
test_x = pd.read_csv('../test_knn_1/test_x' + day_time + '.csv')
test_y = pd.read_csv('../test_knn_1/test_y' + day_time + '.csv')

# Fit the scaler on the training features only and reuse it on the test set.
scaler = preprocessing.MinMaxScaler()
train_x = scaler.fit_transform(train_x)
test_x = scaler.transform(test_x)

KNN = KNeighborsRegressor(n_neighbors=5, weights='distance',
                          algorithm='auto', p=1)
# NOTE(review): recent scikit-learn releases appear to rename
# `base_estimator` to `estimator` -- confirm against the installed version.
Bagging_KNN = BaggingRegressor(base_estimator=KNN, n_estimators=100,
                               random_state=1, max_features=0.2)
#Boosting_KNN = AdaBoostRegressor(base_estimator=KNN,n_estimators=100,random_state=1)

# One model fit per target column; predictions are rounded before scoring.
pre = DataFrame()
for i in range(7):
    # icol() was removed from pandas; .iloc[:, i] selects column i by position.
    Bagging_KNN.fit(train_x, list(train_y.iloc[:, i].values))
    pre['col_' + str(i)] = (Bagging_KNN.predict(test_x)).round()
    tmp_score = calculate_score(pre.iloc[:, i].values,
                                test_y.iloc[:, i].values)
    # Same text as the former `print str(i) + ': ', tmp_score` statement.
    print(str(i) + ':  ' + str(tmp_score))
# NOTE(review): the overall score is assumed to be computed after the loop,
# once every prediction column exists -- confirm against the original layout.
score = calculate_score(pre.values, test_y.values)
print(score)