def pd_04():
    obj = Series(range(4), index=['d', 'a', 'b', 'c'])
    print obj
    print obj.sort_index()
    frame = DataFrame(np.arange(8).reshape(2, 4), index=['three', 'one'],
                      columns=['d', 'a', 'b', 'c'])
    print frame.sort_index()
    print frame.sort_index(axis=1)
    print frame.sort_index(ascending=False)
    obj1 = Series([4, 7, -3, 2])
    print obj1.order()
    print frame.sort_index(by='b')
    print frame.sort_index(by=['a', 'b'])
    print frame.describe()
def __init__(self, df, column, n):
    # gets the most frequent words in a document
    texto = " ".join(str(x) for x in df[column].values)
    tokens = texto.split()
    tokens = [x.lower() for x in tokens]
    #stopset = set(stopwords.words('english'))  # dictionary of stop words
    #tokens = [w for w in tokens if not w in stopset]
    stemmer = SnowballStemmer("english")
    stemm_words = []
    tokens_clean = []
    for j in tokens:
        sa = re.sub('[^A-Za-z]+', '', j)
        tokens_clean.append(sa)
    #print tokens_clean
    for s in tokens_clean:
        try:
            stem = stemmer.stem(s)
            if s != '':
                stemm_words.append(str(stem))
        except:
            pass
    cuenta = len(tokens_clean)
    largo = Counter(stemm_words).most_common(n)
    topdic = dict(largo)
    asortado = Series(topdic)
    # asortadol = asortado.columns = ['a', 'b']  # unused; a Series has no columns to assign
    ordenado = asortado.order(ascending=False)
    ordenadolist = topdic.keys()  #+stemm_words
    self.top = ordenadolist
def calculate_pca(forwards, no_factors=3):
    fwddiff = forwards.diff()
    fwddiff = fwddiff.dropna()
    covmat = fwddiff.cov()
    covmat = covmat * 252 / 10000
    eigenvecs, eigenmat = jacobi(covmat.values)
    eigvecs = Series(eigenvecs, index=covmat.columns)
    sorted_eigvecs = eigvecs.order(ascending=False)
    top3 = sorted_eigvecs[:no_factors].index
    eigenmat_df = DataFrame(eigenmat, index=covmat.columns, columns=covmat.columns)
    filtered_eigenmat = eigenmat_df.filter(top3)
    return sorted_eigvecs, filtered_eigenmat
def test_order(self):
    ts = self.ts.copy()
    ts[:5] = np.NaN
    vals = ts.values

    result = ts.order()
    self.assert_(np.isnan(result[-5:]).all())
    self.assert_(np.array_equal(result[:-5], np.sort(vals[5:])))

    result = ts.order(na_last=False)
    self.assert_(np.isnan(result[:5]).all())
    self.assert_(np.array_equal(result[5:], np.sort(vals[5:])))

    # something object-type
    ser = Series(['A', 'B'], [1, 2])
    # no failure
    ser.order()

    # ascending=False
    ordered = ts.order(ascending=False)
    expected = np.sort(ts.valid().values)[::-1]
    assert_almost_equal(expected, ordered.valid().values)

    ordered = ts.order(ascending=False, na_last=False)
    assert_almost_equal(expected, ordered.valid().values)
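# Note: in modern pandas (0.17+) Series.order() was replaced by sort_values(),
# and the na_last flag exercised above became na_position. A minimal sketch of
# the equivalent calls (not part of the original test):
import numpy as np
import pandas as pd

s = pd.Series([4.0, np.nan, 7.0, np.nan, -3.0, 2.0])
print(s.sort_values())                                   # NaNs at the end (old na_last=True default)
print(s.sort_values(na_position='first'))                # old order(na_last=False)
print(s.sort_values(ascending=False, na_position='first'))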
    return Series([x.max(), x.min()], index=['max', 'min'])

df1.apply(f)
format11 = lambda x: '%.2f' % x
df1.applymap(format11)
df1['c'].map(format11)

# sorting
se1 = Series(np.arange(4), index=list('cadb'))
se1.sort_index()

df1 = DataFrame(np.arange(8).reshape((2, 4)),
                index=['three', 'one'],
                columns=list('cdab'))
df1.sort_index(axis=1, ascending=False)

se2 = Series([-1, 6, np.nan, 3, -7])
se2.order()

df2 = DataFrame({'b': [1, 6, 3, -7], 'a': [2, 0, 4, -1]})
df2.sort_index(by='b')
df2.sort_index(by=['a', 'b'])

# sort independently
se3 = Series([1, 6, 4, 3, 8, 7, 1])
se3.rank()  # ties get the average rank by default
se3.rank(method='first')
se3.rank(method='min')
se3.rank(method='average')
se3.rank(method='max', ascending=False)

# 6. repeated index
se1.index.is_unique
# Sorting data
# ==========================================================
#%%
import numpy as np
from numpy import arange, repeat
from pandas import Series, DataFrame
from string import letters

s1 = Series(arange(10, dtype=np.float64), index=list(letters[:10])[::-1])
s1[5] = np.nan
d1 = DataFrame(
    {'a': range(10), 'b': repeat([0, 1], 5), 'c': repeat([0, 1, 2, 3, 4], 2)},
    index=list(letters[:10])
)
d2 = d1.set_index([d1.index, d1['b']])
d2.index.names = ['r0', 'r1']
#%%
s1.order()
#%%
s1.order(ascending=False)
#%%
s1.order(na_last=False)
#%%
s1.sort_index()
#%% DataFrame has no order method
d1.order()  # error
#%% sort by index
d1.sort_index()
#%% sort by column values
d1.sort(['b', 'c'], ascending=[1, 0])
#%%
d1.sort(['c', 'b'], ascending=[1, 0])
# Sorting and ranking
obj = Series(range(4), index=['d', 'a', 'b', 'c'])
print(obj.sort_index())

frame = DataFrame(np.arange(8).reshape((2, 4)),
                  index=['three', 'one'],
                  columns=['d', 'a', 'b', 'c'])
print(frame.sort_index())
print(frame.sort_index(axis=1))
print(frame.sort_index(axis=1, ascending=False))

obj = Series([4, 7, -3, 2])
print(obj.order())

obj = Series([4, np.nan, 7, np.nan, -3, 2])
print(obj.order())

frame = DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
print(frame)
print(frame.sort_index(by='b'))
print(frame.sort_index(by=['a', 'b']))

obj = Series([7, -5, 7, 4, 2, 0, 4])
print(obj.rank())
print(obj.rank(method='first'))
    n = 1
    while True:
        f += n
        yield f
        n += 1

# <codecell>

# hint: list(islice([0,1,2,3,4], 2)) is [0, 1]
f = triangle()
list(islice(f, 4))

# <codecell>

s = Series(arange(3), index=['a', 'b', 'c'])
s['b']

# <codecell>

s.order()[::-1][0]

# <codecell>

df = DataFrame([{'name': 'UC Berkeley', 'city': 'Berkeley', 'state': 'CA'},
                {'name': 'MIT', 'city': 'Cambridge', 'state': 'MA'},
                {'name': 'Stanford', 'city': 'Stanford', 'state': 'CA'},
                {'name': 'Harvard', 'city': 'Cambridge', 'state': 'MA'}])
sum(df[df.state == 'CA'].name.str.len())
# put into words what we're trying to calculate and give the answer
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

ser1 = Series(range(3), index=['C', 'A', 'B'])
ser2 = ser1.sort_index()
print(ser2)
print(ser1.order())

from numpy.random import randn
ser3 = Series(randn(10))
print(ser3.rank())
ser3.sort()   # sorts in place and returns None, so print the Series afterwards
print(ser3)
# sorting
obj = Series(range(4), index=['d', 'a', 'b', 'c'])
obj.sort_index()

# this works just like you would expect with dataframe
frame = DataFrame(np.arange(8).reshape((2, 4)),
                  index=['three', 'one'],
                  columns=['d', 'a', 'b', 'c'])
frame.sort_index()
frame.sort_index(axis=1)
frame.sort_index(axis=1, ascending=False)

# now shift gears and sort by values
obj = Series([4, 7, -3, 2])
obj.order()  # error
obj.sort_values()

obj = Series([4, np.nan, 7, np.nan, -3, 2])
obj.sort_values()

frame = DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame
frame.sort_index(by='b')   # from old version of book
frame.sort_values(by='b')  # you can make this a list if you like

# ranking
obj = Series([7, -5, 7, 4, 2, 0, 4])
obj.rank()
obj.rank(method='first')
'''
# The data is sorted in ascending order by default, but can be sorted in
# descending order, too:
print(frame.sort_index(axis=1, ascending=False))
'''
       d  c  b  a
three  0  3  2  1
one    4  7  6  5

[2 rows x 4 columns]
'''
# To sort a Series by its values, use its order method:
obj = Series([4, 7, -3, 2])
print(obj.order())
'''
2   -3
3    2
0    4
1    7
dtype: int64
'''
'''
Ranking is closely related to sorting, assigning ranks from one through the
number of valid data points in an array. It is similar to the indirect sort
indices produced by numpy.argsort, except that ties are broken according to a
rule. The rank methods for Series and DataFrame are the place to look; by
default rank breaks ties by assigning each group the mean rank:
'''
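# The excerpt above stops where the book's rank example would follow. A minimal
# illustration of the default mean-rank tie-breaking described in that passage:
import pandas as pd

obj = pd.Series([7, -5, 7, 4, 2, 0, 4])
print(obj.rank())
# The two 7s occupy ranks 6 and 7, so each receives the mean rank 6.5;
# the two 4s occupy ranks 4 and 5 and each receives 4.5.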
pd.read_csv('d:data/ex5.csv', na_values=sentinels)

# Reading text files in pieces
result = pd.read_csv('d:data/ex6.csv')
result
pd.read_csv('d:data/ex6.csv', nrows=5)

chunker = pd.read_csv('d:data/ex6.csv', chunksize=1000)
chunker

chunker = pd.read_csv('d:data/ex6.csv', chunksize=1000)
tot = Series([])
for piece in chunker:
    tot = tot.add(piece['key'].value_counts(), fill_value=0)
tot = tot.order(ascending=False)
tot[:10]

# Writing data out to text format
data = pd.read_csv('d:data/ex5.csv')
data
data.to_csv('d:data/out.csv')
data.to_csv(sys.stdout, sep='|')
data.to_csv(sys.stdout, na_rep='NULL')
data.to_csv(sys.stdout, index=False, header=False)
data.to_csv(sys.stdout, index=False, columns=['a', 'b', 'c'])
def main():
    # reindex
    obj = Series(range(4), index="a b c d".split(" ")[::-1])
    print obj
    obj2 = obj.reindex("a b c d e".split(" "))
    print obj2
    # Change NaN
    print obj.reindex("a b c d e".split(" "), fill_value=0)

    colors = ["blue", "purple", "yellow"]
    index = [0, 2, 4]
    obj3 = Series(colors, index=index)
    print obj3.reindex(range(6))
    print obj3.reindex(range(6), method="ffill")     # not found -> forward fill
    print obj3.reindex(range(6), method="backfill")  # bfill

    # DataFrame
    states = ["Ohio", "Texas", "California"]
    frame = DataFrame(np.arange(9).reshape((3, 3)), index="a b c".split(" "),
                      columns=["Ohio", "Texas", "California"])
    print frame
    frame2 = frame.reindex("a b c d".split(" "))
    print frame2
    states[0] = "Utah"
    states[1], states[0] = states[:2]
    print frame.reindex(columns=states)
    # fill
    print frame.reindex("a b c d".split(" "), method="ffill", columns=states)
    print frame.ix["a b c d".split(" ")]
    print frame.ix["a b c d".split(" "), states]

    # Delete column
    print "", ""
    obj = Series(range(5), index="a b c d e".split(" "))
    new_obj = obj.drop("c")
    print new_obj
    print obj

    # Index reference
    print "", ""
    obj = Series(np.arange(4.0), index="a b c d".split(" "))
    print obj["b"]
    print obj[1]  # same
    print obj[2:4]
    print obj[["b", "a", "c"]]
    print obj[[1, 3]]
    print obj[obj < 2]
    # Slice with label
    print obj["b":"c"]  # includes 'c'
    obj["b":"c"] = 5
    print obj

    data = DataFrame(
        np.arange(16).reshape((4, 4)),
        index=["Ohio", "Colorado", "Utah", "New York"],
        columns=["one", "two", "three", "four"],
    )
    print data
    # column
    print data["two"]
    print data[["three", "one"]]
    # row
    print data[:2]
    print data[data["three"] > 5]
    # all values
    print data < 5
    data[data < 5] = 0
    print data
    # row and column
    print data.ix[["Colorado"], ["two", "three"]]
    print data.ix[["Colorado", "Utah"], [3, 0, 1]]
    # row
    print data.ix[2]
    # label row and column, return column
    print data.ix[:"Utah", "two"]
    # xs
    # row
    print data.xs("Utah")
    print data.xs("Utah", axis=0)
    # rows
    print data.xs("two", axis=1)
    # icol/irow: i is index
    print data.icol(1)
    print data.irow(1)

    # Union
    print "", ""
    s1 = Series([7.3, -2.5, 3.4, 1.5], index=["a", "c", "d", "e"])
    s2 = Series([-2.1, 3.6, -1.5, 4, 3.1], index=["a", "c", "e", "f", "g"])
    print s1
    print s2
    # index is union, but d, f, g are NaN
    print s1 + s2

    df1 = DataFrame(np.arange(9.0).reshape((3, 3)), columns=list("bcd"),
                    index=["Ohio", "Texas", "Colorado"])
    df2 = DataFrame(np.arange(12.0).reshape((4, 3)), columns=list("bde"),
                    index=["Utah", "Ohio", "Texas", "Oregon"])
    print df1
    print df2
    print df1 + df2

    # arithmetic methods
    print "", ""
    df1 = DataFrame(np.arange(12.0).reshape((3, 4)), columns=list("abcd"))
    df2 = DataFrame(np.arange(20.0).reshape((4, 5)), columns=list("abcde"))
    print df1
    print df2
    print df1.add(df2, fill_value=0)  # like reindex, add has a fill_value argument
    # other arithmetic methods are sub/div/mul

    # Operations between a DataFrame and a Series
    print "", ""
    # subtract from each row: broadcast
    arr = np.arange(12.0).reshape((3, 4))
    print arr
    print arr[0]
    print arr - arr[0]

    frame = DataFrame(np.arange(12.0).reshape((4, 3)), columns=list("bde"),
                      index=["Utah", "Ohio", "Texas", "Oregon"])
    series = frame.ix[0]
    print frame
    print series
    print frame - series

    series2 = Series(range(3), index=list("bef"))
    print frame + series2

    series3 = frame["d"]
    series4 = frame.ix[0]
    print frame
    print series3
    print series4
    print frame.sub(series3, axis=0)
    print frame.sub(series4, axis=1)

    # apply functions and mapping
    print "", ""
    frame = DataFrame(np.arange(12.0).reshape((4, 3)), columns=list("bde"),
                      index=["Utah", "Ohio", "Texas", "Oregon"])
    print frame
    f = lambda x: x.max() - x.min()
    print frame.apply(f)
    print frame.apply(f, axis=1)
    f = lambda x: Series([x.min(), x.max()], index=["min", "max"])
    print frame.apply(f)
    format = lambda x: "{0:.2f}".format(x)
    print frame.applymap(format)  # frame
    print frame["e"].map(format)  # series

    # sort and rank
    print "", ""
    obj = Series(range(4), index=list("dabc"))
    print obj
    print obj.sort_index()
    frame = DataFrame(np.arange(8).reshape((2, 4)), index=["three", "one"],
                      columns=list("dabc"))
    print frame
    print frame.sort_index()
    print frame.sort_index(axis=1)
    print frame.sort_index(axis=1, ascending=False)

    # Sorting a Series by value
    print "", ""
    obj = Series([4, 7, -3, 2])
    print obj.order()
    obj = Series([4, np.nan, 7, np.nan, -3, 2])
    print obj.order()
    print obj.order(ascending=False)

    # order by multiple columns
    print "", ""
    frame = DataFrame({"b": [4, 7, -3, 2], "a": [0, 1, 0, 1]})
    print frame.sort_index(by=["a", "b"])

    # rank
    print "", ""
    obj = Series([7, -5, 7, 4, 2, 0, 4])
    print obj.rank()  # method is average
    print obj.rank(method="first")  # no duplicate ranks
    print obj.rank(ascending=False, method="min")
    print obj.rank(ascending=False, method="max")
    f1 = DataFrame(obj, columns=["data"])
    f2 = DataFrame(obj.rank(), columns=["rank"])
    # merge on the index
    print pd.merge(f1, f2, left_index=True, right_index=True)

    # Axis indexes with duplicate values
    print "", ""
    obj = Series(range(5), index=list("aaabc"))
    print obj
    print obj.index.is_unique
    print obj["a"]
    print obj["c"]
    df = DataFrame(np.arange(12.0).reshape((4, 3)), index=list("aabb"),
                   columns=list("ccd"))
    print df
    print df.ix["b"]
    print df["c"]
                   index=['three', 'one'], columns=['d', 'a', 'b', 'c'])
frame5.sort_index()

# In[123]:
frame5.sort_index(axis=0)

# In[122]:
frame5.sort_index(axis=1, ascending=False)

# In[124]:
obj4 = Series([4, 7, -3, 2])
obj4.order()

# In[131]:
obj5 = Series([4, np.nan, 7, np.nan, -3, 2])
#obj5.sort_index()
obj5.order()

# In[132]:
frame6 = DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame6

# In[135]:
frame6.sort_index(by='b')
def main():
    out_dir = os.path.dirname(__file__)

    ex1_path = study.DATA_DIR + '/ch06/ex1.csv'
    cat(ex1_path)
    df = pd.read_csv(ex1_path)
    p(df)
    p(pd.read_table(ex1_path, sep=','))

    p('header less---------------------')
    ex2_path = study.DATA_DIR + '/ch06/ex2.csv'
    cat(ex2_path)
    names = ['a', 'b', 'c', 'd', 'message']
    p(pd.read_csv(ex2_path, header=None))
    p(pd.read_csv(ex2_path, names=names))
    p(pd.read_csv(ex2_path, names=names, index_col='message'))

    p('hierarchy index---------------------')
    mindex_path = study.DATA_DIR + '/ch06/csv_mindex.csv'
    cat(mindex_path)
    p(pd.read_csv(mindex_path, index_col=['key1', 'key2']))

    p('separate by regex-------------')
    ex3_path = study.DATA_DIR + '/ch06/ex3.csv'
    cat(ex3_path)
    p(pd.read_csv(ex3_path, sep='\s+'))

    p('skip rows-----------')
    ex4_path = study.DATA_DIR + '/ch06/ex4.csv'
    cat(ex4_path)
    p(pd.read_csv(ex4_path, skiprows=[0, 2, 3]))

    p('N/A------------------')
    ex5_path = study.DATA_DIR + '/ch06/ex5.csv'
    cat(ex5_path)
    result = pd.read_csv(ex5_path)
    p(result)
    p(pd.isnull(result))
    result = pd.read_csv(ex5_path, na_values=['NULL', '12'])  # 12 is NA
    p(result)

    p('N/A dict------------------')
    sentinels = {'message': ['foo', 'NA'], 'something': ['two']}
    p(sentinels)
    p(pd.read_csv(ex5_path, na_values=sentinels))

    p('6.1.1 read data chunk size---------------------')
    ex6_path = study.DATA_DIR + '/ch06/ex6.csv'
    p(pd.read_csv(ex6_path).count())
    p(pd.read_csv(ex6_path, nrows=5))
    chunker = pd.read_csv(ex6_path, chunksize=1000)
    p(chunker)
    tot = Series([])
    for piece in chunker:
        tot = tot.add(piece['key'].value_counts(), fill_value=0)
    tot.order(ascending=False)
    p(tot[:10])

    p('6.1.2 write---------------------')
    data = pd.read_csv(ex5_path)
    p(data)
    ex5_out_path = out_dir + '/ex5_out.csv'
    data.to_csv(ex5_out_path)
    cat(ex5_path)
    data.to_csv(sys.stdout, index=False, header=False)
    print ''
    data.to_csv(sys.stdout, index=False, cols=list('abc'))
    print ''

    p('Series--------------')
    tseries_out_path = out_dir + '/tseries_out.csv'
    dates = pd.date_range('1/1/2000', periods=7)
    ts = Series(np.arange(7), index=dates)
    ts.to_csv(tseries_out_path)
    cat(tseries_out_path)
    p(Series.from_csv(tseries_out_path, parse_dates=True))

    p('6.1.3 csv-------------------------')
    ex7_path = study.DATA_DIR + '/ch06/ex7.csv'
    cat(ex7_path)
    f = open(ex7_path)
    reader = csv.reader(f)
    for line in reader:
        print line
    lines = list(csv.reader(open(ex7_path)))
    header, values = lines[0], lines[1:]
    data_dict = {h: v for h, v in zip(header, zip(*values))}
    p(data_dict)

    my_data_out_path = out_dir + '/mydata.csv'
    with open(my_data_out_path, 'w') as fp:
        writer = csv.writer(fp, dialect=my_dialect)
        writer.writerow(('one', 'two', 'three'))
        writer.writerow(('1', '2', '3'))
        writer.writerow(('4', '5', '6'))
        writer.writerow(('7', '8', '9'))
    cat(my_data_out_path)

    p('6.1.4 JSON-------------------------')
    obj = """
    {"name": "Wes",
     "places_lived": ["United States", "Spain", "Germany"],
     "pet": null,
     "siblings": [{"name": "Scott", "age": 25, "pet": "Zuko"},
                  {"name": "Katie", "age": 33, "pet": "Cisco"}]
    }
    """
    result = json.loads(obj)
    p(result)
    asjson = json.dumps(result)
    p(asjson)
    siblings = DataFrame(result['siblings'], columns=['name', 'age'])
    p(siblings)

    p('6.1.4 XML/HTML Web Scraping-------------------------')
    url = ''  # 'http://finance.yahoo.com/q/op?s=AAPL+Options'
    if not url is '':
        parsed = parse(urlopen('http://finance.yahoo.com/q/op?s=AAPL+Options'))
        doc = parsed.getroot()
        p([lnk.get('href') for lnk in doc.findall('.//a')][-10:])
        tables = doc.findall('.//table')
        p(parse_options_data(tables[9])[:5])
        p(parse_options_data(tables[13])[:5])

    p('6.1.5 Read XML-------------------------')
    xml_path = out_dir + '/Performance_MNR.xml'
    xml_content = """
<INDICATOR>
  <INDICATOR_SEQ>373889</INDICATOR_SEQ>
  <PARENT_SEQ></PARENT_SEQ>
  <AGENCY_NAME>MEtro-North Railroad</AGENCY_NAME>
  <INDICATOR_NAME>Escalator Availability</INDICATOR_NAME>
  <DESCRIPTION>Percent of the time that escalators are operational systemwide. The availability rate is based on physical observations performed the morning of regular business days only. This is a new indicator the agency began reporting in 2009.</DESCRIPTION>
  <PERIOD_YEAR>2011</PERIOD_YEAR>
  <PERIOD_MONTH>12</PERIOD_MONTH>
  <CATEGORY>Service Indicators</CATEGORY>
  <FREQUENCY>M</FREQUENCY>
  <DESIRED_CHANGE>U</DESIRED_CHANGE>
  <INDICATOR_UNIT>%</INDICATOR_UNIT>
  <DECIMAL_PLACES>1</DECIMAL_PLACES>
  <YTD_TARGET>97.00</YTD_TARGET>
  <YTD_ACTUAL></YTD_ACTUAL>
  <MONTHLY_TARGET>97.00</MONTHLY_TARGET>
  <MONTHLY_ACTUAL></MONTHLY_ACTUAL>
</INDICATOR>
"""
    if not os.path.exists(xml_path):
        with open(xml_path, 'w') as f:
            f.write(xml_content)
    parsed = objectify.parse(open(xml_path))
    root = parsed.getroot()
    data = []
    skip_fields = ['PARENT_SEQ', 'INDICATOR_SEQ', 'DESIRED_SEQ', 'DECIMAL_PLACES']
    p(dir(root))
    for elt in root:  # .INDICATOR:
        el_data = {}
        for child in elt.getchildren():
            if child.tag in skip_fields:
                continue
            el_data[child.tag] = child.pyval
        data.append(el_data)
    perf = DataFrame(data)
    p(perf)

    tag = '<a href="http://google.com">Google</a>'
    root = objectify.parse(StringIO.StringIO(tag)).getroot()
    p(root)
    p(root.get('href'))
    p(root.text)
df1
df2
df1 + df2
df1.add(df2, fill_value=0)  # B, SF doesn't exist in either

ser3 = df2.ix[0]
ser3
df2 - ser3

##### Ranking and Sorting
# sort_index(), order()
ser1 = Series(range(3), index=['c', 'a', 'b'])
ser1
ser1.sort_index()            # order by index (asc)
ser1.order()                 # order by value (asc)
ser1.order(ascending=False)  # order by value (desc)

# rank()
from numpy.random import randn
ser2 = Series(randn(10))
ser2
ser2.rank()  # sort puts a series in order of its item ranks

##### Summary
arr = np.array([[1, 2, np.nan], [np.nan, 3, 4]])
arr
df1 = DataFrame(arr, index=['a', 'b'], columns=['one', 'two', 'three'])
df1
result = pd.read_csv('ch06/ex6.csv')
result = pd.read_csv('ch06/ex6.csv', nrows=5)

# read the file in chunks
chunker = pd.read_csv('ch06/ex6.csv', chunksize=1000)
tot = Series([])
for piece in chunker:
    tot = tot.add(piece['key'].value_counts(), fill_value=0)
# take the 'key' column and sort by count
tot = tot.order(ascending=False)
tot[:10]

# ******************************************************
# Writing data out to text format
# read in
data = pd.read_csv('ch06/ex5.csv')
# write out, comma-separated
data.to_csv('out.csv')
# use '|' as the delimiter; this prints to stdout rather than writing a file
data.to_csv(sys.stdout, sep='|')
# fill missing values with NULL
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
from numpy.random import randn

# Sort by index
ser1 = Series(range(3), index=['C', 'A', 'B'])
print ser1
print ser1.sort_index()

# Sort by value
print ser1.order()

# Rank
ser2 = Series(randn(10))
print ser2
print ser2.rank()   # get the rank without actually sorting
print ser2.order()  # sort
###############################################################
### ### ###          DATAFRAME OPERATIONS          ### ### ###
###############################################################
# go to http://pandas.pydata.org/pandas-docs/stable/cookbook.html for several examples

df3 + df4                    # adds dataframes
df4.add(df3, fill_value=0)   # does the same thing, and replaces NaNs with 0

ser3 = df3.ix[0]       # forming a series from a dataframe; here the first row is returned as a series
ser3.sort_index()      # sorts according to index
ser5 = ser4.order()    # sorts according to value, but is NOT in place
ser4.sort()            # in-place sorting

df1.sum()              # sum columns
df1.sum(axis=1)        # sum rows
df1.min()              # minimum values across columns
df1.idxmin()           # index of the minimum values
df1.cumsum()           # returns dataframe with cumulative sums across columns
df1.describe()         # returns summary stats across columns
format = lambda x: '%.2f' % x
frame.applymap(format)

## Sorting and ranking
obj = Series(range(4), index=['d', 'a', 'b', 'c'])
obj.sort_index()

frame = DataFrame(np.arange(8).reshape((2, 4)),
                  index=['three', 'one'],
                  columns=['d', 'a', 'b', 'c'])
frame.sort_index()
# sort by column labels
frame.sort_index(axis=1)
# descending order
frame.sort_index(axis=1, ascending=False)

# Sorting by value; this only works on a Series
obj = Series([4, 7, -3, 2])
obj.order()  # missing values are put at the end when sorting

# Sorting by multiple columns
frame = DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame.sort_index(by=['a', 'b'])
frame.order(by=['a', 'b'])  # error: DataFrame has no order method

# Ranking
obj = Series([7, -5, 7, 4, 2, 0, 4])
obj.rank()
# break ties by order of appearance
obj.rank(method='first')
# descending
obj.rank(ascending=False, method='max')
# rank across the columns of each row
frame = DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1], 'c': [-2, 5, 8, -2.5]})
frame.rank(axis=1)
three      0  1  2  3
one        4  5  6  7
>>> frame.sort_index()
       d  a  b  c
one    4  5  6  7
three  0  1  2  3
>>> frame.sort_index(axis=1)
       a  b  c  d
three  1  2  3  0
one    5  6  7  4
>>> frame.sort_index(axis=1, ascending=False)
       d  c  b  a
three  0  3  2  1
one    4  7  6  5
>>> obj = Series([4, 7, -3, 2])
>>> obj.order()

Warning (from warnings module):
  File "__main__", line 1
FutureWarning: order is deprecated, use sort_values(...)
2   -3
3    2
0    4
1    7
dtype: int64
>>> df = DataFrame([[1.4, np.nan], [7.1, -4.5],
                    [np.nan, np.nan], [0.75, -1.3]],
                   index=['a', 'b', 'c', 'd'],
                   columns=['one', 'two'])
>>> df
    one  two
# Lecture 21 - Rank and Sort
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

ser1 = Series(range(3), index=['C', 'A', 'B'])
ser1

# use sort_index to sort by index
ser1.sort_index()

# use order to sort by values
ser1.order()

from numpy.random import randn
ser2 = Series(randn(10))
ser2

# ranking
ser2.sort_values()
ser2.rank()
ser2.sort_values(ascending=False)

ser3 = Series(randn(10))
ser3
ser3.rank()
ser3 = ser3.sort_values()
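# For reference: the FutureWarning shown in the interactive session above points
# at the replacement API. A minimal sketch of the modern equivalents (assuming
# pandas 0.17+), not part of the original notes:
import pandas as pd

obj = pd.Series([4, 7, -3, 2])
print(obj.sort_values())                 # old: obj.order()

frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
print(frame.sort_values(by='b'))         # old: frame.sort_index(by='b')
print(frame.sort_values(by=['a', 'b']))  # old: frame.sort_index(by=['a', 'b'])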
def practice_one():
    obj = Series([4, 7, -5, 3])
    '''
    pandas parsing functions:
    read_csv        Load delimited data from a file, URL, or file-like object; the default delimiter is a comma
    read_table      Load delimited data from a file, URL, or file-like object; the default delimiter is a tab
    read_fwf        Read data in fixed-width column format (no delimiters)
    read_clipboard  Read data from the clipboard; essentially read_table applied to the clipboard
    '''
    '''
    read_csv/read_table arguments:
    path           String giving a filesystem location, URL, or file-like object
    sep, delimiter Character sequence or regular expression used to split the fields of each row
    header         Row number to use as column names; defaults to 0 (the first row), set to None if there is none
    index_col      Column number(s) or name(s) to use as the row index
    names          List of column names for the result
    skiprows       Number of rows to skip at the start of the file, or a list of row numbers to skip (from 0)
    na_values      A set of values to replace with NA
    comment        Character(s) used to split comments off the end of lines
    parse_dates    Try to parse data as dates; defaults to False. If True, try all columns; a list of column
                   numbers or names can also be given
    keep_date_col  If joining columns to parse a date, keep the joined columns; defaults to False
    converters     Dict mapping column names/numbers to conversion functions
    dayfirst       When parsing ambiguous dates, treat them as international format
    date_parser    Function used to parse dates
    nrows          Number of rows to read
    iterator       Return a TextParser object for reading the file piecemeal
    chunksize      Size of the file chunks (for iteration)
    skip_footer    Number of rows to ignore at the end of the file
    verbose        Print various parser output information
    encoding       Text encoding for unicode
    squeeze        If the parsed data contains only one column, return a Series
    thousands      Thousands separator, e.g. ',' or '.'
    '''

    # Reading text files in pieces
    '''
    directory: ch06, file: ex6.csv
    '''
    # when processing a file you may only want a small piece, or to iterate over it
    pd.read_csv('ch06/ex6.csv')
    # to read only a few rows, specify nrows
    pd.read_csv('ch06/ex6.csv', nrows=5)
    # to read the file in chunks, set chunksize (rows); returns a TextParser object
    chunker = pd.read_csv('ch06/ex6.csv', chunksize=10)
    tot = Series([])
    for piece in chunker:
        tot = tot.add(piece['message'].value_counts(), fill_value=0)  # aggregate on the message column
    tot = tot.order(ascending=False)

    # Writing data out to text format
    data = pd.read_csv('ch06/ex5.csv')
    data.to_csv('ch06/out.csv')  # write the data to a comma-separated file
    data.to_csv(sys.stdout, sep='|')        # use '|' as the delimiter
    data.to_csv(sys.stdout, na_rep='NULL')  # write missing values as 'NULL' instead of empty strings
    data.to_csv(sys.stdout, index=False, header=False)
    data.to_csv(sys.stdout, index=False, cols=['a', 'b', 'c'])

    # Working with delimited formats manually
    import csv
    f = open('ch06/ex7.csv')
    reader = csv.reader(f)
    for line in reader:
        print(line)
    lines = list(csv.reader(open('ch06/ex7.csv')))
    header, values = lines[0], lines[1:]  # split header and data rows
    data_dict = {h: v for h, v in zip(header, zip(*values))}

    # define a subclass of csv.Dialect describing the format
    class my_dialect(csv.Dialect):
        lineterminator = '\n'
        delimiter = ';'
        quotechar = '"'

    reader = csv.reader(f, dialect=my_dialect)
    reader = csv.reader(f, delimiter='|')  # or pass the options directly without a subclass
    '''
    csv.Dialect attributes:
    delimiter         Single-character string used to separate fields; defaults to ','
    lineterminator    Line terminator for writing; defaults to '\r\n'
    quotechar         Quote character for fields containing special characters; defaults to '"'
    quoting           Quoting convention: csv.QUOTE_ALL (quote all fields), csv.QUOTE_MINIMAL (only quote
                      fields containing special characters such as the delimiter), csv.QUOTE_NONNUMERIC,
                      and csv.QUOTE_NONE (no quoting); defaults to QUOTE_MINIMAL
    skipinitialspace  Ignore whitespace after the delimiter; defaults to False
    doublequote       How to handle the quote character inside a field; if True, it is doubled
    escapechar        String used to escape the delimiter; disabled by default
    '''
    with open('mydata.csv', 'w') as f:
        writer = csv.writer(f, dialect=my_dialect)
        writer.writerow(('one', 'two', 'three'))
        writer.writerow(('1', '2', '3'))
        writer.writerow(('4', '5', '6'))
        writer.writerow(('7', '8', '9'))

    # JSON data
    obj = """
    {"name": "Wes",
     "places_lived": ["United States", "Spain", "Germany"],
     "pet": null,
     "siblings": [{"name": "Scott", "age": 25, "pet": "Zuko"},
                  {"name": "Katie", "age": 33, "pet": "Cisco"}]
    }
    """
    import json
    result = json.loads(obj)  # convert the JSON string to Python objects
    json.dumps(result)        # convert a Python object back to JSON
    siblings = DataFrame(result['siblings'], columns=['name', 'age'])  # convert to a DataFrame

    # XML and HTML: web scraping
    from lxml.html import parse
    from urllib2 import urlopen  # the urllib2 module could not be installed here
    parsed = parse(urlopen('http://finance.yahoo.com/q/op?s=AAPL+Options'))
    doc = parsed.getroot()
    links = doc.findall('.//a')    # query for all <a> elements
    links[28].get('href')          # get the URL
    links[28].text_content()       # get the link text
    urls = [lnk.get('href') for lnk in doc.findall('.//a')]  # all URLs in the document
    tables = doc.findall('.//table')
    calls = tables[9]
    puts = tables[13]
    rows = calls.findall('.//tr')

    def _unpack(row, kind='td'):
        elts = row.findall('.//%s' % kind)
        return [val.text_content() for val in elts]

    _unpack(rows[1], kind='th')
    _unpack(rows[1], kind='td')

    from pandas.io.parsers import TextParser

    def parse_options_data(table):
        rows = table.findall('.//tr')
        header = _unpack(rows[0], kind='th')
        data = [_unpack(r) for r in rows[1:]]
        return TextParser(data, names=header).get_chunk()

    parse_options_data(calls)
    parse_options_data(puts)
    pass
# -----------------------------------------------------------------------------
# Closer look at the data
# -----------------------------------------------------------------------------
# Some stats about Entropy gain, importance, Hamtropy, etc.

# Entropy gain
from rg_toolbox_math import entropy_gain
#eg = Series([entropy_gain(data_mlnd[feat], y) for feat in feat_mlnd_nm], index=feat_mlnd_nm)
eg = Series([entropy_gain(X_train[feat], y_train_bol) for feat in X_train.columns],
            index=feat_mlnd_nm)
#n, Ix = hist_discr(X_train['school_GP'])
#entropy_gain(X_train['school_GP'], y_train_bol)
egs = eg.order(ascending=False)
ixs_eg = egs.index

import matplotlib.pyplot as pl
ind = np.arange(len(egs))
fig = pl.figure(1, figsize=(9, 4))
ax = fig.add_subplot(111)
ax.bar(ind, egs)
pl.xticks(ind, ixs_eg, rotation=90)
ax.set_title('Entropy Gain')

# Using a tree for feature importance
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(criterion='entropy', max_depth=3)
clf.fit(X_train, y_train)
z = Series(clf.feature_importances_, X_train.columns)
# three  0  1  2  3

# sort the columns
print frame.sort_index(axis=1)
#        a  b  c  d
# three  1  2  3  0
# one    5  6  7  4

# data is sorted in ascending order by default, but descending can be requested
print frame.sort_index(axis=1, ascending=False)
#        d  c  b  a
# three  0  3  2  1
# one    4  7  6  5

# to sort a Series by its values, use the order method
obj = Series([4, 7, -3, 2])
print obj.order()
# 2   -3
# 3    2
# 0    4
# 1    7

# when sorting, any missing values are put at the end of the Series by default
obj = Series([4, np.nan, 7, np.nan, -3, 2])
print obj.order()
# 4   -3
# 5    2
# 0    4
# 2    7
# 1   NaN
# 3   NaN

# on a DataFrame, sort by the values in one or more columns
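# The snippet breaks off before the DataFrame example it announces. A minimal
# sketch of sorting a DataFrame by column values, using the same old-pandas
# sort_index(by=...) API as the rest of this snippet (sort_values in current pandas):
frame2 = DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
print frame2.sort_index(by='b')          # sort rows by the values in column 'b'
print frame2.sort_index(by=['a', 'b'])   # sort by multiple columns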