Example #1
0
def pd_04():
    """Print a short tour of sorting a Series and a DataFrame.

    Covers sorting by index (rows, columns, descending), sorting a Series
    by value, sorting a DataFrame by one or several columns, and finally
    ``describe()``.  Everything is written to stdout; nothing is returned.

    ``Series.order()`` and ``DataFrame.sort_index(by=...)`` were deprecated
    in pandas 0.17 and later removed, so value-based sorting goes through
    ``sort_values``.  ``print(x)`` with a single argument behaves the same
    on Python 2 and Python 3.
    """
    obj = Series(range(4), index=['d', 'a', 'b', 'c'])
    print(obj)
    print(obj.sort_index())

    frame = DataFrame(np.arange(8).reshape(2, 4),
                      index=['three', 'one'],
                      columns=['d', 'a', 'b', 'c'])
    print(frame.sort_index())
    print(frame.sort_index(axis=1))
    print(frame.sort_index(ascending=False))

    # Sort by value rather than by label.
    obj1 = Series([4, 7, -3, 2])
    print(obj1.sort_values())
    print(frame.sort_values(by='b'))
    print(frame.sort_values(by=['a', 'b']))
    print(frame.describe())
Example #2
0
    def __init__(self, df, column, n):  # gets the most frequent words in a document
        """Store the ``n`` most frequent word stems of ``df[column]`` in ``self.top``.

        Every value of the column is converted with ``str`` and joined into
        one text, split on whitespace, lower-cased, stripped of everything
        that is not an ASCII letter, and stemmed with the English Snowball
        stemmer.

        Args:
            df: object indexable by ``column`` whose result exposes ``.values``.
            column: key of the column holding the text.
            n: number of stems to keep (passed to ``Counter.most_common``).

        ``self.top`` is a list ordered from most to least frequent.  It used
        to be ``dict.keys()``, which has no defined order on Python 2 and is
        not a list on Python 3.
        """
        texto = " ".join(str(x) for x in df[column].values)
        stemmer = SnowballStemmer("english")
        stemm_words = []
        for token in texto.split():
            # Keep ASCII letters only; tokens made of digits/punctuation vanish.
            limpio = re.sub('[^A-Za-z]+', '', token.lower())
            if not limpio:
                continue
            try:
                stemm_words.append(str(stemmer.stem(limpio)))
            except Exception:
                # Best effort, as before: a token the stemmer rejects is skipped.
                continue
        # The former Series/.order() detour was never used for the result and
        # fails on current pandas, so the ranking comes straight from Counter.
        self.top = [stem for stem, _ in Counter(stemm_words).most_common(n)]
 def __init__(self, df, column, n):  # gets the most frequent words in a document
     """Store the ``n`` most frequent word stems of ``df[column]`` in ``self.top``.

     The column values are stringified, joined, split on whitespace,
     lower-cased, reduced to ASCII letters and stemmed with the English
     Snowball stemmer before counting.

     Args:
         df: object indexable by ``column`` whose result exposes ``.values``.
         column: key of the column holding the text.
         n: number of stems to keep (passed to ``Counter.most_common``).

     ``self.top`` is a list ordered from most to least frequent; it was
     previously ``dict.keys()``, whose order is undefined on Python 2.
     """
     texto = " ".join(str(x) for x in df[column].values)
     stemmer = SnowballStemmer("english")
     stemm_words = []
     for token in texto.split():
         # Keep ASCII letters only; tokens without any letter are dropped.
         limpio = re.sub('[^A-Za-z]+', '', token.lower())
         if not limpio:
             continue
         try:
             stemm_words.append(str(stemmer.stem(limpio)))
         except Exception:
             # Best effort, as before: skip a token the stemmer rejects.
             continue
     # The unused Series/.order() code was dropped: it did not influence the
     # result and raises on current pandas.
     self.top = [stem for stem, _ in Counter(stemm_words).most_common(n)]
Example #4
0
File: hjm.py Project: alpmdog/CQF
def calculate_pca(forwards, no_factors=3):
    """Decompose the covariance of the differenced ``forwards`` frame.

    Args:
        forwards: DataFrame; its row-to-row differences are analysed.
        no_factors: how many of the largest components to keep.

    Returns:
        tuple ``(sorted_eigvecs, filtered_eigenmat)``: the first output of
        ``jacobi`` as a Series labelled by column and sorted in descending
        order, and the second output of ``jacobi`` as a DataFrame restricted
        to the columns whose labels are the ``no_factors`` largest entries.
    """
    fwddiff = forwards.diff().dropna()
    # NOTE(review): 252 / 10000 looks like annualisation of daily moves
    # quoted in basis points -- confirm against the data source.
    covmat = fwddiff.cov() * 252 / 10000
    # NOTE(review): jacobi presumably returns (eigenvalues, eigenvector
    # matrix); the first result is treated as one number per column below.
    eigenvecs, eigenmat = jacobi(covmat.values)
    eigvecs = Series(eigenvecs, index=covmat.columns)
    # Series.order() was removed from pandas; sort_values is its replacement.
    sorted_eigvecs = eigvecs.sort_values(ascending=False)
    # .iloc keeps the slice positional even when the column labels are numeric.
    top3 = sorted_eigvecs.iloc[:no_factors].index
    eigenmat_df = DataFrame(eigenmat, index=covmat.columns,
                            columns=covmat.columns)
    filtered_eigenmat = eigenmat_df.filter(top3)
    return sorted_eigvecs, filtered_eigenmat
Example #5
0
def calculate_pca(forwards, no_factors=3):
    """Decompose the covariance of the differenced ``forwards`` frame.

    Args:
        forwards: DataFrame; its row-to-row differences are analysed.
        no_factors: how many of the largest components to keep.

    Returns:
        tuple ``(sorted_eigvecs, filtered_eigenmat)``: the first output of
        ``jacobi`` as a Series labelled by column and sorted in descending
        order, and the second output of ``jacobi`` as a DataFrame restricted
        to the columns whose labels are the ``no_factors`` largest entries.
    """
    fwddiff = forwards.diff().dropna()
    # NOTE(review): 252 / 10000 looks like annualisation of daily moves
    # quoted in basis points -- confirm against the data source.
    covmat = fwddiff.cov() * 252 / 10000
    # NOTE(review): jacobi presumably returns (eigenvalues, eigenvector
    # matrix); the first result is treated as one number per column below.
    eigenvecs, eigenmat = jacobi(covmat.values)
    eigvecs = Series(eigenvecs, index=covmat.columns)
    # Series.order() was removed from pandas; sort_values is its replacement.
    sorted_eigvecs = eigvecs.sort_values(ascending=False)
    # .iloc keeps the slice positional even when the column labels are numeric.
    top3 = sorted_eigvecs.iloc[:no_factors].index
    eigenmat_df = DataFrame(eigenmat,
                            index=covmat.columns,
                            columns=covmat.columns)
    filtered_eigenmat = eigenmat_df.filter(top3)
    return sorted_eigvecs, filtered_eigenmat
Example #6
0
    def test_order(self):
        """Exercise ``Series.order``: NaN placement, object dtype, descending.

        The ``order``/``valid`` calls are kept on purpose, since they are
        the API under test.  Only the harness calls were updated:
        ``assertTrue`` replaces the ``assert_`` alias that newer unittest
        versions no longer provide, and ``np.nan`` replaces ``np.NaN``,
        which NumPy 2 removed.
        """
        ts = self.ts.copy()
        ts[:5] = np.nan
        vals = ts.values

        # Default: the five NaNs end up last, the rest ascending.
        result = ts.order()
        self.assertTrue(np.isnan(result[-5:]).all())
        self.assertTrue(np.array_equal(result[:-5], np.sort(vals[5:])))

        # na_last=False: the five NaNs come first instead.
        result = ts.order(na_last=False)
        self.assertTrue(np.isnan(result[:5]).all())
        self.assertTrue(np.array_equal(result[5:], np.sort(vals[5:])))

        # something object-type
        ser = Series(['A', 'B'], [1, 2])
        # no failure
        ser.order()

        # ascending=False: the non-NaN values are the reversed ascending
        # sort, wherever the NaNs are placed.
        ordered = ts.order(ascending=False)
        expected = np.sort(ts.valid().values)[::-1]
        assert_almost_equal(expected, ordered.valid().values)
        ordered = ts.order(ascending=False, na_last=False)
        assert_almost_equal(expected, ordered.valid().values)
Example #7
0
    return ([x.max(),x.min()],index=['max','min'])
df1.apply(f)

format11 = lambda x: '%.2f' % x
df1.applymap(format11)
df1['c'].map(format11)

# sorting
se1 = Series(np.arange(4), index=list('cadb'))
se1.sort_index()

df1 = DataFrame(np.arange(8).reshape((2, 4)), index=['three', 'one'], columns=list('cdab'))
df1.sort_index(axis=1, ascending=False)

# One missing value in the middle ("np,nan" was a typo for np.nan and
# raised NameError).  Series.order() is gone from pandas; use sort_values.
se2 = Series([-1, 6, np.nan, 3, -7])
se2.sort_values()

df2 = DataFrame({'b': [1, 6, 3, -7], 'a': [2, 0, 4, -1]})
df2.sort_values(by='b')
df2.sort_values(by=['a', 'b'])  # sort by 'a', ties broken by 'b'

se3 = Series([1, 6, 4, 3, 8, 7, 1])
se3.rank()  # default: tied values share the average of their ranks
se3.rank(method='first')  # ties ranked in order of appearance
se3.rank(method='min')  # ties all get the lowest rank of the group
se3.rank(method='average')  # same as the default
se3.rank(method='max', ascending=False)  # descending, ties get the highest rank

# 6. repeated index
se1.index.is_unique
                       数据排序
""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
#%%
from pandas import Series,DataFrame
from string import letters
# Sample data: a float Series labelled 'j'..'a' with one missing value, and
# a frame with a plain index (d1) plus a two-level indexed copy of it (d2).
s1 = Series(arange(10, dtype=np.float64), index=list(letters[:10])[::-1])
# Positional assignment made explicit: s1[5] on a string-labelled Series
# depends on pandas' integer-key fallback.
s1.iloc[5] = np.nan
d1 = DataFrame(
    {'a': range(10), 'b': repeat([0, 1], 5), 'c': repeat([0, 1, 2, 3, 4], 2)},
    index=list(letters[:10])
)
d2 = d1.set_index([d1.index, d1['b']])
d2.index.names = ['r0', 'r1']

#%% sort a Series by value (Series.order() was replaced by sort_values())
s1.sort_values()
#%%
s1.sort_values(ascending=False)
#%% missing values first (formerly na_last=False)
s1.sort_values(na_position='first')
#%%
s1.sort_index()

#%% a DataFrame has no order method
# d1.order()     # AttributeError -- left commented so the file runs top to bottom
#%% sort by index
d1.sort_index()
#%% sort by column values (DataFrame.sort() was replaced by sort_values())
d1.sort_values(['b', 'c'], ascending=[True, False])
#%%
d1.sort_values(['c', 'b'], ascending=[True, False])
Example #9
0
# Sorting and ranking

# Sort a Series by its index labels.
obj = Series(range(4), index=['d', 'a', 'b', 'c'])
print(obj.sort_index())

frame = DataFrame(np.arange(8).reshape((2, 4)),
                  index=['three', 'one'],
                  columns=['d', 'a', 'b', 'c'])

# Rows by label, then columns by label (ascending and descending).
print(frame.sort_index())
print(frame.sort_index(axis=1))

print(frame.sort_index(axis=1, ascending=False))

# Sort by value.  Series.order() no longer exists in pandas; sort_values()
# is the replacement.
obj = Series([4, 7, -3, 2])
print(obj.sort_values())

# Missing values are placed at the end by default.
obj = Series([4, np.nan, 7, np.nan, -3, 2])
print(obj.sort_values())

frame = DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})

print(frame)
# sort_index(by=...) was likewise replaced by sort_values(by=...).
print(frame.sort_values(by='b'))

print(frame.sort_values(by=['a', 'b']))

# Ranking: ties share the mean rank by default; method='first' ranks ties
# in order of appearance.
obj = Series([7, -5, 7, 4, 2, 0, 4])
print(obj.rank())

print(obj.rank(method='first'))
Example #10
0
    n = 1
    while True:
        f += n
        yield f
        n += 1

# <codecell>

# hint list(islice([0,1,2,3,4], 2))  is [0,1]

f = triangle()
list(islice(f,4))

# <codecell>

s = Series(arange(3), index=['a', 'b', 'c'])
s['b']

# <codecell>

# Largest value: sort ascending, reverse, take the first element.
# Series.order() was replaced by sort_values(); .iloc makes the positional
# lookup explicit on a label-indexed Series.
s.sort_values()[::-1].iloc[0]

# <codecell>

df = DataFrame([{'name':'UC Berkeley', 'city':'Berkeley', 'state':'CA'}, {'name':'MIT', 'city':'Cambridge', 'state':'MA'},
                {'name':'Stanford', 'city':'Stanford', 'state':'CA'}, {'name':'Harvard', 'city':'Cambridge', 'state':'MA'}])
# Total number of characters in the names of the rows whose state is 'CA'.
sum(df[df.state == 'CA'].name.str.len())

# put into words what we're trying to calculate and give  the answer

Example #11
0
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
# Sort by index label, then by value.
ser1 = Series(range(3), index=['C', 'A', 'B'])
ser2 = ser1.sort_index()
print(ser2)
# Series.order() was replaced by sort_values().
print(ser1.sort_values())
from numpy.random import randn
ser3 = Series(randn(10))
print(ser3.rank())
# The old in-place Series.sort() returned None, so printing it showed
# "None"; sort_values() returns the sorted Series.
print(ser3.sort_values())
Example #12
0
# sorting
obj = Series(range(4), index=['d', 'a', 'b', 'c'])
obj.sort_index()

# this works just like you would expect with dataframe
frame = DataFrame(np.arange(8).reshape((2, 4)),
                  index=['three', 'one'],
                  columns=['d', 'a', 'b', 'c'])

frame.sort_index()
frame.sort_index(axis=1)
frame.sort_index(axis=1, ascending=False)

# now shift gears and sort by values
obj = Series([4, 7, -3, 2])
# obj.order() is the spelling from the old edition of the book; it raises
# on current pandas, so it is kept only as a comment.
obj.sort_values()

# missing values go to the end by default
obj = Series([4, np.nan, 7, np.nan, -3, 2])
obj.sort_values()

frame = DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame
# frame.sort_index(by='b') is the old-edition spelling of the next line and
# fails on current pandas.
frame.sort_values(by='b')  # you can make this a list if you like

# ranking
obj = Series([7, -5, 7, 4, 2, 0, 4])
obj.rank()

obj.rank(method='first')
'''

# The data is sorted in ascending order by default, but can be sorted in descending order, too:
print(frame.sort_index(axis = 1, ascending = False))
'''
       d  c  b  a
three  0  3  2  1
one    4  7  6  5

[2 rows x 4 columns]
'''


# To sort a Series by its values, use its sort_values method (it replaces the
# older order method, which current pandas no longer provides):
obj = Series([4, 7, -3, 2])
print(obj.sort_values())
'''
2   -3
3    2
0    4
1    7
dtype: int64
'''


'''
Ranking is closely related to sorting, assigning ranks from one through the number of
valid data points in an array. It is similar to the indirect sort indices produced by
numpy.argsort , except that ties are broken according to a rule. The rank methods for
Series and DataFrame are the place to look; by default rank breaks ties by assigning
each group the mean rank:
Example #14
0
pd.read_csv('d:data/ex5.csv', na_values=sentinels)

# Reading a text file in pieces
result = pd.read_csv('d:data/ex6.csv')
result
pd.read_csv('d:data/ex6.csv', nrows=5)
chunker = pd.read_csv('d:data/ex6.csv', chunksize=1000)
chunker

chunker = pd.read_csv('d:data/ex6.csv', chunksize=1000)

# Accumulate the per-chunk counts of the 'key' column.  The explicit float
# dtype avoids pandas' warning/object dtype for an untyped empty Series.
tot = Series([], dtype=float)
for piece in chunker:
    tot = tot.add(piece['key'].value_counts(), fill_value=0)

# Series.order() was replaced by sort_values().
tot = tot.sort_values(ascending=False)

tot[:10]

# Writing data out
data = pd.read_csv('d:data/ex5.csv')
data
data.to_csv('d:data/out.csv')

data.to_csv(sys.stdout, sep='|')

data.to_csv(sys.stdout, na_rep='NULL')

data.to_csv(sys.stdout, index=False, header=False)

data.to_csv(sys.stdout, index=False, columns=['a', 'b', 'c'])
def main():
    """Print a tour of basic pandas Series/DataFrame operations.

    Sections, in order: reindexing, dropping entries, indexing and slicing,
    index alignment in arithmetic, arithmetic methods with fill values,
    DataFrame/Series broadcasting, apply/applymap/map, sorting, ranking,
    and indexes with duplicate labels.  Everything goes to stdout.

    NOTE(review): written for Python 2 (print statements) and a legacy
    pandas API (``.ix``, ``icol``, ``irow``, ``Series.order``,
    ``sort_index(by=...)``); it does not run on current versions.
    """
    # reindex
    obj = Series(range(4), index="a b c d".split(" ")[::-1])
    print obj

    obj2 = obj.reindex("a b c d e".split(" "))
    print obj2

    # Use fill_value instead of NaN for labels that did not exist
    print obj.reindex("a b c d e".split(" "), fill_value=0)
    colors = ["blue", "purple", "yellow"]
    index = [0, 2, 4]
    obj3 = Series(colors, index=index)
    print obj3.reindex(range(6))
    print obj3.reindex(range(6), method="ffill")  # forward fill
    print obj3.reindex(range(6), method="backfill")  # bfill

    # DataFrame
    states = ["Ohio", "Texas", "California"]
    frame = DataFrame(np.arange(9).reshape((3, 3)), index="a b c".split(" "), columns=["Ohio", "Texas", "California"])
    print frame
    frame2 = frame.reindex("a b c d".split(" "))
    print frame2
    states[0] = "Utah"
    # swap the first two entries of states
    states[1], states[0] = states[:2]
    print frame.reindex(columns=states)
    # fill
    print frame.reindex("a b c d".split(" "), method="ffill", columns=states)
    print frame.ix["a b c d".split(" ")]
    print frame.ix["a b c d".split(" "), states]

    # Drop an entry (drop returns a new object; obj itself is printed unchanged)
    print "", ""
    obj = Series(range(5), index="a b c d e".split(" "))
    new_obj = obj.drop("c")
    print new_obj
    print obj

    # Index reference
    print "", ""
    obj = Series(np.arange(4.0), index="a b c d".split(" "))
    print obj["b"]
    print obj[1]  # same
    print obj[2:4]
    print obj[["b", "a", "c"]]
    print obj[[1, 3]]
    print obj[obj < 2]
    # Slice with label
    print obj["b":"c"]  # include 'c'
    obj["b":"c"] = 5
    print obj

    data = DataFrame(
        np.arange(16).reshape((4, 4)),
        index=["Ohio", "Colorado", "Utah", "New York"],
        columns=["one", "two", "three", "four"],
    )
    print data
    # column
    print data["two"]
    print data[["three", "one"]]
    # row
    print data[:2]
    print data[data["three"] > 5]
    # all values
    print data < 5
    data[data < 5] = 0
    print data
    # row and column
    print data.ix[["Colorado"], ["two", "three"]]
    print data.ix[["Colorado", "Utah"], [3, 0, 1]]
    # row
    print data.ix[2]
    # label row and column, return column
    print data.ix[:"Utah", "two"]
    # xs
    # row
    print data.xs("Utah")
    print data.xs("Utah", axis=0)
    # column (axis=1)
    print data.xs("two", axis=1)
    # icol/irow: the "i" stands for integer position
    print data.icol(1)
    print data.irow(1)

    # Union
    print "", ""
    s1 = Series([7.3, -2.5, 3.4, 1.5], index=["a", "c", "d", "e"])
    s2 = Series([-2.1, 3.6, -1.5, 4, 3.1], index=["a", "c", "e", "f", "g"])
    print s1
    print s2
    # index is union, but d, f, g are NaN
    print s1 + s2
    df1 = DataFrame(np.arange(9.0).reshape((3, 3)), columns=list("bcd"), index=["Ohio", "Texas", "Colorado"])
    df2 = DataFrame(np.arange(12.0).reshape((4, 3)), columns=list("bde"), index=["Utah", "Ohio", "Texas", "Oregon"])
    print df1
    print df2
    print df1 + df2

    # arithmetic method
    print "", ""
    df1 = DataFrame(np.arange(12.0).reshape((3, 4)), columns=list("abcd"))
    df2 = DataFrame(np.arange(20.0).reshape((4, 5)), columns=list("abcde"))
    print df1
    print df2
    print df1.add(df2, fill_value=0)
    # reindex has fill_value argument
    # other arithmetic methods are sub/div/mul

    # Calculation in a DataFrame and Series
    print "", ""
    # subtract the first row from each row (broadcasting)
    arr = np.arange(12.0).reshape((3, 4))
    print arr
    print arr[0]
    print arr - arr[0]
    frame = DataFrame(np.arange(12.0).reshape((4, 3)), columns=list("bde"), index=["Utah", "Ohio", "Texas", "Oregon"])
    series = frame.ix[0]
    print frame
    print series
    print frame - series

    series2 = Series(range(3), index=list("bef"))
    print frame + series2

    series3 = frame["d"]
    series4 = frame.ix[0]
    print frame
    print series3
    print series4
    print frame.sub(series3, axis=0)
    print frame.sub(series4, axis=1)

    # apply function and mapping
    print "", ""
    frame = DataFrame(np.arange(12.0).reshape((4, 3)), columns=list("bde"), index=["Utah", "Ohio", "Texas", "Oregon"])
    print frame
    f = lambda x: x.max() - x.min()
    print frame.apply(f)
    print frame.apply(f, axis=1)

    f = lambda x: Series([x.min(), x.max()], index=["min", "max"])
    print frame.apply(f)

    # NOTE(review): this local name shadows the builtin format() inside main
    format = lambda x: "{0:.2f}".format(x)
    print frame.applymap(format)  # frame
    print frame["e"].map(format)  # series

    # sort and rank
    print "", ""
    obj = Series(range(4), index=list("dabc"))
    print obj
    print obj.sort_index()

    frame = DataFrame(np.arange(8).reshape((2, 4)), index=["three", "one"], columns=list("dabc"))
    print frame
    print frame.sort_index()
    print frame.sort_index(axis=1)
    print frame.sort_index(axis=1, ascending=False)

    # Sorting series
    print "", ""
    obj = Series([4, 7, -3, 2])
    print obj.order()
    obj = Series([4, np.nan, 7, np.nan, -3, 2])
    print obj.order()
    print obj.order(ascending=False)

    # order by multi columns
    print "", ""
    frame = DataFrame({"b": [4, 7, -3, 2], "a": [0, 1, 0, 1]})
    print frame.sort_index(by=["a", "b"])

    # rank
    print "", ""
    obj = Series([7, -5, 7, 4, 2, 0, 4])
    print obj.rank()  # method is average
    print obj.rank(method="first")  # No Duplicates
    print obj.rank(ascending=False, method="min")
    print obj.rank(ascending=False, method="max")
    f1 = DataFrame(obj, columns=["data"])
    f2 = DataFrame(obj.rank(), columns=["rank"])
    # merge by each index
    print pd.merge(f1, f2, left_index=True, right_index=True)

    # Index of the axis with duplicate values
    print "", ""
    obj = Series(range(5), index=list("aaabc"))
    print obj
    print obj.index.is_unique
    print obj["a"]
    print obj["c"]

    df = DataFrame(np.arange(12.0).reshape((4, 3)), index=list("aabb"), columns=list("ccd"))
    print df
    print df.ix["b"]
    print df["c"]
                   index=['three', 'one'],
                   columns=['d', 'a', 'b', 'c'])
frame5.sort_index()

# In[123]:

frame5.sort_index(axis=0)

# In[122]:

frame5.sort_index(axis=1, ascending=False)

# In[124]:

# Sort by value: Series.order() was replaced by sort_values().
obj4 = Series([4, 7, -3, 2])
obj4.sort_values()

# In[131]:

# Missing values are placed at the end.
obj5 = Series([4, np.nan, 7, np.nan, -3, 2])
#obj5.sort_index()
obj5.sort_values()

# In[132]:

frame6 = DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame6

# In[135]:

# sort_index(by=...) was replaced by sort_values(by=...).
frame6.sort_values(by='b')
def main():
    """Run the chapter-6 data loading and storage examples.

    Reads the sample files under ``study.DATA_DIR + '/ch06'``, writes the
    generated files next to this module, and prints every intermediate
    result through the ``p``/``cat`` helpers.  Sections: CSV reading
    options, chunked reading, CSV writing, the ``csv`` module, JSON,
    optional HTML scraping, and XML via ``objectify``.

    Fixes over the previous version:
      * the sorted key counts are now kept (the result was discarded);
      * the freshly written ``ex5_out.csv`` is displayed, not the input file;
      * ``url`` is compared by value (``is ''`` tests identity) and is the
        address actually fetched;
      * files are closed through ``with``;
      * ``cols=`` became ``columns=`` and ``order`` became ``sort_values``;
      * the regex separator is a raw string.
    """
    out_dir = os.path.dirname(__file__)

    ex1_path = study.DATA_DIR + '/ch06/ex1.csv'
    cat(ex1_path)

    df = pd.read_csv(ex1_path)
    p(df)
    p(pd.read_table(ex1_path, sep=','))

    p('header less---------------------')
    ex2_path = study.DATA_DIR + '/ch06/ex2.csv'
    cat(ex2_path)
    names = ['a','b', 'c', 'd', 'message']
    p(pd.read_csv(ex2_path, header=None))
    p(pd.read_csv(ex2_path, names=names))
    p(pd.read_csv(ex2_path, names=names, index_col='message'))

    p('hierarchy index---------------------')
    mindex_path = study.DATA_DIR + '/ch06/csv_mindex.csv'
    cat(mindex_path)
    p(pd.read_csv(mindex_path, index_col=['key1', 'key2']))

    p('separate by regex-------------')
    ex3_path = study.DATA_DIR + '/ch06/ex3.csv'
    cat(ex3_path)
    p(pd.read_csv(ex3_path, sep=r'\s+'))

    p('skip rows-----------')
    ex4_path = study.DATA_DIR + '/ch06/ex4.csv'
    cat(ex4_path)
    p(pd.read_csv(ex4_path, skiprows=[0,2,3]))

    p('N/A------------------')
    ex5_path = study.DATA_DIR + '/ch06/ex5.csv'
    cat(ex5_path)
    result = pd.read_csv(ex5_path)
    p(result)
    p(pd.isnull(result))
    result = pd.read_csv(ex5_path, na_values=['NULL', '12']) # 12 is NA
    p(result)

    p('N/A dict------------------')
    sentinels = {'message': ['foo', 'NA'], 'something': ['two']}
    p(sentinels)
    p(pd.read_csv(ex5_path, na_values=sentinels))

    p('6.1.1 read data chunk size---------------------')
    ex6_path = study.DATA_DIR + '/ch06/ex6.csv'
    p(pd.read_csv(ex6_path).count())
    p(pd.read_csv(ex6_path, nrows=5))
    chunker = pd.read_csv(ex6_path, chunksize=1000)
    p(chunker)
    tot = Series([])
    for piece in chunker:
        tot = tot.add(piece['key'].value_counts(), fill_value=0)
    # Sorting returns a new Series; keep it so the top ten really are the
    # ten most frequent keys.
    tot = tot.sort_values(ascending=False)
    p(tot[:10])

    p('6.1.2 write---------------------')
    data = pd.read_csv(ex5_path)
    p(data)

    ex5_out_path = out_dir + '/ex5_out.csv'
    data.to_csv(ex5_out_path)
    cat(ex5_out_path)  # show what was just written

    data.to_csv(sys.stdout, index=False, header=False)
    print('')
    data.to_csv(sys.stdout, index=False, columns=list('abc'))
    print('')

    p('Series--------------')
    tseries_out_path = out_dir + '/tseries_out.csv'
    dates = pd.date_range('1/1/2000', periods=7)
    ts = Series(np.arange(7), index=dates)
    ts.to_csv(tseries_out_path)
    cat(tseries_out_path)
    # NOTE(review): Series.from_csv is deprecated in newer pandas; switch to
    # pd.read_csv once the target pandas version is settled.
    p(Series.from_csv(tseries_out_path, parse_dates=True))

    p('6.1.3 csv-------------------------')
    ex7_path = study.DATA_DIR + '/ch06/ex7.csv'
    cat(ex7_path)
    with open(ex7_path) as f:
        for line in csv.reader(f):
            print(line)
    with open(ex7_path) as f:
        lines = list(csv.reader(f))
    header, values = lines[0], lines[1:]
    # Transpose the data rows so each header maps to its column of values.
    data_dict = dict(zip(header, zip(*values)))
    p(data_dict)

    my_data_out_path = out_dir + '/mydata.csv'
    with open(my_data_out_path, 'w') as fp:
        writer = csv.writer(fp, dialect=my_dialect)
        writer.writerow(('one', 'two', 'three'))
        writer.writerow(('1', '2', '3'))
        writer.writerow(('4', '5', '6'))
        writer.writerow(('7', '8', '9'))
    cat(my_data_out_path)

    p('6.1.4 JSON-------------------------')
    obj = """
{"name": "Wes",
"places_lived": ["United States", "Spain", "Germany"],
"pet": null,
"siblings": [{"name": "Scott", "age": 25, "pet": "Zuko"},
             {"name": "Katie", "age": 33, "pet": "Cisco"}]
}
"""
    result = json.loads(obj)
    p(result)
    asjson = json.dumps(result)
    p(asjson)
    siblings = DataFrame(result['siblings'], columns=['name', 'age'])
    p(siblings)

    p('6.1.4 XML/HTML Web Scraping-------------------------')
    # Scraping is disabled while url is empty; set it to enable the section.
    url = '' #'http://finance.yahoo.com/q/op?s=AAPL+Options'
    if url != '':
        parsed = parse(urlopen(url))
        doc = parsed.getroot()
        p([lnk.get('href') for lnk in doc.findall('.//a')][-10:])

        tables = doc.findall('.//table')
        p(parse_options_data(tables[9])[:5])
        p(parse_options_data(tables[13])[:5])

    p('6.1.5 Read XML-------------------------')
    xml_path = out_dir + '/Performance_MNR.xml'
    xml_content ="""
<INDICATOR>
    <INDICATOR_SEQ>373889</INDICATOR_SEQ>
    <PARENT_SEQ></PARENT_SEQ>
    <AGENCY_NAME>MEtro-North Railroad</AGENCY_NAME>
    <INDICATOR_NAME>Escalator Availability</INDICATOR_NAME>
    <DESCRIPTION>Percent of the time that escalators are operational systemwide. The availability rate is based on physical observations performed the morning of regular business days only. This is a new indicator the agency began reporting in 2009.</DESCRIPTION>
    <PERIOD_YEAR>2011</PERIOD_YEAR>
    <PERIOD_MONTH>12</PERIOD_MONTH>
    <CATEGORY>Service Indicators</CATEGORY>
    <FREQUENCY>M</FREQUENCY>
    <DESIRED_CHANGE>U</DESIRED_CHANGE>
    <INDICATOR_UNIT>%</INDICATOR_UNIT>
    <DECIMAL_PLACES>1</DECIMAL_PLACES>
    <YTD_TARGET>97.00</YTD_TARGET>
    <YTD_ACTUAL></YTD_ACTUAL>
    <MONTHLY_TARGET>97.00</MONTHLY_TARGET>
    <MONTHLY_ACTUAL></MONTHLY_ACTUAL>
</INDICATOR>
"""
    # Create the sample document on first run only.
    if not os.path.exists(xml_path):
        with open(xml_path, 'w') as f:
            f.write(xml_content)
    with open(xml_path) as f:
        parsed = objectify.parse(f)
    root = parsed.getroot()
    data = []
    # NOTE(review): the sample document has a DESIRED_CHANGE tag but no
    # DESIRED_SEQ, so the third entry never matches -- confirm the intent.
    skip_fields = ['PARENT_SEQ', 'INDICATOR_SEQ',
                   'DESIRED_SEQ', 'DECIMAL_PLACES']
    p(dir(root))
    for elt in root: # .INDICATOR:
        el_data = {}
        for child in elt.getchildren():
            if child.tag in skip_fields:
                continue
            el_data[child.tag] = child.pyval
        data.append(el_data)
    perf = DataFrame(data)
    p(perf)

    tag = '<a href="http://google.com">Google</a>'
    root = objectify.parse(StringIO.StringIO(tag)).getroot()
    p(root)
    p(root.get('href'))
    p(root.text)
Example #18
0
                                       
df1
df2
df1+df2
df1.add(df2,fill_value = 0) # B, SF doesn't exist in either

ser3 = df2.ix[0]
ser3
df2-ser3

##### Ranking and Sorting
# sort_index(), sort_values()  (sort_values replaces the removed Series.order)
ser1 = Series(range(3),index=['c','a','b'])
ser1
ser1.sort_index() # order by Index (asc)
ser1.sort_values() # order by value (asc)
ser1.sort_values(ascending=False) # order by value (desc)

#rank()
from numpy.random import randn
ser2 = Series(randn(10))
ser2
ser2.rank()
# sorting by value puts a Series in the order of its item ranks

##### Summary
arr = np.array([[1,2,np.nan],[np.nan,3,4]])
arr
df1 = DataFrame(arr,index = ['a','b'],columns = ['one','two','three'])
df1
result=pd.read_csv('ch06/ex6.csv')


result=pd.read_csv('ch06/ex6.csv',nrows=5)

# Read the file chunk by chunk

chunker=pd.read_csv('ch06/ex6.csv',chunksize=1000)

# The explicit float dtype avoids pandas' warning/object dtype for an
# untyped empty Series.
tot=Series([], dtype=float)

for piece in chunker:
    tot=tot.add(piece['key'].value_counts(),fill_value=0)

# Sort the accumulated counts of the key column, largest first
# (Series.order() was replaced by sort_values()).
tot=tot.sort_values(ascending=False)

tot[:10]

#******************************************************
# Writing data out in text format

# Read
data=pd.read_csv('ch06/ex5.csv')

# Write the data out, comma-separated
data.to_csv('out.csv')

# Use '|' as the separator; nothing is written to a file, the text is printed
data.to_csv(sys.stdout,sep='|')
# Fill missing values with NULL
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
from numpy.random import randn

# Sort by index
ser1 = Series(range(3), index=['C', 'A', 'B'])
print(ser1)
print(ser1.sort_index())

# Sort by value (Series.order() was replaced by sort_values())
print(ser1.sort_values())

# Rank
ser2 = Series(randn(10))
print(ser2)
print(ser2.rank())  # Get the rank without actually sorting
print(ser2.sort_values())  # Sort
Example #21
0
###															###
###			   		DATAFRAME OPERATIONS					###
###															###
###															###
###############################################################

# go to http://pandas.pydata.org/pandas-docs/stable/cookbook.html for several examples

df3 + df4 #adds dataframes
df4.add(df3,fill_value=0) # same sum, but a value missing on one side counts as 0

# First row of the frame as a Series.  .ix was removed from pandas; .iloc
# selects by position, which is what "first row" means here.
ser3 = df3.iloc[0]

ser3.sort_index() # sorts according to index

# Series.order() was replaced by sort_values(); it returns a sorted copy.
ser5 = ser4.sort_values() # sorts according to value, but is NOT in place

# The in-place Series.sort() was removed as well.
ser4.sort_values(inplace=True) ## in place sorting

df1.sum() #sum columns

df1.sum(axis = 1) # sum rows

df1.min() # minimum values across columns

df1.idxmin() #index of the minimum values

df1.cumsum() # returns dataframe with cumulative sums across columns

df1.describe() # returns summary stats across columns
Example #22
0
format = lambda x: '%.2f' % x
frame.applymap(format)

## Sorting and ranking
obj = Series(range(4),index=['d','a','b','c'])
# The method name was missing here ("obj. ()"), which is a syntax error;
# sorting by index matches the DataFrame calls that follow.
obj.sort_index()
frame = DataFrame(np.arange(8).reshape((2,4)), index=['three','one'],
                  columns=['d','a','b','c'])
frame.sort_index()
# Sort by column name
frame.sort_index(axis=1)
# Descending
frame.sort_index(axis=1, ascending=False)
# Sort by value (Series.order() was replaced by sort_values())
obj = Series([4,7,-3,2])
obj.sort_values()
# Missing values are always placed at the end when sorting
# Sort by several columns.  sort_index(by=...) was replaced by
# sort_values(by=...); DataFrame never had an order() method.
frame = DataFrame({'b':[4,7,-3,2],'a':[0,1,0,1]})
frame.sort_values(by=['a','b'])
# Ranking
obj = Series([7,-5,7,4,2,0,4])
obj.rank()
# Equal values are ranked in order of appearance
obj.rank(method='first')
# Descending
obj.rank(ascending=False,method='max')
# Rank across the columns of each row
frame = DataFrame({'b':[4.3,7,-3,2],'a':[0,1,0,1],'c':[-2,5,8,-2.5]})
frame.rank(axis=1)
three  0  1  2  3
one    4  5  6  7
>>> frame.sort_index()
       d  a  b  c
one    4  5  6  7
three  0  1  2  3
>>> frame.sort_index(axis=1)
       a  b  c  d
three  1  2  3  0
one    5  6  7  4
>>> frame.sort_index(axis=1,ascending=False)
       d  c  b  a
three  0  3  2  1
one    4  7  6  5
>>> obj = Series([4, 7, -3, 2])
>>> obj.order()

Warning (from warnings module):
  File "__main__", line 1
FutureWarning: order is deprecated, use sort_values(...)
2   -3
3    2
0    4
1    7
dtype: int64
>>> df = DataFrame([[1.4, np.nan], [7.1, -4.5],
[np.nan, np.nan], [0.75, -1.3]],
index=['a', 'b', 'c', 'd'],
columns=['one', 'two'])
>>> df
    one  two
Example #24
0
# Lecture 21 - Rank and Sort

import numpy as np
import pandas as pd
from pandas import Series, DataFrame

ser1 = Series(range(3), index = ['C','A','B'])
ser1

# use sort_index to sort by index
ser1.sort_index()

# use sort_values to sort by values (Series.order() is the removed older
# name; the rest of this lecture already uses sort_values)
ser1.sort_values()

from numpy.random import randn
ser2 = Series(randn(10))
ser2

# ranking
ser2.sort_values()

ser2.rank()

ser2.sort_values(ascending = False)

ser3 = Series(randn(10))
ser3

ser3.rank()
ser3 = ser3.sort_values()
Example #25
0
def practice_one():
    """Walk through the chapter-6 data loading and storage examples.

    Covers chunked CSV reading with pandas, writing CSV, the ``csv``
    module and dialects, JSON, and HTML scraping with ``lxml``.  The
    function reads files under ``ch06/``, writes ``ch06/out.csv`` and
    ``mydata.csv``, prints to stdout, fetches a web page, and returns
    nothing.

    Fixes over the previous version:
      * ``value_counts`` is called (the bound method itself was being added);
      * ``Series.order`` became ``sort_values`` and ``cols=`` became
        ``columns=``;
      * the input path is ``ch06/ex5.csv`` (the dot was missing);
      * files are closed through ``with``;
      * the one-off delimiter is passed as ``delimiter='|'`` (``dialect='|'``
        names a dialect that does not exist);
      * ``my_dialect`` defines ``quoting``, which Python 3's csv module
        requires;
      * the URL list reads each link instead of repeating ``links[28]``;
      * ``urlopen`` is imported from its Python 3 location, with the
        Python 2 module as fallback.
    """
    # pandas parsing functions
    #   read_csv        load delimited data from a file, URL or file-like
    #                   object; the default delimiter is a comma
    #   read_table      same, but the default delimiter is a tab
    #   read_fwf        read fixed-width column data (no delimiters)
    #   read_clipboard  read data from the clipboard; read_table for the
    #                   clipboard
    #
    # read_csv / read_table arguments
    #   path            file system location, URL or file-like object
    #   sep, delimiter  character sequence or regular expression used to
    #                   split the fields of each row
    #   header          row number to use as column names; default 0 (first
    #                   row), None if there is no header row
    #   index_col       column number(s) or name(s) to use as the row index
    #   names           list of column names for the result
    #   skiprows        number of rows to skip from the start of the file,
    #                   or list of row numbers (from 0) to skip
    #   na_values       values to replace with NA
    #   comment         character(s) that split comments off the end of lines
    #   parse_dates     parse data as dates, default False; True tries all
    #                   columns; a list of column numbers/names also works
    #   keep_date_col   when joining columns to parse a date, keep the
    #                   joined columns; default False
    #   converters      dict mapping column number/name to a function
    #   dayfirst        treat ambiguous dates as international format
    #   date_parser     function used to parse dates
    #   nrows           number of rows to read
    #   iterator        return a TextParser for reading the file piecemeal
    #   chunksize       size of the file chunks (for iteration)
    #   skip_footer     number of rows to ignore at the end of the file
    #   verbose         print various parser output information
    #   encoding        text encoding for unicode
    #   squeeze         return a Series if the parsed data has one column
    #   thousands       thousands separator, e.g. ',' or '.'

    # Reading a text file in pieces (folder: ch06, file: ex6.csv)
    # Read the whole file ...
    pd.read_csv('ch06/ex6.csv')
    # ... or only a few rows, selected with nrows ...
    pd.read_csv('ch06/ex6.csv', nrows=5)
    # ... or chunk by chunk: chunksize (a row count) yields a TextParser.
    chunker = pd.read_csv('ch06/ex6.csv', chunksize=10)
    # The explicit float dtype avoids pandas' warning/object dtype for an
    # untyped empty Series.
    tot = Series([], dtype=float)
    for piece in chunker:
        # Aggregate the value counts of the message column.
        # NOTE(review): sibling examples for this file count a 'key' column;
        # confirm that ex6.csv really has a 'message' column.
        tot = tot.add(piece['message'].value_counts(), fill_value=0)
    tot = tot.sort_values(ascending=False)

    # Writing data out in text format
    data = pd.read_csv('ch06/ex5.csv')
    data.to_csv('ch06/out.csv')  # comma-separated file
    data.to_csv(sys.stdout, sep='|')  # '|' as the separator
    data.to_csv(sys.stdout, na_rep='NULL')  # missing values written as NULL
    data.to_csv(sys.stdout, index=False, header=False)
    data.to_csv(sys.stdout, index=False, columns=['a', 'b', 'c'])

    # Handling delimited formats by hand
    import csv
    with open('ch06/ex7.csv') as f:
        for line in csv.reader(f):
            print(line)

    with open('ch06/ex7.csv') as f:
        lines = list(csv.reader(f))
    header, values = lines[0], lines[1:]  # split header from data rows
    data_dict = {h: v for h, v in zip(header, zip(*values))}

    # A csv.Dialect subclass describing the file format
    class my_dialect(csv.Dialect):
        lineterminator = '\n'
        delimiter = ';'
        quotechar = '"'
        quoting = csv.QUOTE_MINIMAL

    with open('ch06/ex7.csv') as f:
        reader = csv.reader(f, dialect=my_dialect)
        # Without a subclass: pass individual dialect parameters as keywords
        reader = csv.reader(f, delimiter='|')
    # csv.Dialect attributes
    #   delimiter         one-character string separating fields, default ','
    #   lineterminator    line terminator for writing, default '\r\n'
    #   quotechar         quote character for fields with special
    #                     characters, default '"'
    #   quoting           quoting convention: csv.QUOTE_ALL (quote every
    #                     field), csv.QUOTE_MINIMAL (only fields with special
    #                     characters such as the delimiter),
    #                     csv.QUOTE_NONNUMERIC, csv.QUOTE_NONE (no quoting);
    #                     default QUOTE_MINIMAL
    #   skipinitialspace  ignore whitespace after each delimiter, default
    #                     False
    #   doublequote       how to handle the quote character inside a field;
    #                     if True it is doubled
    #   escapechar        string used to escape the delimiter, disabled by
    #                     default
    with open('mydata.csv', 'w') as f:
        writer = csv.writer(f, dialect=my_dialect)
        writer.writerow(('one', 'two', 'three'))
        writer.writerow(('1', '2', '3'))
        writer.writerow(('4', '5', '6'))
        writer.writerow(('7', '8', '9'))

    # JSON data
    obj = """
    {"name": "Wes",
     "places_lived": ["United States", "Spain", "Germany"],
     "pet": null,
     "siblings": [{"name": "Scott", "age": 25, "pet": "Zuko"},
                    {"name": "Katie", "age": 33, "pet": "Cisco"}]
    }
    """
    import json
    result = json.loads(obj)  # JSON text -> Python objects
    json.dumps(result)  # Python objects -> JSON text
    siblings = DataFrame(result['siblings'],
                         columns=['name', 'age'])  # list of dicts -> DataFrame

    # XML and HTML: web scraping
    from lxml.html import parse
    try:
        from urllib.request import urlopen  # Python 3
    except ImportError:
        from urllib2 import urlopen  # Python 2
    parsed = parse(urlopen('http://finance.yahoo.com/q/op?s=AAPL+Options'))
    doc = parsed.getroot()
    links = doc.findall('.//a')  # every anchor element
    links[28].get('href')  # the link's URL
    links[28].text_content()  # the link's text
    urls = [lnk.get('href') for lnk in links]  # every URL in the document

    tables = doc.findall('.//table')
    calls = tables[9]
    puts = tables[13]
    rows = calls.findall('.//tr')

    def _unpack(row, kind='td'):
        """Return the text content of every ``kind`` cell in ``row``."""
        elts = row.findall('.//%s' % kind)
        return [val.text_content() for val in elts]

    _unpack(rows[1], kind='th')
    _unpack(rows[1], kind='td')
    from pandas.io.parsers import TextParser

    def parse_options_data(table):
        """Turn an HTML table into a DataFrame; the first row is the header."""
        rows = table.findall('.//tr')
        header = _unpack(rows[0], kind='th')
        data = [_unpack(r) for r in rows[1:]]
        return TextParser(data, names=header).get_chunk()

    parse_options_data(calls)
    parse_options_data(puts)

# -----------------------------------------------------------------------------
# Closer look at the data
# -----------------------------------------------------------------------------


# Some stats about Entropy gain, importance, Hamtropy, etc.

# Entropy gain of every training feature with respect to the boolean target
from rg_toolbox_math import entropy_gain
#eg = Series([entropy_gain(data_mlnd[feat],y) for feat in feat_mlnd_nm], index=feat_mlnd_nm)
# NOTE(review): the values follow X_train.columns while the labels come from
# feat_mlnd_nm; the two must list the same features in the same order or
# the labels will be wrong -- confirm, or index by X_train.columns.
eg = Series([entropy_gain(X_train[feat],y_train_bol) for feat in X_train.columns], index=feat_mlnd_nm)
#n, Ix = hist_discr(X_train['school_GP'])
#entropy_gain(X_train['school_GP'],y_train_bol)
# Largest gain first (Series.order() was replaced by sort_values()).
egs = eg.sort_values(ascending=False)
ixs_eg = egs.index

# Bar chart of the sorted gains, one bar per feature
import matplotlib.pyplot as pl
ind = np.arange(len(egs))
fig = pl.figure(1, figsize=(9, 4))
ax = fig.add_subplot(111)
ax.bar(ind,egs)
pl.xticks(ind, ixs_eg, rotation=90)
ax.set_title('Entropy Gain')

# Using a tree for feature importance
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(criterion='entropy', max_depth=3)
clf.fit(X_train, y_train)
z = Series(clf.feature_importances_, X_train.columns)
Example #27
0
# three  0  1  2  3
#列排序
print frame.sort_index(axis=1)
#        a  b  c  d
# three  1  2  3  0
# one    5  6  7  4

#数据默认是升序排序, 也可以指定降序排序
print frame.sort_index(axis=1, ascending=False)
#        d  c  b  a
# three  0  3  2  1
# one    4  7  6  5

# Sort a Series by value with sort_values (the older order method was
# removed from pandas)
obj = Series([4, 7, -3, 2])
print(obj.sort_values())
# 2   -3
# 3    2
# 0    4
# 1    7
# When sorting, missing values are placed at the end of the Series by default
obj = Series([4, np.nan, 7, np.nan, -3, 2])
print(obj.sort_values())
# 4    -3
# 5     2
# 0     4
# 2     7
# 1   NaN
# 3   NaN
#df上,通过一列或多列的值进行排序