Example #1
0
def main():
    """Demo of positional vs. label indexing on Series, DataFrame and Panel.

    NOTE(review): written for legacy Python 2 / pandas — ``.ix``,
    ``iget_value``, ``irow`` and ``pd.Panel`` were deprecated and later
    removed, and ``pd.io.data`` moved to the pandas-datareader package.
    The Panel section needs network access (Yahoo Finance).
    """
    ser = Series(np.arange(3.))
    ser2 = Series(np.arange(3.), index=list('abc'))
    print ser
    print ser2
    print '',''
    # print ser[-1]
    # -1 on a label (string) index falls back to positional access: last element
    print ser2[-1]
    print '',''
    # .ix on an integer index is label-based, so the slice end 1 is inclusive
    print ser.ix[:1]

    print '',''
    ser3 = Series(range(3), index=[-5, 1, 3])
    print ser3
    print '',''
    # position-based access regardless of index type (modern equivalent: .iloc[2])
    print ser3.iget_value(2)
    print '',''
    frame = DataFrame(np.arange(6).reshape((3, 2)), index=[2, 0, 1])
    print frame
    print '',''
    # first row by position (modern equivalent: .iloc[0])
    print frame.irow(0)

    # panel
    # each item (column) of the 3-D DataFrame (Panel) is itself a DataFrame
    print '','----------------------'
    lst = ['AAPL', 'MSFT'] # , 'DELL', 'GOOG'
    # downloads daily quotes from Yahoo Finance — network access required
    pdata = pd.Panel(dict((stk, pd.io.data.get_data_yahoo(stk, '1/1/2009', '6/1/2012'))
                           for stk in lst))
    if not pdata.empty:
        print pdata
        pdata = pdata.swapaxes('items', 'minor')
        print '',''
        print pdata['Adj Close']
        print '',''
        # .ix generalizes to three dimensions: items, major axis (dates), minor axis
        print pdata.ix[:, '6/1/2012', :]
        print '',''
        print pdata.ix['Adj Close', '5/22/2012', :]
        print '', ''
        print type(pdata.ix[:, '5/30/2012', :]) # DataFrame
        if hasattr(pdata.ix[:, '5/30/2012', :], 'to_frame'):
            # "stacked" long-format DataFrame representation of the panel slice
            stacked = pdata.ix[:, '5/30/2012', :].to_frame()
            print stacked
            print '',''
            print stacked.to_panel()
        if hasattr(pdata, 'to_frame'):
            f1 = pdata.to_frame()
            print f1
            print '',''
            print f1.to_panel()
            print '',''
Example #2
0
def getComptes(year):
	"""Scrape and print the consolidated accounts of Paris for a budget year.

	Fetches the French government finance page for ``year``, extracts the
	"Eur/hab" (euros per inhabitant) and "Moy strate" (stratum average)
	table columns, and prints the four consolidated result rows.

	NOTE(review): relies on the module-level helper ``getSoupFromUrl`` and
	on the legacy ``DataFrame.irow`` API (removed in later pandas).
	"""
	urlBase = 'http://alize2.finances.gouv.fr/communes/eneuro/detail.php?icom=056&dep=075&type=BPS&param=5&exercice='
	urlToCook = urlBase + str(year)
	soup = getSoupFromUrl(urlToCook)
	# 2nd <td> column: euros per inhabitant; 3rd: average for the stratum
	colEur_p_hab = soup.select("body > table:nth-of-type(3) tr > td:nth-of-type(2)")
	colMoy_d_str = soup.select("body > table:nth-of-type(3) tr > td:nth-of-type(3)")
	numEur_p_hab = []
	numMoy_d_str = []

	rowst = 3
	rowskip = 2  # careful: there are 2 table headers that are not tr
	# figures use spaces as thousands separators; strip them before int()
	for c1 in colEur_p_hab[rowst:]:
		if c1.text!='':
			numEur_p_hab.append(int(c1.text.replace(' ', '')) )
	for c2 in colMoy_d_str[rowst:]:
		if c2.text!='':
			numMoy_d_str.append(int(c2.text.replace(' ', '')) )

	data = DataFrame({'Eur/hab': numEur_p_hab,'Moy strate':numMoy_d_str})
	# rows 5, 9, 16, 21 of the page, shifted by the skipped header rows
	rowIds=[i-(rowst+rowskip) for i in [5,9,16,21]]
	print "Résultats consolidés pour la ville de Paris (exercice "+str(year)+")"
	print data.irow(rowIds)
Example #3
0
# --- Example #3: interactive-session transcript, not a runnable script. ---
# NOTE(review): lines below are kept verbatim, including the user's typos
# (iraw, iloc(2), undefined `i`); `ser`, `ser2`, `np`, `pd` come from the
# surrounding session. Three lines are literal SyntaxErrors, flagged below.
ser2[-1]
ser.ix[:1]
ser3=Series(range(3),index=[-5,1,3])
ser3.iget_value(2)
ser3.iloc(2)
ser3.iloc[i]
ser3.iloc[2]
ser3.iat[2]
ser3
ser3.iloc[1]
ser3.iloc[0]
ser3.iloc[3]
ser3.iraw(0)
# SyntaxError on the next line: stray ')' after reshape(3,2) — corrected on the line after
frame = DataFrame(np.arange(6).reshape(3,2)),index=[2,0,1])
frame = DataFrame(np.arange(6).reshape((3,2)),index=[2,0,1])
frame.irow(0)
frame.iloc[0]
frame
frame.iloc[3]
frame.iloc[2]
# SyntaxError on the next line: missing '(' before stk in dict(...) — corrected on the line after
pdata = pd.Panel(dict(stk, data.get_data_google(stk,'1/1/2009','6/1/2012')) for stk in ['AAPL','GOOG','MSFT','DELL']))
pdata = pd.Panel(dict((stk, data.get_data_google(stk,'1/1/2009','6/1/2012')) for stk in ['AAPL','GOOG','MSFT','DELL']))
from pandas_datareader import data
pdata = pd.Panel(dict((stk, data.get_data_google(stk,'1/1/2009','6/1/2012')) for stk in ['AAPL','GOOG','MSFT','DELL']))
pdata
pdata = pdata.swapaxes('items','minor')
pdata['Close']
# SyntaxError on the next line: unterminated string (stray '"') — corrected on the line after
pdata.ix[:,'6/1/2012',"]
pdata.ix[:,'6/1/2012',:]
pdata.ix['Close','5/22/2012':,:]
stacked = pdata.ix['Close','5/22/2012':,:].to_frame()
Example #4
0
print '字母索引可以直接使用-1来访问最后一个', ser2[-1]

# 如果轴索引含有索引器, 那么根据整数进行数据选取操作,是面向标签的,不是面向排序的
print '3是面向标签的,不是面向位置的', ser_neg.ix[:3]

# 可靠的,不考虑索引类型和基于位置的索引
# 使用Series的iget_value
ser3 = Series(range(3), index=[-5, 1, 3])
# 这个future可能会被取消,尽量使用等价的那个
print ser3.iget_value(2)
# 等价于
print ser3.iloc[2]
# 对于frame可以使用irow
frame = DataFrame(np.arange(6).reshape(3, 2), index=[2, 0, 1])
# 下面这个future可能会被取消,所以尽量使用等价的那一个
print frame.irow(0)
print frame.iloc(0)

# 面板数据
# pandas有一个Panel数据结构, 可以理解为一个三维版本的DataFrame
pdata = pd.Panel(dict((stk, web.get_data_yahoo(stk, '1/1/2009', '6/1/2012')) for stk in ['AAPL', 'GOOG', 'MSFT', 'DELL']))
print 'Panel', pdata
# Panel的每一项都是DataFrame
# 交换2个轴
pdata = pdata.swapaxes('items', 'minor')
print pdata['Adj Close']
# 基于ix的标签索引被推广到三个维度
print pdata.ix[:, '6/1/2012', :]
print pdata.ix['Adj Close', '5/22/2012':, :]

# 呈现面板数据,尤其是面对拟合统计模型,使用"堆积式的"DataFrame形式
def main():
    """Tour of core pandas operations: reindexing, dropping, indexing and
    slicing, arithmetic alignment, function application, sorting/ranking,
    and axes with duplicate labels.

    NOTE(review): Python 2 print statements and several legacy pandas APIs
    (.ix, xs(axis=...), icol/irow, Series.order, sort_index(by=...)) —
    deprecated/removed in later pandas releases.
    """
    # reindex
    obj = Series(range(4), index="a b c d".split(" ")[::-1])
    print obj

    # reindexing to a superset of labels introduces NaN for the new label 'e'
    obj2 = obj.reindex("a b c d e".split(" "))
    print obj2

    # Change NaN
    print obj.reindex("a b c d e".split(" "), fill_value=0)
    colors = ["blue", "purple", "yellow"]
    index = [0, 2, 4]
    obj3 = Series(colors, index=index)
    print obj3.reindex(range(6))
    print obj3.reindex(range(6), method="ffill")  # not found: forward fill
    print obj3.reindex(range(6), method="backfill")  # bfill
    # DataFrame
    states = ["Ohio", "Texas", "California"]
    frame = DataFrame(np.arange(9).reshape((3, 3)), index="a b c".split(" "), columns=["Ohio", "Texas", "California"])
    print frame
    frame2 = frame.reindex("a b c d".split(" "))
    print frame2
    states[0] = "Utah"
    # swap the first two entries: states becomes ["Texas", "Utah", "California"]
    states[1], states[0] = states[:2]
    print frame.reindex(columns=states)
    # fill
    print frame.reindex("a b c d".split(" "), method="ffill", columns=states)
    print frame.ix["a b c d".split(" ")]
    print frame.ix["a b c d".split(" "), states]

    # Delete column
    print "", ""
    obj = Series(range(5), index="a b c d e".split(" "))
    # drop returns a new object; obj itself is unchanged
    new_obj = obj.drop("c")
    print new_obj
    print obj

    # Index reference
    print "", ""
    obj = Series(np.arange(4.0), index="a b c d".split(" "))
    print obj["b"]
    print obj[1]  # same
    print obj[2:4]
    print obj[["b", "a", "c"]]
    print obj[[1, 3]]
    print obj[obj < 2]
    # Slice with label
    print obj["b":"c"]  # include 'c'
    obj["b":"c"] = 5
    print obj

    data = DataFrame(
        np.arange(16).reshape((4, 4)),
        index=["Ohio", "Colorado", "Utah", "New York"],
        columns=["one", "two", "three", "four"],
    )
    print data
    # column
    print data["two"]
    print data[["three", "one"]]
    # row
    print data[:2]
    print data[data["three"] > 5]
    # all values
    print data < 5
    data[data < 5] = 0
    print data
    # row and column
    print data.ix[["Colorado"], ["two", "three"]]
    print data.ix[["Colorado", "Utah"], [3, 0, 1]]
    # row
    print data.ix[2]
    # label row and column, return column
    print data.ix[:"Utah", "two"]
    # xs
    # row
    print data.xs("Utah")
    print data.xs("Utah", axis=0)
    # rows
    print data.xs("two", axis=1)
    # icol/irow: i is (positional) index
    print data.icol(1)
    print data.irow(1)

    # Union
    print "", ""
    s1 = Series([7.3, -2.5, 3.4, 1.5], index=["a", "c", "d", "e"])
    s2 = Series([-2.1, 3.6, -1.5, 4, 3.1], index=["a", "c", "e", "f", "g"])
    print s1
    print s2
    # index is union, but d, f, g are NaN
    print s1 + s2
    df1 = DataFrame(np.arange(9.0).reshape((3, 3)), columns=list("bcd"), index=["Ohio", "Texas", "Colorado"])
    df2 = DataFrame(np.arange(12.0).reshape((4, 3)), columns=list("bde"), index=["Utah", "Ohio", "Texas", "Oregon"])
    print df1
    print df2
    print df1 + df2

    # arithmetic method
    print "", ""
    df1 = DataFrame(np.arange(12.0).reshape((3, 4)), columns=list("abcd"))
    df2 = DataFrame(np.arange(20.0).reshape((4, 5)), columns=list("abcde"))
    print df1
    print df2
    print df1.add(df2, fill_value=0)
    # reindex has fill_value argument
    # other arithmetic method are sub/div/mul(ti)

    # Calculation in a DataFrame and Series
    print "", ""
    # subtract from each row: broadcast
    arr = np.arange(12.0).reshape((3, 4))
    print arr
    print arr[0]
    print arr - arr[0]
    frame = DataFrame(np.arange(12.0).reshape((4, 3)), columns=list("bde"), index=["Utah", "Ohio", "Texas", "Oregon"])
    series = frame.ix[0]
    print frame
    print series
    print frame - series

    # non-overlapping labels ('f') align to NaN columns
    series2 = Series(range(3), index=list("bef"))
    print frame + series2

    series3 = frame["d"]
    series4 = frame.ix[0]
    print frame
    print series3
    print series4
    # axis=0: match on the row index; axis=1: match on the columns
    print frame.sub(series3, axis=0)
    print frame.sub(series4, axis=1)

    # apply function and mapping
    print "", ""
    frame = DataFrame(np.arange(12.0).reshape((4, 3)), columns=list("bde"), index=["Utah", "Ohio", "Texas", "Oregon"])
    print frame
    f = lambda x: x.max() - x.min()
    print frame.apply(f)
    print frame.apply(f, axis=1)

    # apply may also return a Series per column
    f = lambda x: Series([x.min(), x.max()], index=["min", "max"])
    print frame.apply(f)

    format = lambda x: "{0:.2f}".format(x)
    print frame.applymap(format)  # frame
    print frame["e"].map(format)  # series

    # sort and rank
    print "", ""
    obj = Series(range(4), index=list("dabc"))
    print obj
    print obj.sort_index()

    frame = DataFrame(np.arange(8).reshape((2, 4)), index=["three", "one"], columns=list("dabc"))
    print frame
    print frame.sort_index()
    print frame.sort_index(axis=1)
    print frame.sort_index(axis=1, ascending=False)

    # Sorting series
    print "", ""
    obj = Series([4, 7, -3, 2])
    print obj.order()
    obj = Series([4, np.nan, 7, np.nan, -3, 2])
    print obj.order()
    print obj.order(ascending=False)

    # order by multi columns
    print "", ""
    frame = DataFrame({"b": [4, 7, -3, 2], "a": [0, 1, 0, 1]})
    print frame.sort_index(by=["a", "b"])

    # rank
    print "", ""
    obj = Series([7, -5, 7, 4, 2, 0, 4])
    print obj.rank()  # method is average
    print obj.rank(method="first")  # No Duplicates
    print obj.rank(ascending=False, method="min")
    print obj.rank(ascending=False, method="max")
    f1 = DataFrame(obj, columns=["data"])
    f2 = DataFrame(obj.rank(), columns=["rank"])
    # merge by each index
    print pd.merge(f1, f2, left_index=True, right_index=True)

    # Index of the axis with duplicate values
    print "", ""
    obj = Series(range(5), index=list("aaabc"))
    print obj
    print obj.index.is_unique
    print obj["a"]
    print obj["c"]

    df = DataFrame(np.arange(12.0).reshape((4, 3)), index=list("aabb"), columns=list("ccd"))
    print df
    print df.ix["b"]
    print df["c"]
    {'a':range(0,10),'b':range(10,20),'c':range(20,30)}
)
# NOTE(review): the two lines above are the orphaned tail of a truncated
# DataFrame(...) constructor from the original notebook; the `#%%` cells
# below assume `d` is that DataFrame (and `d1` a label-indexed variant),
# so this fragment is not runnable standalone.
#%%
d[0]          # error: an integer key is not a column label
#%% 
d['a']        # Series, one column
#%%
d[['a','c']]  # DataFrame, columns
#%%
d[:5]         # DataFrame, rows
#%% 
d.ix[:5]      # position-based, rows
#%%
d1.ix[:5]     # label-based, rows
#%%           
d.irow(0)     # Series
#%% 
d.icol(0)     # Series
#%%
d.get_value('e','a')    # get_value(row_name,col_name)
#%% methods that force position-based element access
d.iget_value(0,1)       # iget_value(irow,icol)  

#%% filtering with a boolean condition
d[d>5]
#%% 
d[d.a>5]
#%%
d[(d>5)&(d%3==0)]

#%% what boolean filtering really does
Example #7
0
def main():
    """Run the vbench benchmark suite for a baseline and a target commit.

    Reads the module globals ``args`` (parsed CLI options) and the helpers
    ``prprint``, ``get_results_df``, ``_parse_wrapper`` and
    ``_parse_commit_log`` defined elsewhere in this module.  Results are
    written to the benchmark DB and to a log file, and a formatted
    target-vs-baseline comparison table is printed.

    NOTE(review): uses the legacy pandas APIs ``DataFrame.irow``,
    ``DataFrame.ix`` and ``DataFrame.sort`` — fine for the environment this
    script targets, but removed in modern pandas.
    """
    from pandas import DataFrame
    from vbench.api import BenchmarkRunner
    from vbench.db import BenchmarkDB
    from vbench.git import GitRepo
    from suite import REPO_PATH, BUILD, DB_PATH, PREPARE, dependencies, benchmarks

    # GitRepo wants exactly 7 character hash?
    args.base_commit = args.base_commit[:7]
    if args.target_commit:
        args.target_commit = args.target_commit[:7]

    if not args.log_file:
        args.log_file = os.path.abspath(
            os.path.join(REPO_PATH, 'vb_suite.log'))

    # seed both RNGs so benchmark selection/ordering is reproducible
    random.seed(args.seed)
    np.random.seed(args.seed)

    TMP_DIR = tempfile.mkdtemp()
    prprint("TMP_DIR = %s" % TMP_DIR)
    prprint("LOG_FILE = %s\n" % args.log_file)

    benchmarks = [x for x in benchmarks if re.search(args.regex, x.name)]

    # FIX: predeclare logfile so the finally block cannot raise NameError
    # (masking the original exception) when open() — or anything before the
    # first assignment — fails inside the try.
    logfile = None
    try:
        logfile = open(args.log_file, 'w')

        prprint("Opening DB at '%s'...\n" % DB_PATH)
        db = BenchmarkDB(DB_PATH)

        prprint("Initializing Runner...")

        # all in a good cause...
        GitRepo._parse_commit_log = _parse_wrapper(args.base_commit)

        runner = BenchmarkRunner(
            benchmarks, REPO_PATH, REPO_PATH, BUILD, DB_PATH,
            TMP_DIR, PREPARE, always_clean=True,
            # run_option='eod', start_date=START_DATE,
            module_dependencies=dependencies)

        repo = runner.repo  # (steal the parsed git repo used by runner)

        # ARGH. reparse the repo, without discarding any commits,
        # then overwrite the previous parse results
        # prprint ("Slaughtering kittens..." )
        (repo.shas, repo.messages,
         repo.timestamps, repo.authors) = _parse_commit_log(None, REPO_PATH,
                                                            args.base_commit)

        h_head = args.target_commit or repo.shas[-1]
        h_baseline = args.base_commit

        prprint('Target [%s] : %s\n' % (h_head, repo.messages.get(h_head, "")))
        prprint('Baseline [%s] : %s\n' % (h_baseline,
                repo.messages.get(h_baseline, "")))

        prprint("removing any previous measurements for the commits.")
        db.delete_rev_results(h_baseline)
        db.delete_rev_results(h_head)

        # TODO: we could skip this, but we need to make sure all
        # results are in the DB, which is a little tricky with
        # start dates and so on.
        prprint("Running benchmarks for baseline [%s]" % h_baseline)
        runner._run_and_write_results(h_baseline)

        prprint("Running benchmarks for target [%s]" % h_head)
        runner._run_and_write_results(h_head)

        prprint('Processing results...')

        head_res = get_results_df(db, h_head)
        baseline_res = get_results_df(db, h_baseline)
        ratio = head_res['timing'] / baseline_res['timing']
        totals = DataFrame(dict(t_head=head_res['timing'],
                                t_baseline=baseline_res['timing'],
                                ratio=ratio,
                                name=baseline_res.name), columns=["t_head", "t_baseline", "ratio", "name"])
        totals = totals.ix[totals.t_head > args.min_duration]  # ignore below threshold
        totals = totals.dropna(
        ).sort("ratio").set_index('name')  # sort in ascending order

        hdr = ftr = """
-----------------------------------------------------------------------
Test name                      | target[ms] |  base[ms]  |   ratio    |
-----------------------------------------------------------------------
""".strip() + "\n"

        s = "\n"
        s += hdr
        for i in range(len(totals)):
            t, b, r = totals.irow(i).values
            s += "{0:30s} {1: 12.4f} {2: 12.4f} {3: 12.4f}\n".format(totals.index[i], t, b, r)
        s += ftr + "\n"

        s += "Ratio < 1.0 means the target commit is faster then the baseline.\n"
        s += "Seed used: %d\n\n" % args.seed

        s += 'Target [%s] : %s\n' % (h_head, repo.messages.get(h_head, ""))
        s += 'Base   [%s] : %s\n\n' % (
            h_baseline, repo.messages.get(h_baseline, ""))

        logfile.write(s)
        logfile.close()

        prprint(s)
        prprint("Results were also written to the logfile at '%s'\n" %
                args.log_file)

    finally:
        #        print("Disposing of TMP_DIR: %s" % TMP_DIR)
        shutil.rmtree(TMP_DIR)
        if logfile is not None:
            # file.close() is idempotent, so this is safe after the close() above
            logfile.close()
Example #8
0
# Example adapted to run on modern pandas: the removed .ix / iget_value /
# irow accessors are replaced with their .loc / .iloc equivalents.
# FIX: `ser` was used below but never defined in this snippet (NameError);
# reconstructed from the tutorial context (NOTE(review): verify against source).
ser = Series(np.arange(3.))
ser2 = Series(np.arange(3.), index=['a','b','c'])
print(ser2)
print('\n')
# was ser2[-1]: positional fallback for an integer key on a label index
# was deprecated and later removed — .iloc[-1] is the explicit equivalent
print(ser2.iloc[-1])
print('\n')
# was ser.ix[:1]: on an integer index .ix sliced by LABEL (end-inclusive),
# so the modern equivalent is .loc[:1], not .iloc[:1]
print(ser.loc[:1])
print('\n')

ser3 = Series(range(3), index=[-5,1,3])
# was ser3.iget_value(2): position-based scalar access → .iloc[2]
print(ser3.iloc[2])
print('\n')

###############################################################

frame = DataFrame(np.arange(6).reshape(3,2), index=[2,0,1])
# was frame.irow(0): first row by position → .iloc[0]
print(frame.iloc[0])