import json

import arrow
import pandas as pd
import requests
from pandas import DataFrame


def parse_sw_history2(begin_date='2014-03-12', end_date=None, code='801150'):
    if end_date is None:
        end_date = str(arrow.now().date())
    condition = "swindexcode='{}' and BargainDate>='{}' and BargainDate<='{}' and type='Day'"
    where = condition.format(code, begin_date, end_date)
    all_data = []
    for index in range(1, 1000):
        payload = {'tablename': 'V_Report',
                   'key': 'id',
                   'p': index,
                   'where': where,
                   'orderby': 'swindexcode asc,BargainDate_1',
                   'fieldlist': 'SwIndexCode,SwIndexName,BargainDate,CloseIndex,BargainAmount,Markup,'
                                'TurnoverRate,PE,PB,MeanPrice,BargainSumRate,DP',
                   'pagecount': 993,
                   'timed': 1456667319778}
        url = 'http://www.swsindex.com/handler.aspx'
        res = requests.post(url, data=payload)
        # the endpoint returns single-quoted pseudo-JSON; normalize before parsing
        data = res.text.replace('\'', '"')
        result = json.loads(data)
        data_list = result.get('root')
        if not data_list:
            break
        all_data.extend(data_list)
    df = DataFrame(all_data)
    if 'PE' not in df:
        return
    # drop rows with an empty PE or PB
    df = df[df['PE'] != '']
    df = df[df['PB'] != '']
    # convert date strings to timestamps and PE/PB to floats
    df['BargainDate'] = pd.to_datetime(df['BargainDate'])
    df[['PE', 'PB']] = df[['PE', 'PB']].astype(float)
    # sorted views (pandas removed df.sort(); sort_values() is the replacement)
    df_sort_pe = df.sort_values(by='PE', ascending=True)
    df_sort_pb = df.sort_values(by='PB', ascending=True)
    return df
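# A minimal usage sketch for the scraper above; the date range is illustrative
# and network access to www.swsindex.com is assumed. The printed stats mirror
# the commented-out diagnostics in the original version of the function.
if __name__ == '__main__':
    df = parse_sw_history2(begin_date='2015-01-01', end_date='2016-01-01')
    if df is not None:
        print('PE mean:{}'.format(df['PE'].mean()))
        print('PB mean:{}'.format(df['PB'].mean()))
        print('PB<1:{}'.format(df[df.PB < 1]))  # rows trading below book value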
import argparse
import itertools
import math
import os
from math import sqrt

import matplotlib as mpl
import numpy as np
import pandas as pd
from pandas import DataFrame

import pairedttest  # local module providing pairedt()

# Minimal argument parser inferred from how `args` is used below; the original
# definition is not shown in this section, so treat these flags as assumptions.
parser = argparse.ArgumentParser(description='column.py: bar chart of per-run evaluation metrics')
parser.add_argument('--format', default='trec_eval', help='galago_eval or trec_eval')
parser.add_argument('--metric', required=True, help='metric key to plot, e.g. map')
parser.add_argument('--out', required=True, help='output image file')
parser.add_argument('--sort', action='store_true', help='sort runs by mean metric')
parser.add_argument('-c', dest='c', action='store_true',
                    help='normalize by num_q instead of the usable query count')
parser.add_argument('runs', nargs='+', help='evaluation output files, one per run')


def main():
    mpl.use("Agg")
    import matplotlib.pyplot as plt  # import after selecting the non-interactive backend

    args = parser.parse_args()

    def read_ssv(fname):
        # whitespace-separated eval output; trec_eval puts the query id second
        lines = [line.split() for line in open(fname, 'r')]
        if args.format.lower() == 'galago_eval':
            return lines
        elif args.format.lower() == 'trec_eval':
            return [[line[1], line[0]] + line[2:] for line in lines]

    def readNumQueries(run):
        tsv = read_ssv(run)
        data = [int(row[2]) for row in tsv if row[0] == "all" and row[1] == numQueries_key]
        return data[0]

    def findQueriesWithNanValues(run):
        tsv = read_ssv(run)
        return {row[0] for row in tsv
                if row[1] == 'num_rel' and (float(row[2]) == 0.0 or math.isnan(float(row[2])))}

    def fetchValues(run):
        tsv = read_ssv(run)
        return {row[0]: float(row[2]) for row in tsv
                if row[1] == args.metric and not math.isnan(float(row[2]))}

    pairedt = pairedttest.pairedt(best=True, format=args.format, metric=args.metric, runs=args.runs)
    print("paired t")
    print(pairedt)
    print("=-----=")
    numQueries_key = "num_q"
    print("column.py metric=" + args.metric + " out=" + args.out)
    datas = {run: fetchValues(run) for run in args.runs}

    # drop queries with NaN or zero num_rel in any run, plus the 'all' summary row
    queriesWithNanValues = {'all'}.union(*[findQueriesWithNanValues(run) for run in args.runs])
    basedata = datas[args.runs[0]]
    queries = set(basedata.keys()).difference(queriesWithNanValues)
    numQueries = readNumQueries(args.runs[0]) if args.c else len(queries)

    seriesDict = {'mean': dict(), 'stderr': dict()}
    for run in datas:
        data = datas[run]
        if sum(key not in data for key in queries) > 0:
            print("data for run " + run + " does not contain all queries " + " ".join(queries))
        # missing queries count as 0.0 so every run is averaged over the same set
        mean = np.sum([data.get(key, 0.0) for key in queries]) / numQueries
        stderr = np.std([data.get(key, 0.0) for key in queries]
                        + [0.0] * (numQueries - len(queries))) / sqrt(numQueries)
        seriesDict['mean'][run] = mean
        seriesDict['stderr'][run] = stderr

    print("dropping queries because of NaN values: " + " ".join(queriesWithNanValues))
    print('\t'.join(['run', 'mean', 'stderr']))
    for run in datas:
        print('\t'.join([run, str(seriesDict['mean'][run]), str(seriesDict['stderr'][run])]))

    df1 = DataFrame(seriesDict, index=pd.Index(args.runs))
    if args.sort:
        df1.sort_values('mean', ascending=False, inplace=True)
    df2 = df1['mean']
    df2.index = [os.path.basename(label) for label in df1.index]
    df1.index = [os.path.basename(label) for label in df1.index]
    print('df2.index=', df2.index)
    # '**' marks runs not significantly different from the best (p > 0.05)
    sig_text = ['**' if (math.isnan(pairedt[label][1]) or pairedt[label][1] > 0.05) else ''
                for label in df2.index]
    min_same_idx = max([i if (math.isnan(pairedt[label][1]) or pairedt[label][1] > 0.05) else 0
                        for i, label in enumerate(df2.index)])
    # one gray level per 3-character run-name prefix
    cs = {k: v for k, v in zip(sorted({label[0:3] for label in df1.index}),
                               itertools.cycle(['0.1', '0.9', '0.5', '0.3', '0.7',
                                                '0.2', '0.8', '0.4', '0.6']))}
    df1['color'] = [cs[label[0:3]] for label in df1.index]
    print(df1['color'])
    fig, ax = plt.subplots()
    df2.plot.bar(yerr=df1['stderr'], color=df1.color.values, ax=ax)
    for p, i in zip(ax.patches, range(100)):
        if args.sort:
            if i == min_same_idx:
                # arrow spanning the bars that are statistically tied with the best run
                frompoint = (p.get_x() + p.get_width(), p.get_height() / 2.0)
                topoint = (0.0 - p.get_width() / 2.0, p.get_height() / 2.0)
                ax.annotate("", xy=topoint, xycoords='data',
                            xytext=frompoint, textcoords='data',
                            arrowprops=dict(arrowstyle="<|-|>", connectionstyle="arc3", ec='r'))
        else:
            ax.annotate(sig_text[i],
                        xy=(p.get_x() + p.get_width() / 2.0, p.get_height() * 0.9),
                        ha='center', va='center')
    ax.grid()
    plt.ylabel(args.metric, fontsize=20)
    plt.tick_params(axis='both', which='major', labelsize=20)
    plt.xticks(rotation=90)
    plt.savefig(args.out, bbox_inches='tight')
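# Entry point for the script above ("column.py" per the log line in main()).
# The invocation below is illustrative; the flags come from the assumed parser
# definition and the file names are hypothetical:
#   python column.py --metric map --out columns.png --sort run_a.eval run_b.eval
if __name__ == '__main__':
    main()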