def xueqiu_history(code='600036', access_token=xq_a_token, begin_date=None, end_date=None):
    """Fetch daily K-line history from Xueqiu and build StockHistory records.

    Parameters
    ----------
    code : str
        Bare 6-digit A-share code ('600036'), 5-digit HK code, or an
        already-prefixed 8-char symbol; a market prefix (SH/SZ/HK) is added
        when missing.
    access_token : str
        Xueqiu access token, sent as a query parameter.
    begin_date, end_date : int or None
        Millisecond epoch timestamps; default to 2014-01-01 and now.

    Returns
    -------
    list of StockHistory
    """
    if begin_date is None:
        # arrow < 1.0: .timestamp is a property in seconds; Xueqiu wants ms.
        begin_date = arrow.get('2014-01-01').timestamp * 1000
    if end_date is None:
        end_date = arrow.now().timestamp * 1000
    # Normalise the symbol: 8 chars means the market prefix is already there.
    if len(code) == 8:
        pass
    elif code.startswith('60') or code.startswith('51'):
        code = 'SH' + code
    elif len(code) == 5:
        code = 'HK' + code
    elif len(code) == 6:
        code = 'SZ' + code
    url = 'http://xueqiu.com/stock/forchartk/stocklist.json?symbol={}&period=1day&type=normal&begin={}&end={}&_=1443694358741'
    url = url.format(code, begin_date, end_date)
    payload = {'access_token': access_token}
    r = requests.get(url, params=payload, headers=headers)
    data_list = r.json().get('chartlist')
    result = []
    for data in data_list:
        # 'time' arrives like 'Mon Sep 28 00:00:00 +0800 2015'.
        # (renamed from `time`, which shadowed the builtin module name)
        bar_time = arrow.get(data.get('time'), 'ddd MMM DD HH:mm:ss Z YYYY')
        result.append(StockHistory(
            code=code, percent=data.get('percent'),
            ma5=data.get('ma5'), ma10=data.get('ma10'), ma30=data.get('ma30'),
            open_price=data.get('open'), high=data.get('high'),
            low=data.get('low'), close=data.get('close'),
            time=bar_time.datetime, timestamp=bar_time.timestamp * 1000,
            volume=data.get('volume'),
            # NOTE: indices carry no turnover rate; .get() yields None then.
            turn_rate=data.get('turnrate')))
    # (removed: an unused DataFrame plus max/min/mean turnover & volume
    # computations whose results were never read)
    return result
def parse_sw_history2(begin_date='2014-03-12', end_date=None, code='801150'):
    """Fetch daily history rows for a single SW index as a DataFrame.

    Pages through www.swsindex.com's handler.aspx (one row per page) until an
    empty page comes back, drops rows with blank PE/PB, parses BargainDate,
    and returns the frame with PE/PB cast to float.
    """
    if end_date is None:
        end_date = str(arrow.now().date())
    where = ("swindexcode='{}' and BargainDate>='{}' and BargainDate<='{}' "
             "and type='Day'").format(code, begin_date, end_date)
    url = 'http://www.swsindex.com/handler.aspx'
    rows = []
    for page in range(1, 1000):
        payload = {
            'tablename': 'V_Report',
            'key': 'id',
            'p': page,
            'where': where,
            'orderby': 'swindexcode asc,BargainDate_1',
            'fieldlist': 'SwIndexCode,SwIndexName,BargainDate,CloseIndex,BargainAmount,Markup,'
                         'TurnoverRate,PE,PB,MeanPrice,BargainSumRate,DP',
            'pagecount': 1,
            'timed': 1456667319778,
        }
        reply = requests.post(url, data=payload)
        # The endpoint returns pseudo-JSON quoted with single quotes.
        page_rows = json.loads(reply.text.replace('\'', '\"')).get('root')
        if len(page_rows) == 0:
            break
        rows.extend(page_rows)
    df = DataFrame(rows)
    # Clean rows where the source left PE or PB empty, then coerce types.
    df = df[df['PE'] != '']
    df = df[df['PB'] != '']
    df['BargainDate'] = pd.to_datetime(df['BargainDate'])
    df[['PE', 'PB']] = df[['PE', 'PB']].astype(float)
    print(df)
    df_sort_pe = df.sort_values(by='PE', ascending=True)
    df_sort_pb = df.sort_values(by='PB', ascending=True)
    return df
# Default universe of A/H dual-listed pairs: (A-share code, H-share code).
_AH_SAMPLE_PAIRS = [
    ('600585', '00914'), ('601318', '02318'), ('000002', '02202'),
    ('600036', '03968'), ('600600', '00168'), ('600196', '02196'),
    ('600030', '06030'), ('600028', '00386'), ('601601', '02601'),
    ('601628', '02628'), ('000063', '00763'), ('601398', '01398'),
    ('601939', '00939'), ('601288', '01288'), ('600837', '06837'),
    ('601607', '02607'), ('600011', '00902'), ('002202', '02208'),
    ('601988', '03988'), ('601818', '06818'), ('601336', '01336'),
    ('600027', '01071'), ('601088', '01088'), ('601328', '03328'),
    ('600016', '01988'), ('601998', '00998'), ('601186', '01186'),
    ('600332', '00874'), ('601766', '01766'), ('002594', '01211'),
    ('601857', '00857'), ('000039', '02039'), ('600362', '00358'),
    ('600012', '00995'), ('601633', '02333'), ('601800', '01800'),
    ('601333', '00525'), ('601111', '00753'), ('600875', '01072'),
    ('601390', '00390'), ('601898', '01898'), ('601899', '02899'),
    ('000898', '00347'), ('000157', '01157'), ('600685', '00317'),
    ('601992', '02009'), ('601600', '02600'), ('601991', '00991'),
    ('600115', '00670'), ('601808', '02883'), ('600871', '01033'),
    ('601727', '02727'), ('600188', '01171'), ('601238', '02238'),
    ('601919', '01919'), ('601866', '02866'), ('601618', '01618'),
    ('600026', '01138'), ('601880', '02880'), ('600874', '01065'),
    ('600660', '03606'), ('600377', '00177'), ('000776', '01776'),
    ('601688', '06886'), ('000338', '02338'), ('600029', '01055'),
    ('603993', '03993'), ('601005', '01053'), ('600688', '00338'),
    ('600548', '00548'), ('002672', '00895'), ('000513', '01513'),
    ('000488', '01812'), ('601107', '00107'), ('601588', '00588'),
    ('600808', '00323'), ('000921', '00921'), ('600775', '00553'),
    ('600860', '00187'), ('000756', '00719'), ('601038', '00038'),
    ('600806', '00300'), ('002490', '00568'), ('002703', '01057'),
    ('600876', '01108'), ('601717', '00564'), ('000585', '00042')]


def ah_premium_index(samples=None):
    """Compute the mean A/H premium across dual-listed share pairs.

    Fixes two defects of the previous version: the mutable default-argument
    list, and the fact that any caller-supplied ``samples`` was silently
    discarded (the body unconditionally overwrote it with the full list).

    Parameters
    ----------
    samples : list of (str, str) or None
        Pairs of (A-share code, H-share code); defaults to _AH_SAMPLE_PAIRS.

    Returns
    -------
    AhIndex
        Wrapping the mean of price ratios over pairs for which ah_ratio()
        produced data.
    """
    if samples is None:
        samples = _AH_SAMPLE_PAIRS
    a_list = []
    h_list = []
    price_a_list = []
    price_h_list = []
    ratio_list = []
    # rmb_exchange_rate() returns HKD per 100 RMB-style quote; normalise.
    hk_to_rmb = float(rmb_exchange_rate()[0]) / 100
    for sample in samples:
        ratio = ah_ratio(hk_to_rmb, sample)
        if ratio:
            a_list.append(sample[0])
            h_list.append(sample[1])
            price_a_list.append(ratio.get('price_a'))
            price_h_list.append(ratio.get('price_h'))
            ratio_list.append(ratio.get('ratio'))
    df = DataFrame({'A': a_list, 'Price_A': price_a_list,
                    'H': h_list, 'Price_H': price_h_list,
                    'ratio': ratio_list})
    # fix: DataFrame.sort(columns=...) was removed in pandas 0.20;
    # sort_values is the supported API (matches parse_sw_history2).
    df = df.sort_values(by='ratio', ascending=True)
    ah_index = df['ratio'].mean()
    return AhIndex(ah_index)
def parse_sw_history(begin_date='2014-03-12', end_date=None, codes=None):
    """Download daily history for a set of Shenwan (SW) industry indices.

    Pages through www.swsindex.com's handler.aspx until an empty page is
    returned, then builds a DataFrame with PE/PB converted to float.

    Parameters
    ----------
    begin_date, end_date : str
        'YYYY-MM-DD' bounds; end_date defaults to today.
    codes : tuple of str or None
        SW index codes; defaults to the 34 level-1 industry indices.

    Returns
    -------
    pandas.DataFrame
        All fetched rows, sorted ascending by PB (the final sort wins).
    """
    if end_date is None:
        now = arrow.now()
        end_date = str(now.date())
    if codes is None:
        codes = ('801010', '801020', '801030', '801040', '801050', '801060',
                 '801070', '801080', '801090', '801100', '801110', '801120',
                 '801130', '801140', '801150', '801160', '801170', '801180',
                 '801190', '801200', '801210', '801220', '801230', '801710',
                 '801720', '801730', '801740', '801750', '801760', '801770',
                 '801780', '801790', '801880', '801890')
    # str(tuple) renders as ('801010', ...) which this endpoint accepts.
    condition = "swindexcode in {} and BargainDate>='{}' and BargainDate<='{}'"
    where = condition.format(codes, begin_date, end_date)
    all_data = []
    for index in range(1, 1000):  # hard page cap as a safety net
        payload = {'tablename': 'swindexhistory', 'key': 'id', 'p': index,
                   'where': where,
                   'orderby': 'swindexcode asc,BargainDate_1',
                   'fieldlist': 'SwIndexCode,SwIndexName,BargainDate,CloseIndex,BargainAmount,Markup,'
                                'TurnoverRate,PE,PB,MeanPrice,BargainSumRate,DP',
                   'pagecount': 28, 'timed': 1453385628267}
        url = 'http://www.swsindex.com/handler.aspx'
        res = requests.post(url, data=payload)
        # The endpoint returns pseudo-JSON quoted with single quotes.
        data = res.text.replace('\'', '\"')
        result = json.loads(data)
        data_list = result.get('root')
        if len(data_list) == 0:
            break
        all_data.extend(data_list)
    df = DataFrame(all_data)
    df[['PE', 'PB']] = df[['PE', 'PB']].astype(float)
    # fix: DataFrame.sort(columns=...) was removed in pandas 0.20;
    # sort_values is the supported API (matches parse_sw_history2).
    df = df.sort_values(by='PE', ascending=True)
    df = df.sort_values(by='PB', ascending=True)
    return df
def read_index2(code='000905'):
    """Download a CSIndex performance .xls and upsert its rows into Index.

    Parameters
    ----------
    code : str
        CSI index code; must be one of '000300', '000905', '000016'.

    Raises
    ------
    ValueError
        For an unsupported code (the previous version crashed later with
        UnboundLocalError on ``name`` instead).
    """
    url = 'http://www.csindex.com.cn/uploads/file/autofile/perf/{}perf.xls'.format(
        code)
    book = get_excel_book(url)
    names = {'000300': '沪深300', '000905': '中证500', '000016': '上证50'}
    if code not in names:
        raise ValueError('unsupported index code: {}'.format(code))
    name = names[code]
    for sheet in range(book.nsheets):
        sh = book.sheet_by_index(sheet)
        for rx in range(sh.nrows):
            row = sh.row(rx)
            # Data rows are wide; header/short rows are skipped here.
            if len(row) > 15:
                date = row[0].value
                pe1 = row[15].value           # static P/E
                pe2 = row[16].value           # trailing-twelve-month P/E
                dividend_yield_ratio1 = row[17].value
                turnover = row[13].value
                # Real data rows carry the date as an Excel float serial.
                if date and pe1 and isinstance(pe1, float):
                    py_date = xlrd.xldate.xldate_as_datetime(
                        date, book.datemode)
                    date = str(py_date)
                    Index.objects(name=name, date=date).update_one(
                        name=name, date=date, pe=pe1, pe_ttm=pe2,
                        dividend_yield_ratio=dividend_yield_ratio1,
                        turnover=turnover, upsert=True)
# Plot mean inside/outside series over time, with +/- 1 std error tubes.
begin = blairInside['time'][0]
end = blairInside['time'][-1]
duration = end - begin
steps = 300
# Overlapping averaging windows, each spanning 2/steps of the whole range.
slices = [(begin + duration * step / steps,
           begin + duration * (step + 2) / steps)
          for step in range(0, steps - 2)]
# Re-center each date in the middle of its averaging window.
# fix: was `lo + (lo - hi) / 2`, which shifted the label *backwards* by half
# a window instead of forwards to the window midpoint (and the tuple unpack
# shadowed the outer `begin`/`end`; renamed to lo/hi).
slicedDates = [lo + (hi - lo) / 2 for (lo, hi) in slices]
idx = pandas.to_datetime(slicedDates, unit='s', utc=True)
df1 = DataFrame(
    {
        'inside': mean_std(blairInside, slices)[0],
        'outside': mean_std(blairOutside, slices)[0]
    },
    index=idx,
    columns=['inside', 'outside'])
df1.plot(kind='line')
plt.gca().xaxis.set_major_formatter(
    matplotlib.dates.DateFormatter('%H:%M', tz=timezone("America/New_York")))


def fill_error_tube(b, color):
    # Shade mean +/- one std for series b over the same windows.
    (mean, error) = mean_std(b, slices)
    plt.fill_between(df1.index, mean - error, mean + error, color=color)


fill_error_tube(blairInside, [0.5, 0.5, 0.5, 0.5])
fill_error_tube(blairOutside, [0.5, 0.5, 0.5, 0.5])
def read_history(code='600036', begin_date=None, end_date=None):
    """Fetch forward-adjusted daily bars from Xueqiu and upsert into Equity.

    Parameters
    ----------
    code : str
        Bare 6-digit A-share code, 5-digit HK code, or 8-char prefixed symbol;
        the market prefix is added for the request, while Equity rows keep
        the original ``code``.
    begin_date, end_date : str or None
        Anything arrow.get() accepts; default 2014-01-01 through now.

    Returns
    -------
    list
        Always empty; kept so existing callers that iterate the result keep
        working (the previous version also only returned an empty list —
        every append was commented out).
    """
    begin = arrow.get('2014-01-01') if begin_date is None else arrow.get(begin_date)
    end = arrow.now() if end_date is None else arrow.get(end_date)
    # Normalise the request symbol; 8 chars already carries a market prefix.
    code2 = code
    if len(code) == 8:
        pass
    elif code.startswith('60') or code.startswith('51'):
        code2 = 'SH' + code
    elif len(code) == 5:
        code2 = 'HK' + code
    elif len(code) == 6:
        code2 = 'SZ' + code
    url = '{}/stock/forchartk/stocklist.json?symbol={}&period=1day&type=before&begin={}&end={}'
    # arrow < 1.0: .timestamp is a property in seconds; Xueqiu wants ms.
    url = url.format(api_home, code2, begin.timestamp * 1000,
                     end.timestamp * 1000)
    payload = {'access_token': xq_a_token}
    r = requests.get(url, params=payload, headers=headers)
    data_list = r.json().get('chartlist')
    result = []
    for data in data_list:
        # 'time' arrives like 'Mon Sep 28 00:00:00 +0800 2015'.
        # (renamed from `time`, which shadowed the builtin module name)
        bar_time = arrow.get(data.get('time'), 'ddd MMM DD HH:mm:ss Z YYYY')
        date = bar_time.format('YYYY-MM-DD')
        Equity.objects(code=code, date=date).update_one(
            percent=data.get('percent'), open=data.get('open'),
            high=data.get('high'), low=data.get('low'),
            close=data.get('close'), volume=data.get('volume'), upsert=True)
    # (removed: a large block of commented-out StockHistory/Equity code,
    # unused nh/nl flags, and unused DataFrame turnover/volume statistics)
    return result
# Build one series per diff bucket: seriesDict[bucket][run] -> average of the
# chosen metric over that bucket's queries.
seriesDict = {key: dict() for key in queriesDiff}
for run in datas:
    data = datas[run]
    # Overall mean / standard error across all queries for this run.
    # NOTE(review): mean and stderr are not used below in this chunk —
    # presumably consumed elsewhere or leftover; kept as-is.
    mean = np.average([data[key] for key in queries])
    stderr = np.std([data[key] for key in queries]) / sqrt(len(queries))
    for (label, queriesByD) in queriesDiff.items():
        # queriesByD is a list of (query, diff) pairs; only the query is used.
        seriesDict[label][run] = np.average(
            [data[key] for (key, x) in queriesByD])
print("dropping queries because of NaN values: " +
      " ".join(queriesWithNanValues))
# Rows = runs, columns = difficulty buckets (fixed display order).
df1 = DataFrame(seriesDict,
                columns=("0%-5%", "5%-25%", '25%-50%', '50%-75%', '75%-95%',
                         '95%-100%'),
                index=args.runs)
df2 = df1
# Use the basename of each run path as its label, then transpose so the
# buckets become the bar groups.
df2.index = [os.path.basename(label) for label in df1.index]
df3 = df2.transpose()
plt.figure()
df3.plot(kind='bar', label=args.metric,
         color=['0.0', '0.80', '0.4', '0.9', '0.70'])
leg = plt.legend(loc='best', fancybox=True)
leg.get_frame().set_alpha(0.5)  # semi-transparent legend box
plt.tick_params(axis='both', which='major', labelsize=11)
plt.xticks(rotation=0)
plt.ylabel(args.metric, fontsize=20)
mapping_doors = {'2': 0, '3': 1, '4': 2, '5more': 3} mapping_persons = {'2': 0, '4': 1, 'more': 2} mapping_lug = {'small': 0, 'med': 1, 'big': 2} mapping_safety = {'low': 0, 'med': 1, 'high': 2} mapping_class = {'unacc': 1, 'acc': 2, 'good': 3, 'vgood': 4} df['maint'] = df['maint'].map(mapping_buy_maint) df['buying'] = df['buying'].map(mapping_buy_maint) df['doors'] = df['doors'].map(mapping_doors) df['persons'] = df['persons'].map(mapping_persons) df['lug_boot'] = df['lug_boot'].map(mapping_lug) df['safety'] = df['safety'].map(mapping_safety) df['class'] = df['class'].map(mapping_class).astype(int) df = df.reset_index(drop=True) labels_df = DataFrame() labels_df['cat'] = df['class'].copy() features_df = df.copy() features_df = features_df.drop('class', axis=1) c45 = C45Constructor(cf=0.95) cart = CARTConstructor(max_depth=12, min_samples_leaf=2) quest = QuestConstructor(default=1, max_nr_nodes=1, discrete_thresh=10, alpha=0.99) tree_constructors = [c45, cart, quest] tree_confusion_matrices = {} for tree_constructor in tree_constructors:
def plot_output(name, infile_path, model_names, filter):
    """
    Reads predictions from csv files and generates plots and output csv.

    Input csv files should be in the ``infile_path`` with following structure:

    ``infile_path`` /
        ../any_name/
            ../config.csv, test_.csv, train_.csv
        ../any_name2
            ../config.csv, test_.csv, train_.csv

    The function also exports the data used to generate graphs as csv files
    into ../graph_data; these csv files can be used to reproduce outputs.

    Parameters
    ----------
    name : string
        name of the csv files to which data will be exported
    infile_path : string
        the folder which contains csv for configs and test and train
    model_names : list
        name of the sub-directories in ``infile_path`` to consider
    filter : callable
        applied to each config to decide whether it is considered, e.g.
        ``lambda x: x['method'] == 'full'``; ``None`` keeps everything.
        (NOTE: the parameter name shadows the builtin; kept for API
        compatibility.)
    """
    graphs = {'SSE': {}, 'MSSE': {}, 'NLPD': {}, 'ER': {}, 'intensity': {}}
    graph_n = {}  # config label -> number of model runs aggregated under it
    for m in model_names:
        data_config = PlotOutput.read_config(
            infile_path + m + '/' + model_logging.CONFIG_FILE_NAME)
        if filter is None or filter(data_config):
            data_test = pandas.read_csv(
                infile_path + m + '/' + model_logging.PREDICTIONS_FILE_NAME)
            # Output dimensionality = number of true_Y* columns.
            dim = 0
            for element in data_test.columns:
                if element.startswith('true_Y'):
                    dim += 1
            data_train = pandas.read_csv(infile_path + m + '/' +
                                         model_logging.TRAINING_FILE_NAME)
            Y_mean = data_train['Y_0'].mean()
            Ypred = np.array(
                [data_test['predicted_Y_%d' % (d)] for d in range(dim)])
            Ytrue = np.array(
                [data_test['true_Y_%d' % (d)] for d in range(dim)])
            Yvar = np.array([
                data_test['predicted_variance_%d' % (d)] for d in range(dim)
            ])
            # fix: hoisted the repeated config_to_str calls; `not in` instead
            # of scanning .keys().
            config_label = PlotOutput.config_to_str(data_config)
            if config_label not in graph_n:
                graph_n[config_label] = 0
            graph_n[config_label] += 1
            if data_config['ll'] in [CogLL.__name__]:
                # Multi-output: one MSSE/NLPD series per output dimension.
                for i in range(Ytrue.shape[0]):
                    Y_mean = data_train['Y_' + str(i)].mean()
                    PlotOutput.add_to_list(
                        graphs['MSSE'], config_label + '_' + str(i),
                        ((Ypred[i] - Ytrue[i])**2).mean() /
                        ((Y_mean - Ytrue[i])**2).mean())
                    NLPD = np.array(data_test['NLPD_' + str(i)])
                    PlotOutput.add_to_list(
                        graphs['NLPD'], config_label + '_' + str(i), NLPD)
            if data_config['ll'] in [
                    UnivariateGaussian.__name__, WarpLL.__name__
            ]:
                NLPD = np.array(data_test['NLPD_0'])
                PlotOutput.add_to_list(
                    graphs['SSE'], config_label,
                    (Ypred[0] - Ytrue[0])**2 / ((Y_mean - Ytrue[0])**2).mean())
                PlotOutput.add_to_list(graphs['NLPD'], config_label, NLPD)
            if data_config['ll'] in [LogisticLL.__name__]:
                NLPD = np.array(data_test['NLPD_0'])
                # Error rate: predicted probability on the wrong side of 0.5.
                PlotOutput.add_to_list(
                    graphs['ER'], config_label,
                    np.array([
                        (((Ypred[0] > 0.5) & (Ytrue[0] == -1)) |
                         ((Ypred[0] < 0.5) & (Ytrue[0] == 1))).mean()
                    ]))
                PlotOutput.add_to_list(graphs['NLPD'], config_label, NLPD)
            if data_config['ll'] in [SoftmaxLL.__name__]:
                NLPD = np.array(data_test['NLPD_0'])
                PlotOutput.add_to_list(
                    graphs['ER'], config_label,
                    np.array([(np.argmax(Ytrue, axis=0) != np.argmax(
                        Ypred, axis=0)).mean()]))
                PlotOutput.add_to_list(graphs['NLPD'], config_label, NLPD)
            if data_config['ll'] in [LogGaussianCox.__name__]:
                X0 = np.array([data_test['X_0']])
                # Columns: decimal year (offset presumably dataset-specific
                # — TODO confirm 1851.2026), mean, variance, truth.
                PlotOutput.add_to_list(
                    graphs['intensity'], config_label,
                    np.array([
                        X0[0, :] / 365 + 1851.2026, Ypred[0, :], Yvar[0, :],
                        Ytrue[0, :]
                    ]).T)
    # fix: iteritems()/print-statement were Python 2 only; the rest of the
    # file uses Python 3 print().
    for n, g in graphs.items():
        if g:
            ion()
            for k in g.keys():
                if k in graph_n:
                    print(k, 'n: ', graph_n[k])
            if n in ['SSE', 'NLPD']:
                g = DataFrame({k: Series(v) for k, v in g.items()})
                ax = g.plot(kind='box', title=n)
                check_dir_exists('../graph_data/')
                g.to_csv('../graph_data/' + name + '_' + n + '_data.csv',
                         index=False)
            if n in ['ER', 'MSSE']:
                g = DataFrame({k: Series(v) for k, v in g.items()})
                check_dir_exists('../graph_data/')
                g.to_csv('../graph_data/' + name + '_' + n + '_data.csv',
                         index=False)
                m = g.mean()
                errors = g.std()
                ax = m.plot(kind='bar', yerr=errors, title=n)
                patches, labels = ax.get_legend_handles_labels()
                ax.legend(patches, labels, loc='lower center')
            if n in ['intensity']:
                # fix: dict views are not indexable in Python 3.
                first = list(g.values())[0]
                X = first[:, 0]
                true_data = DataFrame({'x': X, 'y': first[:, 3]})
                true_data.to_csv('../graph_data/' + name + '_' + 'true_y' +
                                 '_data.csv', index=False)
                plt.figure()
                check_dir_exists('../graph_data/')
                graph_data = DataFrame()
                for k, v in g.items():
                    graph_data = graph_data.append(
                        DataFrame({
                            'x': X,
                            'm': v[:, 1],
                            'v': v[:, 2],
                            'model_sp': [k] * X.shape[0]
                        }))
                plt.legend(loc='upper center')
                graph_data.to_csv('../graph_data/' + name + '_' + n +
                                  '_data.csv', index=False)
    show(block=True)
print("feature-column.py metric=" + args.metric + " out=" + args.out)


def read_ssv(fname):
    """Read a whitespace-separated eval file.

    For trec_eval format, rows are reordered to galago_eval column order
    (query id first, run label second).
    """
    # fix: the original `open(fname, 'r')` was never closed (handle leak).
    with open(fname, 'r') as f:
        lines = [line.split() for line in f]
    if args.format.lower() == 'galago_eval':
        return lines
    elif args.format.lower() == 'trec_eval':
        return [[line[1], line[0]] + line[2:] for line in lines]


namestsv = read_ssv(args.names)
# Map query id -> display label (dropping an 8-char prefix from column 2).
namesDict = {row[0]: row[2][8:] for row in namestsv}
for run in args.runs:
    # fix: each run file was read twice (once for values, once for labels);
    # one read supplies both.
    tsv = read_ssv(run)
    values = [float(row[2]) for row in tsv if row[0] in namesDict]
    labels = [namesDict[row[0]] for row in tsv if row[0] in namesDict]
    df2 = DataFrame(values, index=labels, columns=[os.path.basename(run)])
    plt.figure()
    df2.plot(kind='bar', color=['1.0', '0.70', '0.0', '0.50'])
    plt.ylabel(args.metric, fontsize=20)
    plt.tick_params(axis='both', which='major', labelsize=10)
    plt.xticks(rotation=90)
    plt.savefig(args.out + os.path.basename(run) + '.pdf',
                bbox_inches='tight')
    # plt.show()
seriesDict['mean'][run] = mean seriesDict['stderr'][run] = stderr print("dropping queries because of NaN values: " + " ".join(queriesWithNanValues)) print('\t'.join(['run', 'mean/stderr'])) for run in datas: if not run == args.runs[0]: print('\t'.join([ run, str(seriesDict['mean'][run]), str(seriesDict['stderr'][run]) ])) df1 = DataFrame(seriesDict, index=args.runs) df2 = df1['mean'] df2.index = [os.path.basename(label) for label in df1.index] plt.figure() df2.plot(kind='bar', yerr=df1['stderr'], color=[ '0.0', '0.6', '0.4', '0.8', '0.4', '0.8', '0.4', '0.8', '0.4', '0.8' ]) plt.ylabel(args.metric, fontsize=20) plt.tick_params(axis='both', which='major', labelsize=20) plt.xticks(rotation=90) plt.savefig(args.out, bbox_inches='tight')