def parse_sw_history2(begin_date='2014-03-12', end_date=None, code='801150'):
    """Download daily valuation history for one SW (申万) index.

    Pages through www.swsindex.com's handler endpoint and assembles the
    rows into a cleaned DataFrame.

    Parameters
    ----------
    begin_date : str
        Inclusive start date, 'YYYY-MM-DD'.
    end_date : str or None
        Inclusive end date; defaults to today's date.
    code : str
        SW index code, e.g. '801150'.

    Returns
    -------
    pandas.DataFrame or None
        Frame with float PE/PB and datetime BargainDate; None when the
        response carries no 'PE' column.
    """
    if end_date is None:
        end_date = str(arrow.now().date())
    condition = 'swindexcode=\'{}\' and BargainDate>=\'{}\' and BargainDate<=\'{}\' and type=\'Day\''
    where = condition.format(code, begin_date, end_date)
    url = 'http://www.swsindex.com/handler.aspx'
    all_data = []
    # Page through the result set; stop when the server returns no rows.
    for page in range(1, 1000):
        payload = {'tablename': 'V_Report',
                   'key': 'id',
                   'p': page,
                   'where': where,
                   'orderby': 'swindexcode asc,BargainDate_1',
                   'fieldlist': 'SwIndexCode,SwIndexName,BargainDate,CloseIndex,BargainAmount,Markup,'
                                'TurnoverRate,PE,PB,MeanPrice,BargainSumRate,DP',
                   'pagecount': 993,
                   'timed': 1456667319778
                   }
        res = requests.post(url, data=payload)
        # The endpoint answers with single-quoted pseudo-JSON.
        data = res.text.replace('\'', '\"')
        result = json.loads(data)
        # BUGFIX: 'root' may be absent (None); the old code crashed on len(None).
        data_list = result.get('root') or []
        if not data_list:
            break
        all_data.extend(data_list)
    df = DataFrame(all_data)
    if 'PE' not in df:
        return
    # Drop rows whose PE/PB came back as empty strings before converting.
    df = df[df['PE'] != '']
    df = df[df['PB'] != '']
    # Convert string columns to proper dtypes.
    df['BargainDate'] = pd.to_datetime(df['BargainDate'])
    df[['PE', 'PB']] = df[['PE', 'PB']].astype(float)
    return df
def parse_sw_history2(begin_date='2014-03-12', end_date=None, code='801150'):
    """Download daily valuation history for one SW (申万) index.

    Variant that requests one row per page ('pagecount': 1) from
    www.swsindex.com's handler endpoint.

    Parameters
    ----------
    begin_date : str
        Inclusive start date, 'YYYY-MM-DD'.
    end_date : str or None
        Inclusive end date; defaults to today's date.
    code : str
        SW index code, e.g. '801150'.

    Returns
    -------
    pandas.DataFrame or None
        Frame with float PE/PB and datetime BargainDate; None when the
        response carries no 'PE' column.
    """
    if end_date is None:
        end_date = str(arrow.now().date())
    condition = 'swindexcode=\'{}\' and BargainDate>=\'{}\' and BargainDate<=\'{}\' and type=\'Day\''
    where = condition.format(code, begin_date, end_date)
    url = 'http://www.swsindex.com/handler.aspx'
    all_data = []
    for page in range(1, 1000):
        payload = {'tablename': 'V_Report',
                   'key': 'id',
                   'p': page,
                   'where': where,
                   'orderby': 'swindexcode asc,BargainDate_1',
                   'fieldlist': 'SwIndexCode,SwIndexName,BargainDate,CloseIndex,BargainAmount,Markup,'
                                'TurnoverRate,PE,PB,MeanPrice,BargainSumRate,DP',
                   'pagecount': 1,
                   'timed': 1456667319778
                   }
        res = requests.post(url, data=payload)
        # The endpoint answers with single-quoted pseudo-JSON.
        data = res.text.replace('\'', '\"')
        result = json.loads(data)
        # BUGFIX: 'root' may be absent (None); the old code crashed on len(None).
        data_list = result.get('root') or []
        if not data_list:
            break
        all_data.extend(data_list)
    df = DataFrame(all_data)
    # BUGFIX: guard against a missing 'PE' column (consistent with the
    # sibling implementation); without it the filter below raises KeyError
    # when the server returned nothing usable.
    if 'PE' not in df:
        return
    df = df[df['PE'] != '']
    df = df[df['PB'] != '']
    df['BargainDate'] = pd.to_datetime(df['BargainDate'])
    df[['PE', 'PB']] = df[['PE', 'PB']].astype(float)
    return df
def ah_premium_index(samples=[('600036', '03968'), ('600196', '02196'), ('601111', '00753')]):
    """Compute an A/H premium index over a basket of dual-listed stocks.

    NOTE: the ``samples`` parameter is immediately overwritten by the full
    hard-coded pair list below; it is kept only so the signature stays
    backward compatible.

    Returns
    -------
    AhIndex
        Wrapper around the mean A-price / H-price ratio across the basket.
    """
    # (A-share code, H-share code) pairs.
    samples = [('600585', '00914'), ('601318', '02318'), ('000002', '02202'), ('600036', '03968'),
               ('600600', '00168'), ('600196', '02196'), ('600030', '06030'), ('600028', '00386'),
               ('601601', '02601'), ('601628', '02628'), ('000063', '00763'), ('601398', '01398'),
               ('601939', '00939'), ('601288', '01288'), ('600837', '06837'), ('601607', '02607'),
               ('600011', '00902'), ('002202', '02208'), ('601988', '03988'), ('601818', '06818'),
               ('601336', '01336'), ('600027', '01071'), ('601088', '01088'), ('601328', '03328'),
               ('600016', '01988'), ('601998', '00998'), ('601186', '01186'), ('600332', '00874'),
               ('601766', '01766'), ('002594', '01211'), ('601857', '00857'), ('000039', '02039'),
               ('600362', '00358'), ('600012', '00995'), ('601633', '02333'), ('601800', '01800'),
               ('601333', '00525'), ('601111', '00753'), ('600875', '01072'), ('601390', '00390'),
               ('601898', '01898'), ('601899', '02899'), ('000898', '00347'), ('000157', '01157'),
               ('600685', '00317'), ('601992', '02009'), ('601600', '02600'), ('601991', '00991'),
               ('600115', '00670'), ('601808', '02883'), ('600871', '01033'), ('601727', '02727'),
               ('600188', '01171'), ('601238', '02238'), ('601919', '01919'), ('601866', '02866'),
               ('601618', '01618'), ('600026', '01138'), ('601880', '02880'), ('600874', '01065'),
               ('600660', '03606'), ('600377', '00177'), ('000776', '01776'), ('601688', '06886'),
               ('000338', '02338'), ('600029', '01055'), ('603993', '03993'), ('601005', '01053'),
               ('600688', '00338'), ('600548', '00548'), ('002672', '00895'), ('000513', '01513'),
               ('000488', '01812'), ('601107', '00107'), ('601588', '00588'), ('600808', '00323'),
               ('000921', '00921'), ('600775', '00553'), ('600860', '00187'), ('000756', '00719'),
               ('601038', '00038'), ('600806', '00300'), ('002490', '00568'), ('002703', '01057'),
               ('600876', '01108'), ('601717', '00564'), ('000585', '00042')]
    a_list = []
    h_list = []
    price_a_list = []
    price_h_list = []
    ratio_list = []
    # NOTE(review): rate appears to be quoted per 100 RMB, hence /100 — confirm
    # against rmb_exchange_rate().
    hk_to_rmb = float(rmb_exchange_rate()[0]) / 100
    for sample in samples:
        ratio = ah_ratio(hk_to_rmb, sample)
        if ratio:
            a_list.append(sample[0])
            h_list.append(sample[1])
            price_a_list.append(ratio.get('price_a'))
            price_h_list.append(ratio.get('price_h'))
            ratio_list.append(ratio.get('ratio'))
    df_dict = {'A': a_list, 'Price_A': price_a_list,
               'H': h_list, 'Price_H': price_h_list,
               'ratio': ratio_list}
    df = DataFrame(df_dict)
    # BUGFIX: DataFrame.sort(columns=...) was removed in pandas 0.20;
    # sort_values is the supported replacement.
    df = df.sort_values(by='ratio', ascending=True)
    ah_index = df['ratio'].mean()
    return AhIndex(ah_index)
def ah_premium_index(samples=[('600036', '03968'), ('600196', '02196'), ('601111', '00753')]):
    """Compute an A/H premium index over a basket of dual-listed stocks
    (verbose variant that prints intermediate results).

    NOTE: the ``samples`` parameter is immediately overwritten by the full
    hard-coded pair list below; it is kept only so the signature stays
    backward compatible.

    Returns
    -------
    AhIndex
        Wrapper around the mean A-price / H-price ratio across the basket.
    """
    # (A-share code, H-share code) pairs.
    samples = [('600585', '00914'), ('601318', '02318'), ('000002', '02202'), ('600036', '03968'),
               ('600600', '00168'), ('600196', '02196'), ('600030', '06030'), ('600028', '00386'),
               ('601601', '02601'), ('601628', '02628'), ('000063', '00763'), ('601398', '01398'),
               ('601939', '00939'), ('601288', '01288'), ('600837', '06837'), ('601607', '02607'),
               ('600011', '00902'), ('002202', '02208'), ('601988', '03988'), ('601818', '06818'),
               ('601336', '01336'), ('600027', '01071'), ('601088', '01088'), ('601328', '03328'),
               ('600016', '01988'), ('601998', '00998'), ('601186', '01186'), ('600332', '00874'),
               ('601766', '01766'), ('002594', '01211'), ('601857', '00857'), ('000039', '02039'),
               ('600362', '00358'), ('600012', '00995'), ('601633', '02333'), ('601800', '01800'),
               ('601333', '00525'), ('601111', '00753'), ('600875', '01072'), ('601390', '00390'),
               ('601898', '01898'), ('601899', '02899'), ('000898', '00347'), ('000157', '01157'),
               ('600685', '00317'), ('601992', '02009'), ('601600', '02600'), ('601991', '00991'),
               ('600115', '00670'), ('601808', '02883'), ('600871', '01033'), ('601727', '02727'),
               ('600188', '01171'), ('601238', '02238'), ('601919', '01919'), ('601866', '02866'),
               ('601618', '01618'), ('600026', '01138'), ('601880', '02880'), ('600874', '01065'),
               ('600660', '03606'), ('600377', '00177'), ('000776', '01776'), ('601688', '06886'),
               ('000338', '02338'), ('600029', '01055'), ('603993', '03993'), ('601005', '01053'),
               ('600688', '00338'), ('600548', '00548'), ('002672', '00895'), ('000513', '01513'),
               ('000488', '01812'), ('601107', '00107'), ('601588', '00588'), ('600808', '00323'),
               ('000921', '00921'), ('600775', '00553'), ('600860', '00187'), ('000756', '00719'),
               ('601038', '00038'), ('600806', '00300'), ('002490', '00568'), ('002703', '01057'),
               ('600876', '01108'), ('601717', '00564'), ('000585', '00042')]
    a_list = []
    h_list = []
    price_a_list = []
    price_h_list = []
    ratio_list = []
    # NOTE(review): rate appears to be quoted per 100 RMB, hence /100 — confirm
    # against rmb_exchange_rate().
    hk_to_rmb = float(rmb_exchange_rate()[0]) / 100
    for sample in samples:
        ratio = ah_ratio(hk_to_rmb, sample)
        if ratio:
            a_list.append(sample[0])
            h_list.append(sample[1])
            price_a_list.append(ratio.get('price_a'))
            price_h_list.append(ratio.get('price_h'))
            ratio_list.append(ratio.get('ratio'))
    df_dict = {'A': a_list, 'Price_A': price_a_list,
               'H': h_list, 'Price_H': price_h_list,
               'ratio': ratio_list}
    # BUGFIX: converted Python 2 print statements to print() calls — the
    # rest of the file already uses the function form.
    print(df_dict)
    df = DataFrame(df_dict)
    # BUGFIX: DataFrame.sort(columns=...) was removed in pandas 0.20;
    # sort_values is the supported replacement.
    df = df.sort_values(by='ratio', ascending=True)
    print(df)
    ah_index = df['ratio'].mean()
    print('ah_index:{}'.format(ah_index))
    print('discount stock:{}'.format(df[df.ratio < 1]))
    return AhIndex(ah_index)
def parse_sw_history(begin_date='2014-03-12', end_date=None, codes=None):
    """Download daily history for a set of SW industry indexes (verbose
    variant that prints progress and summary statistics).

    Parameters
    ----------
    begin_date : str
        Inclusive start date, 'YYYY-MM-DD'.
    end_date : str or None
        Inclusive end date; defaults to today's date.
    codes : tuple or None
        SW index codes; defaults to the full level-1 industry list.

    Returns
    -------
    pandas.DataFrame
        All fetched rows with float PE/PB, sorted by PB ascending.
    """
    if end_date is None:
        end_date = str(arrow.now().date())
    if codes is None:
        codes = ('801010', '801020', '801030', '801040', '801050', '801060', '801070',
                 '801080', '801090', '801100', '801110', '801120', '801130', '801140',
                 '801150', '801160', '801170', '801180', '801190', '801200', '801210',
                 '801220', '801230', '801710', '801720', '801730', '801740', '801750',
                 '801760', '801770', '801780', '801790', '801880', '801890')
    condition = 'swindexcode in {} and BargainDate>=\'{}\' and BargainDate<=\'{}\''
    where = condition.format(codes, begin_date, end_date)
    # BUGFIX: Python 2 print statements converted to print() calls — the
    # rest of the file already uses the function form.
    print(where)
    url = 'http://www.swsindex.com/handler.aspx'
    all_data = []
    for page in range(1, 1000):
        payload = {'tablename': 'swindexhistory',
                   'key': 'id',
                   'p': page,
                   'where': where,
                   'orderby': 'swindexcode asc,BargainDate_1',
                   'fieldlist': 'SwIndexCode,SwIndexName,BargainDate,CloseIndex,BargainAmount,Markup,'
                                'TurnoverRate,PE,PB,MeanPrice,BargainSumRate,DP',
                   'pagecount': 28,
                   'timed': 1453385628267
                   }
        res = requests.post(url, data=payload)
        # The endpoint answers with single-quoted pseudo-JSON.
        data = res.text.replace('\'', '\"')
        result = json.loads(data)
        # BUGFIX: 'root' may be absent (None); the old code crashed on len(None).
        data_list = result.get('root') or []
        print('url****' + url)
        print(len(data_list))
        if not data_list:
            break
        all_data.extend(data_list)
    df = DataFrame(all_data)
    df[['PE', 'PB']] = df[['PE', 'PB']].astype(float)
    print('*' * 20)
    print(len(df))
    print(df)
    # BUGFIX: DataFrame.sort(columns=...) was removed in pandas 0.20.
    df = df.sort_values(by='PE', ascending=True)
    print(df)
    df = df.sort_values(by='PB', ascending=True)
    print(df)
    print('PE mean:{}'.format(df['PE'].mean()))
    print('PB mean:{}'.format(df['PB'].mean()))
    print('PB<1:{}'.format(df[df.PB < 1]))
    return df
def parse_sw_history(begin_date='2014-03-12', end_date=None, codes=None):
    """Download daily history for a set of SW industry indexes.

    Parameters
    ----------
    begin_date : str
        Inclusive start date, 'YYYY-MM-DD'.
    end_date : str or None
        Inclusive end date; defaults to today's date.
    codes : tuple or None
        SW index codes; defaults to the full level-1 industry list.

    Returns
    -------
    pandas.DataFrame
        All fetched rows with float PE/PB, sorted by PB ascending.
    """
    if end_date is None:
        end_date = str(arrow.now().date())
    if codes is None:
        codes = ('801010', '801020', '801030', '801040', '801050', '801060', '801070',
                 '801080', '801090', '801100', '801110', '801120', '801130', '801140',
                 '801150', '801160', '801170', '801180', '801190', '801200', '801210',
                 '801220', '801230', '801710', '801720', '801730', '801740', '801750',
                 '801760', '801770', '801780', '801790', '801880', '801890')
    condition = 'swindexcode in {} and BargainDate>=\'{}\' and BargainDate<=\'{}\''
    where = condition.format(codes, begin_date, end_date)
    url = 'http://www.swsindex.com/handler.aspx'
    all_data = []
    for page in range(1, 1000):
        payload = {'tablename': 'swindexhistory',
                   'key': 'id',
                   'p': page,
                   'where': where,
                   'orderby': 'swindexcode asc,BargainDate_1',
                   'fieldlist': 'SwIndexCode,SwIndexName,BargainDate,CloseIndex,BargainAmount,Markup,'
                                'TurnoverRate,PE,PB,MeanPrice,BargainSumRate,DP',
                   'pagecount': 28,
                   'timed': 1453385628267
                   }
        res = requests.post(url, data=payload)
        # The endpoint answers with single-quoted pseudo-JSON.
        data = res.text.replace('\'', '\"')
        result = json.loads(data)
        # BUGFIX: 'root' may be absent (None); the old code crashed on len(None).
        data_list = result.get('root') or []
        if not data_list:
            break
        all_data.extend(data_list)
    df = DataFrame(all_data)
    df[['PE', 'PB']] = df[['PE', 'PB']].astype(float)
    # BUGFIX: DataFrame.sort(columns=...) was removed in pandas 0.20;
    # sort_values is the supported replacement (last sort by PB wins).
    df = df.sort_values(by='PE', ascending=True)
    df = df.sort_values(by='PB', ascending=True)
    return df
def xueqiu_history(code='600036', access_token=xq_a_token, begin_date=None, end_date=None):
    """Fetch daily K-line history from Xueqiu and wrap rows as StockHistory.

    Parameters
    ----------
    code : str
        Bare A-share/HK/fund code, or an 8-char already-prefixed symbol.
    access_token : str
        Xueqiu API token.
    begin_date, end_date : int or None
        Epoch-millisecond bounds; default to 2014-01-01 and now.

    Returns
    -------
    list of StockHistory
    """
    if begin_date is None:
        begin_date = arrow.get('2014-01-01').timestamp * 1000
    if end_date is None:
        end_date = arrow.now().timestamp * 1000
    # Normalise the symbol to Xueqiu's exchange-prefixed form.
    if len(code) == 8:
        pass  # already prefixed, e.g. 'SH600036'
    elif code.startswith('60') or code.startswith('51'):
        code = 'SH' + code
    elif len(code) == 5:
        code = 'HK' + code
    elif len(code) == 6:
        code = 'SZ' + code
    url = ('http://xueqiu.com/stock/forchartk/stocklist.json?'
           'symbol={}&period=1day&type=normal&begin={}&end={}&_=1443694358741')
    url = url.format(code, begin_date, end_date)
    payload = {'access_token': access_token}
    r = requests.get(url, params=payload, headers=headers)
    data_list = r.json().get('chartlist')
    result = []
    for data in data_list:
        # Xueqiu reports bar times like 'Mon Jan 05 00:00:00 +0800 2015'.
        bar_time = arrow.get(data.get('time'), 'ddd MMM DD HH:mm:ss Z YYYY')
        history = StockHistory(code=code, percent=data.get('percent'),
                               ma5=data.get('ma5'), ma10=data.get('ma10'), ma30=data.get('ma30'),
                               open_price=data.get('open'), high=data.get('high'), low=data.get('low'),
                               close=data.get('close'), time=bar_time.datetime,
                               timestamp=bar_time.timestamp * 1000,
                               volume=data.get('volume'),
                               # NOTE: indexes have no turnover rate
                               turn_rate=data.get('turnrate'))
        result.append(history)
    # Cleanup: the old code also built a DataFrame and min/max/mean
    # turnover & volume statistics that were never used; removed.
    return result
def read_index2(code='000905'):
    """Import CSIndex valuation history for one index into the Index collection.

    Downloads the official xls performance file for ``code``, walks every
    sheet/row, and upserts Index documents keyed on (name, date).

    Parameters
    ----------
    code : str
        One of '000300' (沪深300), '000905' (中证500), '000016' (上证50).

    Raises
    ------
    ValueError
        If ``code`` is not one of the supported indexes.
    """
    names = {'000300': '沪深300', '000905': '中证500', '000016': '上证50'}
    # BUGFIX: unknown codes used to fall through the if/elif chain and hit
    # an unbound `name` (NameError) on the first data row; fail fast instead.
    if code not in names:
        raise ValueError('unsupported index code: {}'.format(code))
    name = names[code]
    url = 'http://www.csindex.com.cn/uploads/file/autofile/perf/{}perf.xls'.format(
        code)
    book = get_excel_book(url)
    for sheet in range(book.nsheets):
        sh = book.sheet_by_index(sheet)
        for rx in range(sh.nrows):
            row = sh.row(rx)
            # Data rows carry: 0=date, 13=turnover, 15/16=PE (static / TTM),
            # 17=dividend yield. Shorter rows are headers/footers.
            if len(row) > 15:
                date = row[0].value
                pe1 = row[15].value
                pe2 = row[16].value
                dividend_yield_ratio1 = row[17].value
                turnover = row[13].value
                # Real dates arrive as xls serial floats; header rows carry
                # strings, so the isinstance check skips them.
                if date and pe1 and isinstance(pe1, float):
                    py_date = xlrd.xldate.xldate_as_datetime(
                        date, book.datemode)
                    date = str(py_date)
                    Index.objects(name=name, date=date).update_one(
                        name=name, date=date, pe=pe1, pe_ttm=pe2,
                        dividend_yield_ratio=dividend_yield_ratio1,
                        turnover=turnover, upsert=True)
# Fragment of a crime-classification script: builds one 0/1 indicator
# column per crime type from the predicted labels and writes them to csv.
# NOTE(review): `type` and `value` come from a loop preceding this chunk
# (not visible here); `type` also shadows the builtin.
crime_type_dict[type]=value
for predicted in y_pred:
    for key in crime_type_dict.keys():
        if key!=predicted:
            # append 0 for every class that was not the prediction...
            zero_append = crime_type_dict[key]
            zero_append.append(0)
            crime_type_dict[key] = zero_append
        else:
            # ...and 1 for the predicted class
            one_append = crime_type_dict[key]
            one_append.append(1)
            crime_type_dict[key] = one_append
# One row per prediction, one column per crime type.
output = DataFrame(crime_type_dict)
#output.index += 1
output.to_csv('output_predict.csv',sep=',',index_label='Id')
#print("Number of mislabeled points out of a total %d points : %d" % (X.shape[0],(labels != y_pred).sum()))
#s = Series(file_header)
#correlation(training_data,labels)
# for index, row in data_frame.iterrows():
#     list_district.append(get_district_mapping(row['PdDistrict']))
#     list_category.append(get_category_mapping(row['Category']))
#
# print 'Number of Districts',len(list_district)
# print 'Number of Crimes',len(list_category)
#
# colors = cm.rainbow(np.linspace(0,1,len(list_district)))
#
# Plot mean inside/outside readings over ~300 overlapping time windows,
# with a shaded mean +/- std tube for each series.
begin = blairInside['time'][0]
end = blairInside['time'][-1]
duration = end - begin
steps = 300
# Overlapping windows, each two steps wide.
slices = [(begin + duration * step / steps, begin + duration * (step + 2) / steps)
          for step in range(0, steps - 2)]
# BUGFIX: the label of each window is its midpoint,
# winBegin + (winEnd - winBegin) / 2; the old expression used
# (begin - end) / 2 with the shadowed slice bounds, which placed every
# date one full window-width BEFORE the window instead of centred in it.
slicedDates = [winBegin + (winEnd - winBegin) / 2
               for (winBegin, winEnd) in slices]  # re-center date in middle of avg window
idx = pandas.to_datetime(slicedDates, unit='s', utc=True)
df1 = DataFrame(
    {
        'inside': mean_std(blairInside, slices)[0],
        'outside': mean_std(blairOutside, slices)[0]
    },
    index=idx,
    columns=['inside', 'outside'])
df1.plot(kind='line')
plt.gca().xaxis.set_major_formatter(
    matplotlib.dates.DateFormatter('%H:%M', tz=timezone("America/New_York")))


def fill_error_tube(b, color):
    """Shade mean +/- one std of series `b` over the shared windows."""
    (mean, error) = mean_std(b, slices)
    plt.fill_between(df1.index, mean - error, mean + error, color=color)


fill_error_tube(blairInside, [0.5, 0.5, 0.5, 0.5])
fill_error_tube(blairOutside, [0.5, 0.5, 0.5, 0.5])
print("feature-column.py metric=" + args.metric + " out=" + args.out)


def read_ssv(fname):
    """Read a whitespace-separated eval file into rows.

    trec_eval rows arrive as (metric, qid, value); they are reordered to
    galago_eval's (qid, metric, value) shape.
    """
    # BUGFIX: use a context manager instead of leaking the file handle.
    with open(fname, 'r') as f:
        lines = [line.split() for line in f]
    if args.format.lower() == 'galago_eval':
        return lines
    elif args.format.lower() == 'trec_eval':
        return [[line[1], line[0]] + line[2:] for line in lines]


namestsv = read_ssv(args.names)
namesDict = {row[0]: row[2][8:] for row in namestsv}
for run in args.runs:
    # BUGFIX: each run file was read and parsed twice; one pass suffices
    # since both comprehensions filter the same rows.
    tsv = read_ssv(run)
    values = [float(row[2]) for row in tsv if row[0] in namesDict]
    labels = [namesDict[row[0]] for row in tsv if row[0] in namesDict]
    # One bar per named query, one chart per run.
    df2 = DataFrame(values, index=labels, columns=[os.path.basename(run)])
    plt.figure()
    df2.plot(kind='bar', color=['1.0', '0.70', '0.0', '0.50'])
    plt.ylabel(args.metric, fontsize=20)
    plt.tick_params(axis='both', which='major', labelsize=10)
    plt.xticks(rotation=90)
    plt.savefig(args.out + os.path.basename(run) + '.pdf', bbox_inches='tight')
    # plt.show()
def plot_output(name, infile_path, model_names, filter):
    """
    Reads predictions from csv files and generates plots and output csv.

    Input csv files should be in the infile_path with following structure:

    ``infile_path`` /
        ../any_name/
            ../config.csv, test_.csv,train_.csv
        ../any_name2
            ../config.csv, test_.csv,train_.csv

    The function also exports the data used to generate graphs as csv files the following folder:
        ../graph_data
    these csv files can be used to reproduce outputs.

    Parameters
    ----------
    name : string
        name of the csv files to which data will be exported

    infile_path : string
        the folder which contains csv for configs and test and train

    model_names : list
        name of the sub-directories in ``infile_path`` to consider

    filter : callable
        a filter which will be applied in config files to filter which configs should be considered.
        For example, lambda x: x['method'] == 'full' will only consider outputs which used 'full' method
    """
    # NOTE(review): Python 2 code (print statements, dict.iteritems).
    # `filter` shadows the builtin but is part of the public signature.
    # One accumulator dict per metric; keys are config strings, values lists.
    graphs = {
        'SSE': {},
        'MSSE': {},
        'NLPD': {},
        'ER': {},
        'intensity': {},
    }
    # graph_n counts how many model runs contributed to each config string.
    graph_n = {}
    for m in model_names:
        data_config = PlotOutput.read_config(infile_path + m + '/' + model_logging.CONFIG_FILE_NAME)
        if filter is None or filter(data_config):
            data_test = pandas.read_csv(infile_path + m + '/' + model_logging.PREDICTIONS_FILE_NAME)
            cols = data_test.columns
            # Output dimensionality = number of true_Y* columns.
            dim = 0
            for element in cols:
                if element.startswith('true_Y'):
                    dim += 1
            data_train = pandas.read_csv(infile_path + m + '/' + model_logging.TRAINING_FILE_NAME)
            Y_mean = data_train['Y_0'].mean()
            Ypred = np.array([data_test['predicted_Y_%d' % (d)] for d in range(dim)])
            Ytrue = np.array([data_test['true_Y_%d' % (d)] for d in range(dim)])
            Yvar = np.array([data_test['predicted_variance_%d' % (d)] for d in range(dim)])
            if not (PlotOutput.config_to_str(data_config) in graph_n.keys()):
                graph_n[PlotOutput.config_to_str(data_config)] = 0
            graph_n[PlotOutput.config_to_str(data_config)] += 1
            # Which metrics apply depends on the likelihood ('ll') used.
            if data_config['ll'] in [CogLL.__name__]:
                # Multi-output case: MSSE and NLPD per output dimension.
                for i in range(Ytrue.shape[0]):
                    Y_mean = data_train['Y_' + str(i)].mean()
                    PlotOutput.add_to_list(graphs['MSSE'], PlotOutput.config_to_str(data_config) + '_' + str(i),
                                           ((Ypred[i] - Ytrue[i])**2).mean() / ((Y_mean - Ytrue[i]) ** 2).mean())
                    NLPD = np.array(data_test['NLPD_' + str(i)])
                    PlotOutput.add_to_list(graphs['NLPD'], PlotOutput.config_to_str(data_config) + '_' + str(i),
                                           NLPD)
            if data_config['ll'] in [UnivariateGaussian.__name__, WarpLL.__name__]:
                # Regression: standardized squared error and NLPD.
                NLPD = np.array(data_test['NLPD_0'])
                PlotOutput.add_to_list(graphs['SSE'], PlotOutput.config_to_str(data_config),
                                       (Ypred[0] - Ytrue[0])**2 / ((Y_mean - Ytrue[0]) **2).mean())
                PlotOutput.add_to_list(graphs['NLPD'], PlotOutput.config_to_str(data_config), NLPD)
            if data_config['ll'] in [LogisticLL.__name__]:
                # Binary classification: error rate at threshold 0.5 (labels are +/-1).
                NLPD = np.array(data_test['NLPD_0'])
                PlotOutput.add_to_list(graphs['ER'], PlotOutput.config_to_str(data_config),
                                       np.array([(((Ypred[0] > 0.5) & (Ytrue[0] == -1)) |
                                                  ((Ypred[0] < 0.5) & (Ytrue[0] == 1))
                                                  ).mean()]))
                PlotOutput.add_to_list(graphs['NLPD'], PlotOutput.config_to_str(data_config), NLPD)
            if data_config['ll'] in [SoftmaxLL.__name__]:
                # Multi-class: error rate via argmax over output dims.
                NLPD = np.array(data_test['NLPD_0'])
                PlotOutput.add_to_list(graphs['ER'], PlotOutput.config_to_str(data_config),
                                       np.array([(np.argmax(Ytrue, axis=0) != np.argmax(Ypred, axis=0)).mean()]))
                PlotOutput.add_to_list(graphs['NLPD'], PlotOutput.config_to_str(data_config), NLPD)
            if data_config['ll'] in [LogGaussianCox.__name__]:
                # Point process: store [x (rescaled to years), mean, var, true].
                # NOTE(review): 1851.2026 offset presumably maps X to calendar
                # years for this dataset — confirm against the data source.
                X0 = np.array([data_test['X_0']])
                PlotOutput.add_to_list(graphs['intensity'], PlotOutput.config_to_str(data_config),
                                       np.array([X0[0,:]/365+1851.2026, Ypred[0, :], Yvar[0, :], Ytrue[0, :]]).T)
    # Render one figure per non-empty metric and dump the plotted data to csv.
    for n, g in graphs.iteritems():
        if g:
            ion()
            for k in g.keys():
                if k in graph_n.keys():
                    print k, 'n: ', graph_n[k]
            if n in ['SSE', 'NLPD']:
                # Box plot; Series() pads ragged lists with NaN.
                g= DataFrame(dict([(k,Series(v)) for k,v in g.iteritems()]))
                ax = g.plot(kind='box', title=n)
                check_dir_exists('../graph_data/')
                g.to_csv('../graph_data/' + name + '_' + n + '_data.csv', index=False)
            if n in ['ER', 'MSSE']:
                # Bar plot of means with std error bars.
                g= DataFrame(dict([(k,Series(v)) for k,v in g.iteritems()]))
                check_dir_exists('../graph_data/')
                g.to_csv('../graph_data/' + name + '_' + n + '_data.csv', index=False)
                m = g.mean()
                errors = g.std()
                ax =m.plot(kind='bar', yerr=errors, title=n)
                patches, labels = ax.get_legend_handles_labels()
                ax.legend(patches, labels, loc='lower center')
            if n in ['intensity']:
                # Export true intensities once, then one block per model config.
                X = g.values()[0][:, 0]
                true_data = DataFrame({'x': X, 'y': g.values()[0][:, 3]})
                true_data.to_csv('../graph_data/' + name + '_' + 'true_y' + '_data.csv', index=False)
                plt.figure()
                color = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'w']
                c = 0
                check_dir_exists('../graph_data/')
                graph_data = DataFrame()
                for k,v in g.iteritems():
                    # plt.plot(X, v[:, 1], hold=True, color=color[c], label=k)
                    # plt.fill_between(X, v[:, 1] - 2 * np.sqrt(v[:, 2]), v[:, 1] + 2 * np.sqrt(v[:, 2]), alpha=0.2, facecolor=color[c])
                    graph_data = graph_data.append(DataFrame({'x': X, 'm' : v[:, 1], 'v' :v[:, 2],
                                                              'model_sp' :[k] * X.shape[0]} ))
                    c += 1
                plt.legend(loc='upper center')
                graph_data.to_csv('../graph_data/' + name + '_' + n + '_data.csv', index=False)
    show(block=True)
# Train a 2-class skflow DNN on the Titanic training features, report the
# training-set accuracy, then apply the same feature engineering to the
# Kaggle test set and write the submission to result.csv.
random.seed(42)
classifier = skflow.TensorFlowEstimator(model_fn=dnn_tanh, n_classes=2,
    batch_size=128, steps=2000, learning_rate=0.02)
classifier.fit(X=x_data, y=y_data)
# NOTE(review): accuracy is measured on the training data itself.
score = metrics.accuracy_score(y_data, classifier.predict(x_data))
print("Accuracy: %f" % score) #0.823793
test_data = pd.read_csv('../resources/test_titanic.csv',header=0)
x_data2 = test_data[['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']]
# Encode categoricals and fill missing values (must mirror the training
# pre-processing for the model to be applicable).
x_data2.Sex = x_data2.Sex.map( {'female': 1, 'male': 0} )
# NOTE(review): missing ages are filled with ONE random value shared by
# all rows (randint is evaluated once) — likely unintended; verify.
x_data2.Age.fillna(value=random.randint(1, 100),inplace=True)
x_data2.Fare.fillna(value=x_data2.Fare.mean(),inplace=True)
x_data2.Embarked.fillna('C',inplace=True)
x_data2.Embarked = x_data2.Embarked.map( {'C': 1, 'S': 2, 'Q':3, '':4 } )
# Log-transformed copies of each feature (+1 where zeros can occur).
x_data2['Pclass_Log'] = x_data2['Pclass'].map(lambda x: math.log(x))
x_data2['Sex_Log'] = x_data2['Sex'].map(lambda x: math.log(x+1))
x_data2['Age_Log'] = x_data2['Age'].map(lambda x: math.log(x))
x_data2['SibSp_Log'] = x_data2['SibSp'].map(lambda x: math.log(x+1))
x_data2['Parch_Log'] = x_data2['Parch'].map(lambda x: math.log(x+1))
x_data2['Fare_Log'] = x_data2['Fare'].map(lambda x: math.log(x+1))
x_data2['Embarked_Log'] = x_data2['Embarked'].map(lambda x: math.log(x))
test_predict = classifier.predict(x_data2)
test_id = test_data['PassengerId']
data = {'PassengerId':test_data['PassengerId'],'Survived':test_predict}
result = DataFrame(data)
result.to_csv('result.csv',index=False)  # 0.75598 (public leaderboard score)
# __author__ = 'zhangwj'  -- next-module marker fused into this comment by
# the file concatenation.
def read_history(code='600036', begin_date=None, end_date=None):
    """Sync daily candles for one symbol from Xueqiu into the Equity collection.

    Parameters
    ----------
    code : str
        Bare A-share/HK/fund code, or an 8-char already-prefixed symbol.
        The bare form is kept as the Equity document key.
    begin_date, end_date : str or None
        Anything arrow.get accepts; default to 2014-01-01 and now.

    Returns
    -------
    list
        Currently always empty (kept for interface compatibility — the
        StockHistory construction was disabled upstream).
    """
    begin = arrow.get('2014-01-01') if begin_date is None else arrow.get(begin_date)
    end = arrow.now() if end_date is None else arrow.get(end_date)
    # Normalise to Xueqiu's exchange-prefixed symbol for the API call only.
    code2 = code
    if len(code) == 8:
        pass  # already prefixed, e.g. 'SH600036'
    elif code.startswith('60') or code.startswith('51'):
        code2 = 'SH' + code
    elif len(code) == 5:
        code2 = 'HK' + code
    elif len(code) == 6:
        code2 = 'SZ' + code
    # type=before => forward-adjusted (前复权) prices.
    url = '{}/stock/forchartk/stocklist.json?symbol={}&period=1day&type=before&begin={}&end={}'
    url = url.format(api_home, code2, begin.timestamp * 1000, end.timestamp * 1000)
    payload = {'access_token': xq_a_token}
    r = requests.get(url, params=payload, headers=headers)
    data_list = r.json().get('chartlist')
    result = []
    for data in data_list:
        # Xueqiu reports bar times like 'Mon Jan 05 00:00:00 +0800 2015'.
        bar_time = arrow.get(data.get('time'), 'ddd MMM DD HH:mm:ss Z YYYY')
        date = bar_time.format('YYYY-MM-DD')
        # Upsert one candle per (code, date).
        Equity.objects(code=code, date=date).update_one(percent=data.get('percent'),
                                                        open=data.get('open'),
                                                        high=data.get('high'),
                                                        low=data.get('low'),
                                                        close=data.get('close'),
                                                        volume=data.get('volume'),
                                                        upsert=True)
    # Cleanup: removed debug prints, dead commented-out StockHistory code,
    # and unused turnover/volume statistics the old version computed.
    return result
# Bucket per-query metric values into difficulty bands (queriesDiff maps a
# band label to its (query, x) pairs) and plot one grouped bar per run.
seriesDict = {key: dict() for key in queriesDiff}
for run in datas:
    data = datas[run]
    # NOTE(review): mean and stderr are computed but never used below —
    # looks like leftovers from an earlier version.
    mean = np.average([data[key] for key in queries])
    stderr = np.std([data[key] for key in queries]) / sqrt(len(queries))
    for (label, queriesByD) in queriesDiff.items():
        # Average the metric over the queries that fall in this band.
        seriesDict[label][run] = np.average(
            [data[key] for (key, x) in queriesByD])
print("dropping queries because of NaN values: " +
      " ".join(queriesWithNanValues))
# Rows = runs, columns = difficulty bands (fixed display order).
df1 = DataFrame(seriesDict,
                columns=("0%-5%", "5%-25%", '25%-50%', '50%-75%', '75%-95%',
                         '95%-100%'),
                index=args.runs)
df2 = df1
df2.index = [os.path.basename(label) for label in df1.index]
# Transpose so bands go on the x axis, grouped by run.
df3 = df2.transpose()
plt.figure()
df3.plot(kind='bar', label=args.metric,
         color=['0.0', '0.80', '0.4', '0.9', '0.70'])
leg = plt.legend(loc='best', fancybox=True)
leg.get_frame().set_alpha(0.5)
plt.tick_params(axis='both', which='major', labelsize=11)
plt.xticks(rotation=0)
plt.ylabel(args.metric, fontsize=20)
y_data = ori_data['Survived'] # specify parameters via map param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic' } num_round = 2 dtrain = xgb.DMatrix(np.array(x_data),np.array(y_data)) bst = xgb.train(param, dtrain, num_round) test_data = pd.read_csv('../resources/test_titanic.csv',header=0) x_data2 = test_data[['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']] x_data2.Sex = x_data2.Sex.map( {'female': 1, 'male': 0} ) x_data2.Age.fillna(value=random.randint(1, 100),inplace=True) x_data2.Fare.fillna(value=x_data2.Fare.mean(),inplace=True) x_data2.Embarked.fillna('C',inplace=True) x_data2.Embarked = x_data2.Embarked.map( {'C': 1, 'S': 2, 'Q':3, '':4 } ) x_data2['Pclass_Log'] = x_data2['Pclass'].map(lambda x: math.log(x)) x_data2['Sex_Log'] = x_data2['Sex'].map(lambda x: math.log(x+1)) x_data2['Age_Log'] = x_data2['Age'].map(lambda x: math.log(x)) x_data2['SibSp_Log'] = x_data2['SibSp'].map(lambda x: math.log(x+1)) x_data2['Parch_Log'] = x_data2['Parch'].map(lambda x: math.log(x+1)) x_data2['Fare_Log'] = x_data2['Fare'].map(lambda x: math.log(x+1)) x_data2['Embarked_Log'] = x_data2['Embarked'].map(lambda x: math.log(x)) dtest = xgb.DMatrix(np.array(x_data2)) preds = bst.predict(dtest) # print(preds) data = {'PassengerId':test_data['PassengerId'],'Survived':preds} result = DataFrame(data) # result.to_csv('result.csv',index=False)#0.75598
def main():
    """Read evaluation output for several runs, print per-run mean and
    standard error of ``args.metric``, and save a bar chart (stderr error
    bars plus paired-t significance markers) to ``args.out``.

    NOTE(review): relies on module-level names defined elsewhere in the
    file: ``parser``, ``pairedttest``, ``mpl``, ``plt``, ``np``, ``pd``,
    ``sqrt``, ``itertools``.
    """
    # Non-interactive backend so the figure can be saved headlessly.
    mpl.use("Agg")

    def read_ssv(fname):
        # Parse a whitespace-separated eval file. trec_eval rows are
        # reordered to (query-id, metric, value) to match galago_eval.
        lines = [line.split() for line in open(fname, 'r')]
        if args.format.lower() == 'galago_eval':
            return lines
        elif args.format.lower() == 'trec_eval':
            return [[line[1], line[0]] + line[2:] for line in lines]

    def readNumQueries(run):
        # Value of the numQueries_key ("num_q") row on the 'all' summary line.
        tsv = read_ssv(run)
        data = [int(row[2]) for row in tsv
                if row[0] == "all" and row[1] == numQueries_key]
        return data[0]

    def findQueriesWithNanValues(run):
        # Queries whose 'num_rel' is zero or NaN; these get dropped below.
        tsv = read_ssv(run)
        # print ("tsv,", tsv)
        queriesWithNan = {row[0] for row in tsv
                          if row[1] == 'num_rel' and
                          (float(row[2]) == 0.0 or
                           math.isnan(float(row[2])))}
        return queriesWithNan

    def fetchValues(run):
        # Map query-id -> metric value for args.metric (NaN entries skipped).
        tsv = read_ssv(run)
        data = {row[0]: float(row[2]) for row in tsv
                if row[1] == args.metric and
                not math.isnan(float(row[2]))}
        return data

    args = parser.parse_args()
    pairedt = pairedttest.pairedt(best=True, format=args.format,
                                  metric=args.metric, runs=args.runs)
    print("paired t")
    print(pairedt)
    print("=-----=")
    numQueries_key = "num_q"
    print("column.py metric="+args.metric+" out="+args.out)
    datas = {run: fetchValues(run) for run in args.runs}
    # deal with nans: drop the 'all' summary row plus any query that has a
    # zero/NaN num_rel in *any* run
    queriesWithNanValues = {'all'}.union(
        *[findQueriesWithNanValues(run) for run in args.runs])
    basedata = datas[args.runs[0]]
    queries = set(basedata.keys()).difference(queriesWithNanValues)
    # With args.c, normalize by the file's official query count; otherwise
    # by the number of queries actually kept.
    numQueries = readNumQueries(args.runs[0]) if args.c else len(queries)
    seriesDict = {'mean': dict(), 'stderr': dict()}
    for run in datas:
        data = datas[run]
        if sum(not key in data for key in queries) > 0:
            print("data for run "+run+" does not contain all queries "
                  + " ".join(queries))
        # Queries missing from this run count as 0.0 in mean and stddev.
        mean = np.sum([data.get(key, 0.0) for key in queries]) / numQueries
        stderr = np.std([data.get(key, 0.0) for key in queries]
                        + ([0.0] * (numQueries - len(queries)))) \
            / sqrt(numQueries)
        seriesDict['mean'][run] = mean
        seriesDict['stderr'][run] = stderr
    print("dropping queries because of NaN values: "
          + " ".join(queriesWithNanValues))
    print('\t'.join(['run', 'mean/stderr']))
    for run in datas:
        # if not run == args.runs[0]:
        print('\t'.join([run, str(seriesDict['mean'][run]),
                         str(seriesDict['stderr'][run])]))
    df1 = DataFrame(seriesDict, index=pd.Index(args.runs))
    if args.sort:
        df1.sort_values('mean', ascending=False, inplace=True)
    df2 = df1['mean']
    df2.index = [os.path.basename(label) for label in df1.index]
    df1.index = [os.path.basename(label) for label in df1.index]
    print('df2.index=', df2.index)
    # '**' marks runs NOT significantly different from the best (p > 0.05
    # or NaN p-value from the paired t-test).
    df2.text = ['**' if (math.isnan(pairedt[label][1])
                         or pairedt[label][1] > 0.05) else ''
                for label in df2.index]
    # Largest index among runs statistically tied with the best (0 if none).
    min_same_idx = max([i if (math.isnan(pairedt[label][1])
                              or pairedt[label][1] > 0.05) else 0
                        for i, label in enumerate(df2.index)])
    # One gray level per 3-character run-name prefix.
    cs = {k: v for k, v in zip(
        sorted(list(set([label[0:3] for label in df1.index]))),
        itertools.cycle(['0.1', '0.9', '0.5', '0.3', '0.7', '0.2', '0.8',
                         '0.4', '0.6']))}
    df1['color'] = [cs[label[0:3]] for label in df1.index]
    print(df1['color'])
    plt.tick_params(colors=df1.color)
    fig, ax = plt.subplots()
    df2.plot.bar(yerr=df1['stderr'], color=df1.color.values, ax=ax)
    for (p, i) in zip(ax.patches, range(100)):
        if args.sort:
            # When sorted, draw an arrow spanning the runs tied with best.
            if i == min_same_idx:
                frompoint = (p.get_x() + p.get_width(),
                             p.get_height() / 2.0)
                topoint = (0.0 - p.get_width() / 2.0, p.get_height() / 2.0)
                ax.annotate("", xy=topoint, xycoords='data',
                            xytext=frompoint, textcoords='data',
                            arrowprops=dict(arrowstyle="<|-|>",
                                            connectionstyle="arc3",
                                            ec='r'), )
        else:
            # Otherwise annotate each bar with its significance marker.
            ax.annotate(df2.text[i],
                        xy=(p.get_x() + p.get_width() / 2.0,
                            p.get_height() * 0.9),
                        ha='center', va='center', )
    ax.grid()
    plt.ylabel(args.metric, fontsize=20)
    plt.tick_params(axis='both', which='major', labelsize=20)
    plt.xticks(rotation=90)
    plt.savefig(args.out, bbox_inches='tight')
# Ordinal-encode the categorical columns of the "car evaluation" dataset,
# split it into a label frame and a feature frame, and set up the three
# decision-tree constructors to be compared.
# NOTE(review): `df` and `mapping_buy_maint` are defined earlier in the
# file (outside this excerpt), and the body of the trailing for-loop
# continues beyond it.
mapping_doors = {'2': 0, '3': 1, '4': 2, '5more': 3}
mapping_persons = {'2': 0, '4': 1, 'more': 2}
mapping_lug = {'small': 0, 'med': 1, 'big': 2}
mapping_safety = {'low': 0, 'med': 1, 'high': 2}
mapping_class = {'unacc': 1, 'acc': 2, 'good': 3, 'vgood': 4}
# map every categorical column to its integer code
df['maint'] = df['maint'].map(mapping_buy_maint)
df['buying'] = df['buying'].map(mapping_buy_maint)
df['doors'] = df['doors'].map(mapping_doors)
df['persons'] = df['persons'].map(mapping_persons)
df['lug_boot'] = df['lug_boot'].map(mapping_lug)
df['safety'] = df['safety'].map(mapping_safety)
df['class'] = df['class'].map(mapping_class).astype(int)
df = df.reset_index(drop=True)
# labels in their own frame; features are everything except 'class'
labels_df = DataFrame()
labels_df['cat'] = df['class'].copy()
features_df = df.copy()
features_df = features_df.drop('class', axis=1)
# the three tree-induction algorithms under comparison
c45 = C45Constructor(cf=0.95)
cart = CARTConstructor(max_depth=12, min_samples_leaf=2)
quest = QuestConstructor(default=1, max_nr_nodes=1, discrete_thresh=10,
                         alpha=0.99)
tree_constructors = [c45, cart, quest]
tree_confusion_matrices = {}
for tree_constructor in tree_constructors:
# Record this run's statistics, echo a tab-separated per-run table, and
# save a bar chart of the mean metric with stderr error bars.
# NOTE(review): `mean`, `stderr`, `run`, `seriesDict`, `datas` and `args`
# come from earlier in the file (outside this excerpt).
seriesDict['mean'][run] = mean
seriesDict['stderr'][run] = stderr

print("dropping queries because of NaN values: "
      + " ".join(queriesWithNanValues))
print('\t'.join(['run', 'mean/stderr']))
for run in datas:
    # the baseline run (first on the command line) is not echoed
    if run != args.runs[0]:
        row = [run,
               str(seriesDict['mean'][run]),
               str(seriesDict['stderr'][run])]
        print('\t'.join(row))

df1 = DataFrame(seriesDict, index=args.runs)
df2 = df1['mean']
# label bars with the basename of each run path
df2.index = [os.path.basename(label) for label in df1.index]

plt.figure()
# alternating grayscale bars (cycled by matplotlib if there are more runs)
bar_colors = ['0.0', '0.6', '0.4', '0.8', '0.4',
              '0.8', '0.4', '0.8', '0.4', '0.8']
df2.plot(kind='bar', yerr=df1['stderr'], color=bar_colors)
plt.ylabel(args.metric, fontsize=20)
plt.tick_params(axis='both', which='major', labelsize=20)
plt.xticks(rotation=90)
plt.savefig(args.out, bbox_inches='tight')
def plot_output(name, infile_path, model_names, filter):
    """
    Reads predictions from csv files and generates plots and output csv.
    Input csv files should be in the infile_path with following structure:

    ``infile_path`` /
        ../any_name/
            ../config.csv, test_.csv,train_.csv
        ../any_name2
            ../config.csv, test_.csv,train_.csv

    The function also exports the data used to generate graphs as csv files
    the following folder: ../graph_data
    these csv files can be used to reproduce outputs.

    Parameters
    ----------
    name : string
        name of the csv files to which data will be exported

    infile_path : string
        the folder which contains csv for configs and test and train

    model_names : list
        name of the sub-directories in ``infile_path`` to consider

    filter : callable
        a filter which will be applied in config files to filter which
        configs should be considered. For example, lambda x: x['method'] ==
        'full' will only consider outputs which used 'full' method
    """
    # NOTE(review): Python 2 code (`print` statement, dict.iteritems()).
    # PlotOutput, model_logging, the likelihood classes (CogLL, WarpLL, ...)
    # and check_dir_exists are defined elsewhere in the project.
    # One value-list per metric; filled below depending on the likelihood
    # type recorded in each model's config.
    graphs = {
        'SSE': {},
        'MSSE': {},
        'NLPD': {},
        'ER': {},
        'intensity': {},
    }
    # config-string -> number of models that contributed to it
    graph_n = {}
    for m in model_names:
        data_config = PlotOutput.read_config(
            infile_path + m + '/' + model_logging.CONFIG_FILE_NAME)
        if filter is None or filter(data_config):
            data_test = pandas.read_csv(
                infile_path + m + '/' + model_logging.PREDICTIONS_FILE_NAME)
            cols = data_test.columns
            # output dimensionality = number of 'true_Y*' columns
            dim = 0
            for element in cols:
                if element.startswith('true_Y'):
                    dim += 1
            data_train = pandas.read_csv(infile_path + m + '/' +
                                         model_logging.TRAINING_FILE_NAME)
            Y_mean = data_train['Y_0'].mean()
            Ypred = np.array(
                [data_test['predicted_Y_%d' % (d)] for d in range(dim)])
            Ytrue = np.array(
                [data_test['true_Y_%d' % (d)] for d in range(dim)])
            Yvar = np.array([
                data_test['predicted_variance_%d' % (d)] for d in range(dim)
            ])
            if not (PlotOutput.config_to_str(data_config) in graph_n.keys()):
                graph_n[PlotOutput.config_to_str(data_config)] = 0
            graph_n[PlotOutput.config_to_str(data_config)] += 1
            # per-dimension MSSE and NLPD for multi-output CogLL models
            if data_config['ll'] in [CogLL.__name__]:
                for i in range(Ytrue.shape[0]):
                    Y_mean = data_train['Y_' + str(i)].mean()
                    PlotOutput.add_to_list(
                        graphs['MSSE'],
                        PlotOutput.config_to_str(data_config) + '_' + str(i),
                        ((Ypred[i] - Ytrue[i])**2).mean() /
                        ((Y_mean - Ytrue[i])**2).mean())
                    NLPD = np.array(data_test['NLPD_' + str(i)])
                    PlotOutput.add_to_list(
                        graphs['NLPD'],
                        PlotOutput.config_to_str(data_config) + '_' + str(i),
                        NLPD)
            # standardized SSE + NLPD for (warped) Gaussian regression
            if data_config['ll'] in [
                    UnivariateGaussian.__name__, WarpLL.__name__
            ]:
                NLPD = np.array(data_test['NLPD_0'])
                PlotOutput.add_to_list(
                    graphs['SSE'], PlotOutput.config_to_str(data_config),
                    (Ypred[0] - Ytrue[0])**2 /
                    ((Y_mean - Ytrue[0])**2).mean())
                PlotOutput.add_to_list(
                    graphs['NLPD'], PlotOutput.config_to_str(data_config),
                    NLPD)
            # error rate + NLPD for binary classification (labels in {-1, 1})
            if data_config['ll'] in [LogisticLL.__name__]:
                NLPD = np.array(data_test['NLPD_0'])
                PlotOutput.add_to_list(
                    graphs['ER'], PlotOutput.config_to_str(data_config),
                    np.array([
                        (((Ypred[0] > 0.5) & (Ytrue[0] == -1)) |
                         ((Ypred[0] < 0.5) & (Ytrue[0] == 1))).mean()
                    ]))
                PlotOutput.add_to_list(
                    graphs['NLPD'], PlotOutput.config_to_str(data_config),
                    NLPD)
            # error rate + NLPD for multi-class (argmax over dimensions)
            if data_config['ll'] in [SoftmaxLL.__name__]:
                NLPD = np.array(data_test['NLPD_0'])
                PlotOutput.add_to_list(
                    graphs['ER'], PlotOutput.config_to_str(data_config),
                    np.array([(np.argmax(Ytrue, axis=0) != np.argmax(
                        Ypred, axis=0)).mean()]))
                PlotOutput.add_to_list(
                    graphs['NLPD'], PlotOutput.config_to_str(data_config),
                    NLPD)
            # intensity curves for the log-Gaussian Cox process; the X
            # rescaling (/365 + 1851.2026) presumably converts day offsets
            # to calendar years for this dataset — TODO confirm.
            if data_config['ll'] in [LogGaussianCox.__name__]:
                X0 = np.array([data_test['X_0']])
                PlotOutput.add_to_list(
                    graphs['intensity'],
                    PlotOutput.config_to_str(data_config),
                    np.array([
                        X0[0, :] / 365 + 1851.2026, Ypred[0, :], Yvar[0, :],
                        Ytrue[0, :]
                    ]).T)
    # One figure + exported csv per non-empty metric collection.
    for n, g in graphs.iteritems():
        if g:
            ion()
            for k in g.keys():
                if k in graph_n.keys():
                    print k, 'n: ', graph_n[k]
            if n in ['SSE', 'NLPD']:
                # box plot over per-query values
                g = DataFrame(
                    dict([(k, Series(v)) for k, v in g.iteritems()]))
                ax = g.plot(kind='box', title=n)
                check_dir_exists('../graph_data/')
                g.to_csv('../graph_data/' + name + '_' + n + '_data.csv',
                         index=False)
            if n in ['ER', 'MSSE']:
                # bar plot of means with std error bars
                g = DataFrame(
                    dict([(k, Series(v)) for k, v in g.iteritems()]))
                check_dir_exists('../graph_data/')
                g.to_csv('../graph_data/' + name + '_' + n + '_data.csv',
                         index=False)
                m = g.mean()
                errors = g.std()
                ax = m.plot(kind='bar', yerr=errors, title=n)
                patches, labels = ax.get_legend_handles_labels()
                ax.legend(patches, labels, loc='lower center')
            if n in ['intensity']:
                # export true intensity once, then one (x, mean, var) block
                # per model configuration
                X = g.values()[0][:, 0]
                true_data = DataFrame({'x': X, 'y': g.values()[0][:, 3]})
                true_data.to_csv('../graph_data/' + name + '_' + 'true_y' +
                                 '_data.csv', index=False)
                plt.figure()
                color = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'w']
                c = 0
                check_dir_exists('../graph_data/')
                graph_data = DataFrame()
                for k, v in g.iteritems():
                    # plt.plot(X, v[:, 1], hold=True, color=color[c], label=k)
                    # plt.fill_between(X, v[:, 1] - 2 * np.sqrt(v[:, 2]), v[:, 1] + 2 * np.sqrt(v[:, 2]), alpha=0.2, facecolor=color[c])
                    graph_data = graph_data.append(
                        DataFrame({
                            'x': X,
                            'm': v[:, 1],
                            'v': v[:, 2],
                            'model_sp': [k] * X.shape[0]
                        }))
                    c += 1
                plt.legend(loc='upper center')
                graph_data.to_csv('../graph_data/' + name + '_' + n +
                                  '_data.csv', index=False)
            # NOTE(review): indentation of this call was ambiguous in the
            # collapsed source; it is placed at the per-metric level here.
            show(block=True)
# Plot Blair inside vs. outside temperatures as smoothed lines with a
# +/- 1 stddev error tube, averaged over overlapping time windows.
# NOTE(review): `mean_std`, `plt`, `matplotlib` and `timezone` come from
# earlier in the file (outside this excerpt).
blairInsideAll = genfromtxt('blair-inside.tsv', dtype=None,
                            names='time,sid,mid,value')
blairOutsideAll = genfromtxt('blair-outside.tsv', dtype=None,
                             names='time,sid,mid,value')


def filt(b):
    """Keep rows with a non-zero timestamp coming from sensor id 2."""
    # (was a lambda; PEP 8 E731 prefers a def for named callables)
    return b[np.logical_and(b['time'] != 0, b['sid'] == 2)]


blairOutside = filt(blairOutsideAll)
blairInside = filt(blairInsideAll)

begin = blairInside['time'][0]
end = blairInside['time'][-1]
duration = end - begin
steps = 300
# Overlapping averaging windows, each two steps wide.
slices = [(begin + duration * step / steps,
           begin + duration * (step + 2) / steps)
          for step in range(0, steps - 2)]
# Re-center each date in the middle of its averaging window.
# BUG FIX: the original computed `lo + (lo - hi) / 2`, which places the
# point *before* the window start; the midpoint is `lo + (hi - lo) / 2`.
slicedDates = [lo + (hi - lo) / 2 for (lo, hi) in slices]
idx = pandas.to_datetime(slicedDates, unit='s', utc=True)

df1 = DataFrame({'inside': mean_std(blairInside, slices)[0],
                 'outside': mean_std(blairOutside, slices)[0]},
                index=idx, columns=['inside', 'outside'])
df1.plot(kind='line')
plt.gca().xaxis.set_major_formatter(
    matplotlib.dates.DateFormatter('%H:%M',
                                   tz=timezone("America/New_York")))


def fill_error_tube(b, color):
    """Shade mean +/- one standard deviation around a series."""
    (mean, error) = mean_std(b, slices)
    plt.fill_between(df1.index, mean - error, mean + error, color=color)


fill_error_tube(blairInside, [0.5, 0.5, 0.5, 0.5])
fill_error_tube(blairOutside, [0.5, 0.5, 0.5, 0.5])

plt.ylabel("Temperatur [Celsius]", fontsize=20)
plt.xlabel("Time [Hours]", fontsize=20)