def fit(self):
    # Single-layer softmax regression: logits = x * w + b.
    w = tf.Variable(tf.zeros([self.x_train.shape[1], self.y_train.shape[1]]))
    b = tf.Variable(tf.zeros([self.y_train.shape[1]]))
    activation = tf.nn.softmax(tf.matmul(self.x, w) + b)
    # Cross-entropy loss, minimized with plain gradient descent.
    cost = -tf.reduce_sum(self.y * tf.log(activation))
    optimizer = tf.train.GradientDescentOptimizer(self.learning_rate).minimize(cost)
    self.init = tf.initialize_all_variables()
    with tf.Session() as sess:
        sess.run(self.init)
        for epoch in range(self.training_epochs):
            avg_cost = 0.
            if self.batch_size == -1:
                # Default to ten mini-batches when no batch size was given.
                self.batch_size = int(self.x_train.shape[0] / 10)
            total_batch = int(self.x_train.shape[0] / self.batch_size)
            for i in range(total_batch):
                batch_xs = self.x_train[i * self.batch_size: (i + 1) * self.batch_size]
                batch_ys = self.y_train[i * self.batch_size: (i + 1) * self.batch_size]
                sess.run(optimizer, feed_dict={self.x: batch_xs, self.y: batch_ys})
                avg_cost += sess.run(cost, feed_dict={self.x: batch_xs, self.y: batch_ys}) / total_batch
        ZLog.info("Optimization Finished!")
        self.pred = tf.argmax(activation, 1)
        if self.x_test is not None:
            correct_prediction = tf.equal(self.pred, tf.argmax(self.y, 1))
            accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
            ZLog.info("Accuracy:" + str(accuracy.eval({self.x: self.x_test, self.y: self.y_test})))
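# Note: computing -sum(y * log(softmax(z))) in two steps, as above, can yield
# NaN when a softmax output underflows to exactly 0. The fused
# tf.nn.softmax_cross_entropy_with_logits (used in the multilayer fit below)
# avoids this via the log-sum-exp trick. A standalone NumPy sketch of that
# trick, purely illustrative and not part of the project code:
import numpy as np

def stable_softmax_xent(logits, labels):
    # log(softmax(z)) = z - logsumexp(z); shifting by the row max keeps
    # np.exp from overflowing on large logits.
    shifted = logits - logits.max(axis=1, keepdims=True)
    log_softmax = shifted - np.log(np.exp(shifted).sum(axis=1, keepdims=True))
    return -(labels * log_softmax).sum(axis=1).mean()

logits = np.array([[1000.0, 0.0], [0.0, 1000.0]])  # naive softmax would overflow
labels = np.array([[1.0, 0.0], [0.0, 1.0]])
print(stable_softmax_xent(logits, labels))  # 0.0, no NaN/inf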
def gmm_component_filter(self, nc=20, threshold=0.72, show=True):
    # Cluster the fit target with a GMM, then rank trades by profit_cg:
    # top quartile -> rk=1 (win), bottom quartile -> rk=-1 (loss).
    clf = GMM(nc, n_iter=500, random_state=3).fit(self.fiter.y)
    ss = clf.predict(self.fiter.y)
    self.fiter.df['p_rk_cg'] = self.fiter.df['profit_cg'].rank()
    self.fiter.df['ss'] = ss
    win_top = len(self.fiter.df['profit_cg']) - len(self.fiter.df['profit_cg']) * 0.25
    loss_top = len(self.fiter.df['profit_cg']) * 0.25
    self.fiter.df['rk'] = 0
    self.fiter.df['rk'] = np.where(self.fiter.df['p_rk_cg'] > win_top, 1, self.fiter.df['rk'])
    self.fiter.df['rk'] = np.where(self.fiter.df['p_rk_cg'] < loss_top, -1, self.fiter.df['rk'])
    # Cross-tabulate GMM component against win/neutral/loss rank.
    xt = pd.crosstab(self.fiter.df['ss'], self.fiter.df['rk'])
    xt_pct = xt.div(xt.sum(1).astype(float), axis=0)
    if show:
        xt_pct.plot(figsize=(16, 8), kind='bar', stacked=True, title='ss -> result')
        plt.xlabel('ss')
        plt.ylabel('result')
    ZLog.info(xt_pct[xt_pct[-1] > threshold])
    ZLog.info(xt_pct[xt_pct[1] > threshold])
    # Components dominated by losses / wins beyond the threshold.
    self.top_loss_ss = xt_pct[xt_pct[-1] > threshold].index
    self.top_win_ss = xt_pct[xt_pct[1] > threshold].index
    return xt, xt_pct
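# A standalone sketch of the technique above — GMM clustering followed by a
# component -> win/loss crosstab — using synthetic data and the modern
# sklearn GaussianMixture (the GMM class above is the older, since-removed
# sklearn.mixture.GMM). All names and numbers here are illustrative only.
import numpy as np
import pandas as pd
from sklearn.mixture import GaussianMixture

rng = np.random.RandomState(3)
profit = rng.randn(500)
# Assign each trade's profit to one of 5 mixture components.
ss = GaussianMixture(n_components=5, random_state=3).fit_predict(profit.reshape(-1, 1))
rank = pd.Series(profit).rank()
# Top quartile -> 1, bottom quartile -> -1, middle -> 0, as in the method above.
rk = np.where(rank > len(profit) * 0.75, 1, np.where(rank < len(profit) * 0.25, -1, 0))
xt = pd.crosstab(pd.Series(ss, name='ss'), pd.Series(rk, name='rk'))
xt_pct = xt.div(xt.sum(axis=1).astype(float), axis=0)
print(xt_pct[xt_pct[1] > 0.5])  # components dominated by top-quartile trades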
def judge(self, **kwargs):
    # 'col in kwargs' replaces the Python 2-only kwargs.has_key().
    w_col = ('deg_hisWindowPd', 'deg_windowPd', 'deg_60WindowPd', 'lowBkCnt',
             'wave_score1', 'wave_score2', 'wave_score3')
    for col in w_col:
        if col not in kwargs:
            ZLog.info('judge dlw kwargs error!')
            return
    if not hasattr(self, 'estimator') or not hasattr(
            self, MlFiterDlwJudgeClass.K_GOLDEN_DEG_PROB):
        # Only log for now; raise an exception here if it becomes necessary.
        ZLog.info('not estimator or prob')
        return
    w = np.array([kwargs[col] for col in w_col])
    prob_threshold = self.prob_threshold
    estimator = self.estimator
    # Probability of the positive class from the fitted estimator.
    prob = estimator.predict_proba(w.reshape(1, -1))[:, 1][0]
    return prob > prob_threshold
def print_progress(ind, last=False):
    if last or (ind > 0 and ind % K_PRINT_ITER == 0):
        ZLog.info('Iteration %d/%d\n' % (ind + 1, iter_n))
        ZLog.debug(' content loss: %g\n' % org_loss.eval())
        ZLog.debug(' style loss: %g\n' % style_loss.eval())
        ZLog.debug(' tv loss: %g\n' % tv_loss.eval())
        ZLog.debug(' total loss: %g\n' % loss.eval())
def show_orders_hist(order_pd, s_list=None, q_default=10):
    if s_list is None:
        s_list = ['lowBkCnt', 'atr_std', 'jump_power', 'diff_days',
                  'wave_score1', 'wave_score2', 'wave_score3',
                  'deg_60WindowPd', 'deg_hisWindowPd', 'deg_windowPd']
    # Keep only the columns that actually exist in order_pd.
    s_list = filter(lambda x: order_pd.columns.tolist().count(x) > 0, s_list)
    for sn in s_list:
        uq = len(np.unique(order_pd[sn]))
        if uq == 1:
            continue
        bins = 10
        bins = uq // 50 if uq // 50 > bins else bins
        order_pd[sn].hist(bins=bins)
        plt.show()
        try:
            cats = pd.qcut(order_pd[sn], q_default)
        except Exception:
            # Some value repeats so often that qcut cannot form q distinct
            # bins; fall back to quantiles over the unique values.
            import pandas.core.algorithms as algos
            bins = algos.quantile(np.unique(order_pd[sn]), np.linspace(0, 1, q_default + 1))
            cats = pd.tools.tile._bins_to_cuts(order_pd[sn], bins, include_lowest=True)
            # ZLog.info(sn + ' qcut except use bins!')
        ZLog.info('{0} show hist and qcuts'.format(sn))
        ZLog.info(cats.value_counts())
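# The try/except fallback above exists because pd.qcut raises when a value
# repeats so often that q distinct bin edges cannot be formed. Pandas 0.20+
# exposes this directly via duplicates='drop'; a minimal sketch (synthetic
# series, illustrative only):
import pandas as pd

s = pd.Series([0, 0, 0, 0, 0, 0, 1, 2, 3, 4])
# pd.qcut(s, 4) would raise "Bin edges must be unique"; dropping the
# collapsed edges yields fewer, but valid, bins.
cats = pd.qcut(s, 4, duplicates='drop')
print(cats.value_counts())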
def calc_above(self):
    orderPd = self.orderPd
    # Orders with diff_days == 0 or beyond the threshold.
    order_outer_diff_days = orderPd[(orderPd['diff_days'] == 0) |
                                    (orderPd['diff_days'] > self.dd_threshold)]
    # float() guards against integer division under Python 2.
    self.above = order_outer_diff_days.result.value_counts()[1].sum() / float(
        order_outer_diff_days.result.value_counts().sum())
    ZLog.info('above win rate: ' + str(self.above))
    return self.above
def calc_below(self):
    # Orders with 0 < diff_days <= dd_threshold.
    order_diff_days = self.orderPd[(self.orderPd['diff_days'] > 0) &
                                   (self.orderPd['diff_days'] <= self.dd_threshold)]
    # float() guards against integer division under Python 2.
    self.below = order_diff_days.result.value_counts()[1].sum() / float(
        order_diff_days.result.value_counts().sum())
    ZLog.info('below win rate: ' + str(self.below))
    return self.below
def check_golden_mc_result(filter_ret, w_rate, symbol=None, bp=24.3, pf_cnt=200,
                           loop_cnt=20000, p_outter_loop=1):
    """
    Visually verify a small number of the filtered results at the end.
    :param filter_ret:
    :param w_rate:
    :param symbol:
    :param bp:
    :param pf_cnt:
    :param loop_cnt:
    :param p_outter_loop:
    :return:
    """
    if symbol is None:
        kl_pd = SymbolPd.make_kfold_pd('usNOAH')[-82:-40]
    else:
        kl_pd = SymbolPd.make_kfold_pd(symbol)
    for loc in np.arange(filter_ret.shape[0]):
        percents = ast.literal_eval(filter_ret['keys'].iloc[loc])
        ZLog.info(loc)
        ZLog.info(percents)
        loss_percent = np.array(percents[0:4])
        win_percent = np.array(percents[4:9])
        profits_dict = {}
        golden_tuple = namedtuple('golden', (
            'below200', 'below250', 'below300', 'below382',
            'above618', 'above700', 'above800', 'above900', 'above950'))
        _golden_mc_process_cmp(kl_pd, loss_percent, win_percent, golden_tuple,
                               profits_dict, w_rate, bp, pf_cnt, p_outter_loop,
                               loop_cnt, show=True)
def fit_img(self, img_path, resize=False, size=480, enhance=None, iter_n=10, **kwargs):
    # Base-class stub: subclasses of TensorPrismaClass are expected to
    # override fit_img.
    ZLog.info('TensorPrismaClass: fit_img not implemented!')
def predict_kwargs(self, w_col, need_ind_cnt=1, **kwargs):
    for col in w_col:
        if col not in kwargs:
            ZLog.info('judge kwargs error!')
            return
    x = np.array([kwargs[col] for col in w_col])
    x = x.reshape(1, -1)
    return self.predict(x, need_ind_cnt) == 1
def predict_hit_kwargs(self, w_col, **kwargs):
    for col in w_col:
        if col not in kwargs:
            ZLog.info('judge kwargs error!')
            return
    x = np.array([kwargs[col] for col in w_col])
    x = x.reshape(1, -1)
    return self.hit_cnt(x)
def _do_cross_val_score(self, x, y, cv, scoring):
    fiter = self.get_fiter()
    scores = cross_validation.cross_val_score(fiter, x, y, cv=cv, scoring=scoring)
    # sklearn negates squared-error scores, so flip the sign back and take
    # the square root to report RMSE; otherwise report the plain mean score.
    mean_sc = np.mean(np.sqrt(-scores)) if scoring == 'mean_squared_error' \
        else np.mean(scores)
    ZLog.info(scoring + ' mean: ' + str(mean_sc))
    return scores
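# Why np.sqrt(-scores): for squared-error scoring sklearn returns *negated*
# MSE so that "greater is better" holds for every scorer; negating back and
# taking the square root gives RMSE. A self-contained sketch with the modern
# module path and scorer name (this file's cross_validation module and its
# 'mean_squared_error' string are the pre-0.18 sklearn spellings):
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

rng = np.random.RandomState(0)
x = np.arange(40, dtype=float).reshape(-1, 1)
y = 3.0 * x.ravel() + rng.randn(40)
scores = cross_val_score(LinearRegression(), x, y, cv=5,
                         scoring='neg_mean_squared_error')
print('RMSE per fold:', np.sqrt(-scores))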
def make_boost_dummies(self, orderPd, cats_ss, prefix, regex):
    try:
        cats = pd.qcut(cats_ss, self.qcut_bins)
    except Exception:
        # Some value repeats so often that qcut cannot form the requested
        # number of distinct bins; fall back to quantiles over unique values.
        import pandas.core.algorithms as algos
        bins = algos.quantile(np.unique(cats_ss), np.linspace(0, 1, self.qcut_bins + 1))
        cats = pd.tools.tile._bins_to_cuts(cats_ss, bins, include_lowest=True)
        ZLog.info(prefix + ' qcut except use bins!')
def do_snn_tt(cls, x, y, n_folds=10, nn_hdim=3, num_passes=20000, print_loss=False):
    kf = KFold(len(y), n_folds=n_folds, shuffle=True)
    acs = list()
    for i, (train_index, test_index) in enumerate(kf):
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        m_l__t_f = cls(x_train, y_train, x_test, y_test, nn_hdim=nn_hdim,
                       num_passes=num_passes, print_loss=print_loss)
        accuracy = m_l__t_f.fit()
        acs.append(accuracy)
    ZLog.info('accuracy mean = {}'.format(np.array(acs).mean()))
def judge(self, **kwargs):
    for w in MlFiterDegPd.g_w_col:
        if w not in kwargs:
            ZLog.info('judge deg kwargs error!')
            return
    regex = MlFiterDegPd.g_regex_d
    w_col = MlFiterDegPd.g_w_col
    pd_class = MlFiterDegPdClass
    return self.do_judge(w_col, regex, pd_class, **kwargs)
def show_golden_process(w_rate, symbol=None, mc_golden=False, bp=24.3, pf_cnt=200,
                        loop_cnt=20000, outter_loop=1):
    if symbol is None:
        kl_pd = SymbolPd.make_kfold_pd('usNOAH')[-82:-40]
    else:
        kl_pd = SymbolPd.make_kfold_pd(symbol)
    golden = TLineGolden.calc_mc_golden(kl_pd, g_mc_percent, g_mc_loss_cnt) \
        if mc_golden else TLineGolden.calc_golden(kl_pd)
    while outter_loop > 0:
        outter_loop -= 1
        profits = []
        for _ in np.arange(loop_cnt):
            wl = init_golden_w_full(golden, bp) if mc_golden else init_golden_wl(golden, bp)
            if wl is None:
                ZLog.info('init_golden_wl out of bp range!')
                return
            sp = 0
            while wl is not None:
                supports = wl['supports']
                resistances = wl['resistances']
                # Bernoulli draw: win -> exit at a resistance, loss -> exit
                # at a support.
                w = np.random.binomial(1, w_rate)
                # sp = resistances[-1] / sp = supports[0]: index -1 or 0 both
                # work; by the time sp is meaningful they are the same level.
                if w:
                    sp = resistances[-1]
                else:
                    sp = supports[0]
                wl = golden_map_wl_grid(w, wl)
            else:
                # -10: default transaction cost (commission).
                pf = pf_cnt * (sp - bp) - 10
                profits.append(pf)
        profits = pd.Series(profits)
        show = (outter_loop == 0)
        NpUtil.calc_regress_ang(profits.cumsum(), show)
        if show:
            profits.hist()
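# The core of the Monte Carlo above, isolated: each trade is a sequence of
# Bernoulli(w_rate) draws walking a support/resistance grid until it exits.
# A minimal single-step version with made-up exit levels (23.0 / 26.0) and
# the same assumed -10 commission, illustrative only:
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
w_rate, bp, pf_cnt = 0.55, 24.3, 200
profits = []
for _ in range(10000):
    # One draw decides the exit: resistance on a win, support on a loss.
    sp = 26.0 if rng.binomial(1, w_rate) else 23.0
    profits.append(pf_cnt * (sp - bp) - 10)
print(pd.Series(profits).describe())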
def fit(self):
    weights = {
        'h1': tf.Variable(tf.random_normal([self.n_input, self.n_hidden_1])),
        'h2': tf.Variable(tf.random_normal([self.n_hidden_1, self.n_hidden_2])),
        'out': tf.Variable(tf.random_normal([self.n_hidden_2, self.n_classes]))
    }
    biases = {
        'b1': tf.Variable(tf.random_normal([self.n_hidden_1])),
        'b2': tf.Variable(tf.random_normal([self.n_hidden_2])),
        'out': tf.Variable(tf.random_normal([self.n_classes]))
    }
    mul_predict = self.multilayer_perceptron(self.x, weights, biases)
    # Softmax loss
    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(mul_predict, self.y))
    # Adam Optimizer
    optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(cost)
    self.init = tf.initialize_all_variables()
    with tf.Session() as sess:
        sess.run(self.init)
        # Training cycle
        total_batch = int(len(self.x_train) / self.batch_size) + 1
        tf.train.SummaryWriter(K_LOG_FILE, graph=sess.graph)
        for epoch in range(self.training_epochs):
            avg_cost = 0.
            # Reshuffle the training set each epoch.
            perm = np.arange(len(self.x_train))
            np.random.shuffle(perm)
            self.x_train = self.x_train[perm]
            self.y_train = self.y_train[perm]
            for i in range(total_batch):
                batch_xs = self.x_train[i * self.batch_size: (i + 1) * self.batch_size]
                batch_ys = self.y_train[i * self.batch_size: (i + 1) * self.batch_size]
                # Fit training using batch data
                sess.run(optimizer, feed_dict={self.x: batch_xs, self.y: batch_ys})
                # Compute average loss
                avg_cost += sess.run(cost, feed_dict={self.x: batch_xs, self.y: batch_ys}) / total_batch
            if epoch % self.display_step == 0:
                print("Epoch:", '%04d' % (epoch + 1), "cost=", "{:.9f}".format(avg_cost))
        ZLog.info("Optimization Finished!")
        self.pred = tf.argmax(mul_predict, 1)
        if self.x_test is not None:
            correct_prediction = tf.equal(tf.argmax(mul_predict, 1), tf.argmax(self.y, 1))
            accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
            ac = accuracy.eval({self.x: self.x_test, self.y: self.y_test})
            ZLog.info("Accuracy:" + str(ac))
            return ac
def _do_collect_work(self):
    with ThreadPoolExecutor(max_workers=len(self.back_proxys) * 3) as executor:
        # A thread pool is used because proxy quality is so poor that the
        # number of threads has to be kept under control.
        thread_lock = threading.RLock()
        all_same_cnt = 0
        while True:
            soup = BeautifulSoup(self.driver.page_source, "lxml")
            img_objs = soup.select('#imgid > div > ul > li[data-objurl]')
            sub_same_cnt = 0
            for img in img_objs:
                url = img['data-objurl']
                url_thumb = img['data-thumburl']
                if self.requested_url.count(url) > 0:
                    sub_same_cnt += 1
                    continue
                url_dict = {'url': url, 'url_thumb': url_thumb}
                if g_enable_debug:
                    self.down_load_img(url_dict, thread_lock)
                else:
                    executor.submit(self.down_load_img, url_dict, thread_lock)
                # Append here; otherwise the worker would need extra locking.
                self.requested_url.append(url)
            js = "window.scrollTo({}, {})".format(
                self.current_pos, self.current_pos + K_SCROLL_MOVE_DISTANCE)
            self.current_pos += K_SCROLL_MOVE_DISTANCE
            self.driver.execute_script(js)
            time.sleep(K_SCROLL_SLEEP_TIME)
            # If every url on the page was already in requested_url, count one
            # full match; otherwise reset the counter.
            if sub_same_cnt == len(img_objs):
                all_same_cnt += 1
            else:
                all_same_cnt = 0
            # Enough consecutive full matches means the page bottom is reached.
            if all_same_cnt > 30:
                break
            if self.collect_cnt >= K_COLLECT_CNT:
                ZLog.info('collect_cnt > K_COLLECT_CNT task end')
                break
def choose_cprs_component(self, llps):
    """
    :param llps: cprs[(so.cprs['lps'] < 0) & (so.cprs['lms'] < -0.0)]
                 the cprs rows that match your filter conditions
    :return:
    """
    if not hasattr(self, 'cprs'):
        raise ValueError('gmm_component_filter not exe!!!!')
    nts_pd = pd.DataFrame()
    for nk in llps.index:
        nts_pd = nts_pd.append(self.nts[nk])
    nts_pd = nts_pd.drop_duplicates(subset='ind', keep='last')
    ZLog.info('nts_pd.shape = {0}'.format(nts_pd.shape))
    # float() guards against integer division under Python 2.
    loss_rate = nts_pd.result.value_counts()[0] / float(nts_pd.result.value_counts().sum())
    win_rate = nts_pd.result.value_counts()[1] / float(nts_pd.result.value_counts().sum())
    ZLog.info('nts_pd loss rate = {0}'.format(loss_rate))
    improved = (nts_pd.shape[0] / float(self.fiter.order_has_ret.shape[0])) * (loss_rate - win_rate)
    ZLog.info('improved rate = {0}'.format(improved))
    xt = self.fiter.order_has_ret.result.value_counts()
    ZLog.info('predict win rate = ' + str(xt[1] / float(xt.sum()) + improved))
    nts_pd.sort_index()['profit'].cumsum().plot()
    plt.show()
def calc_similar(symbol, cmp_symbol, sc=slice(0, 2), show=True):
    """
    sc: slice choosing which similarity dimensions to verify with.
        Defaults to using:
            E_CORE_TASK_CG_PEARS = 0
            E_CORE_TASK_CG_SPERM = 1
        To use only SPERM, pass sc=slice(1, 2).
    Returns the rank-position component of the compared symbol.
    """
    pd_list = get_pdlist(sc)
    sum_rank = get_sum_rank(pd_list, symbol)
    cmp_rank = sum_rank.sort_values(ascending=True).index.tolist().index(cmp_symbol)
    # float() guards against integer division under Python 2.
    rank_score = 1 - cmp_rank / float(sum_rank.shape[0])
    if show:
        ZLog.info(symbol + ' similar rank score ' + cmp_symbol + ' :' + str(rank_score))
        mul_pd = SymbolPd.make_kfold_mulpd([symbol, cmp_symbol])
        klpd_symbol = SymbolPd.get_n_year(mul_pd[symbol], from_year=2)
        klpd_cmp_symbol = SymbolPd.get_n_year(mul_pd[cmp_symbol], from_year=2)
        # Scale both series to the same order of magnitude.
        kl_pd_symbol_nrm, klpd_cmp_symbol_nrm = NpUtil.two_mean_list(
            klpd_symbol.close, klpd_cmp_symbol.close, type_look='look_max')
        kl_pd_symbol_nrm.plot()
        klpd_cmp_symbol_nrm.plot()
        plt.legend([symbol, cmp_symbol])
        plt.title('similar draw')
        plt.show()
        distance = (kl_pd_symbol_nrm - klpd_cmp_symbol_nrm)
        distance_mean = distance.mean()
        distance_std = distance.std()
        above = distance_mean + distance_std
        below = distance_mean - distance_std
        distance.plot()
        plt.axhline(distance_mean, color='r', linestyle='--')
        plt.axhline(above, color='c')
        plt.axhline(below, color='g')
        plt.title('similar distance')
        plt.legend(['distance', 'distance_mean', 'distance above', 'distance below'],
                   bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
        plt.show()
    return rank_score
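# NpUtil.two_mean_list is project code; as a rough stand-in, 'look_max'
# presumably rescales the smaller-mean series so both live on the larger
# series' scale before the distance is taken. A hypothetical sketch of that
# idea (an assumption, not the project implementation):
def scale_to_larger_mean(a, b):
    # Multiply the smaller-mean series so both series share the larger mean.
    if a.mean() < b.mean():
        return a * (b.mean() / a.mean()), b
    return a, b * (a.mean() / b.mean())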
def feature_selection(self, **kwargs):
    x, y = kwargs['x'], kwargs['y']
    fiter = self.get_fiter()
    selector = RFE(fiter)
    selector.fit(x, y)
    ZLog.info('RFE selection')
    ZLog.info(pd.DataFrame({'support': selector.support_,
                            'ranking': selector.ranking_},
                           index=self.df.columns[1:]))
    selector = RFECV(fiter, cv=3, scoring='mean_squared_error')
    selector.fit(x, y)
    ZLog.newline()
    ZLog.info('RFECV selection')
    ZLog.info(pd.DataFrame({'support': selector.support_,
                            'ranking': selector.ranking_},
                           index=self.df.columns[1:]))
def do_thread_work(self, proxy, checked_list, thread_lock):
    if proxy['type'] == 'HTTP':
        proxy_dict = dict(http='http://{}'.format(proxy['proxy']),
                          https='http://{}'.format(proxy['proxy']))
    else:
        proxy_dict = dict(http='socks5://{}'.format(proxy['proxy']),
                          https='socks5://{}'.format(proxy['proxy']))
    try:
        # r = requests.post("https://www.baidu.com/", headers=self.headers, proxies=proxy_dict, timeout=15,
        #                   verify=False)
        img_url = 'http://picm.bbzhi.com/dongwubizhi/labuladuoxunhuiquanbizhi/animal_' \
                  'labrador_retriever_1600x1200_44243_m.jpg'
        enable_stream = False
        if enable_stream:
            response = requests.get(img_url, headers=self.headers, proxies=proxy_dict,
                                    timeout=15, stream=True)
            if response.status_code == 200:
                test_name = '../gen/check_proxy.jpg'
                with open(test_name, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=1024):
                        if chunk:
                            f.write(chunk)
                            f.flush()
                check_img = PIL.Image.open(test_name)
                check_img.close()
        else:
            response = requests.get(img_url, headers=self.headers, proxies=proxy_dict,
                                    timeout=(10, 20))
            if response.status_code == 200:
                test_name = '../gen/check_proxy.jpg'
                with open(test_name, 'wb') as f:
                    f.write(response.content)
                    f.flush()
                check_img = PIL.Image.open(test_name)
                check_img.close()
    except Exception as e:
        # ZLog.exception(e)
        return
    with thread_lock:
        ZLog.info('{} check ok'.format(proxy['proxy']))
        checked_list.append(proxy)
def judge(self, **kwargs):
    for w in MlFiterGoldenPd.g_w_col:
        if w not in kwargs:
            ZLog.info('judge golden kwargs error!')
            return
    regex = MlFiterGoldenPd.g_regex_d
    # Keep the same column order as the pd built for metrics.
    w_col = MlFiterGoldenPd.g_w_col
    pd_class = MlFiterGoldenPdClass
    return self.do_judge(w_col, regex, pd_class, **kwargs)
def scores(self, y_pre, y=None):
    ZLog.info('scores(self, y_pre, y=None)')
    _, y = self.proxy_xy(None, y)
    ZLog.info("accuracy = %.2f" % (accuracy_score(y, y_pre)))
    ZLog.info("precision_score = %.2f" % (metrics.precision_score(y, y_pre)))
    ZLog.info("recall_score = %.2f" % (metrics.recall_score(y, y_pre)))
    self._confusion_matrix_with_report(y, y_pre)
def _confusion_matrix_with_report(self, test_y, predictions):
    confusion_matrix = metrics.confusion_matrix(test_y, predictions)
    print("Confusion Matrix ", confusion_matrix)
    print("          Predicted")
    print("         |  0  |  1  |")
    print("         |-----|-----|")
    print("       0 | %3d | %3d |" % (confusion_matrix[0, 0], confusion_matrix[0, 1]))
    print("Actual   |-----|-----|")
    print("       1 | %3d | %3d |" % (confusion_matrix[1, 0], confusion_matrix[1, 1]))
    print("         |-----|-----|")
    # In general, loss/win is summarized well by the report below.
    ZLog.info(classification_report(test_y, predictions))
def importances_coef_pd(self, **kwargs):
    if not hasattr(self, 'df'):
        raise ValueError('please make a df func first!')
    x, y = kwargs['x'], kwargs['y']
    fiter = self.get_fiter()
    fiter.fit(x, y)
    self.echo_info(fiter)
    if hasattr(fiter, 'feature_importances_'):
        return pd.DataFrame(
            {'feature': list(self.df.columns)[1:],
             'importance': fiter.feature_importances_}).sort_values('importance')
    elif hasattr(fiter, 'coef_'):
        return pd.DataFrame({"columns": list(self.df.columns)[1:],
                             "coef": list(fiter.coef_.T)})
    else:
        ZLog.info('fiter not hasattr feature_importances_ or coef_!')
def do_judge(self, w_col, regex_dummies, pd_class, **kwargs):
    if not hasattr(self, 'estimator') or not hasattr(self, 'prob_threshold') \
            or not hasattr(self, 'dummies') \
            or not hasattr(self, 'invoke_hmm') or not hasattr(self, 'invoke_pca'):
        # Only log for now; raise an exception here if it becomes necessary.
        ZLog.info('not estimator or prob or dhp')
        return True
    w = np.array([kwargs[col] for col in w_col])
    w = w.reshape(1, -1)
    prob_threshold = self.prob_threshold
    estimator = self.estimator
    dummies = self.dummies
    invoke_hmm = self.invoke_hmm
    invoke_pca = self.invoke_pca
    df = None
    if dummies or invoke_hmm:
        df = pd.DataFrame(w)
        df.columns = w_col
    if dummies and df is not None:
        df_dummies = pd_class.dummies_xy(df)
        regex = regex_dummies
        df = df_dummies.filter(regex=regex)
        w = df.as_matrix()
    if invoke_hmm:
        # Only swap in the hmm form of the x values; df itself is not
        # modified here, and does not need to be for now.
        w = pd_class.hmm_predict(self, w).reshape(1, -1)
    elif invoke_pca:
        # elif: hmm and pca are mutually exclusive.
        w = pd_class.pca_predict(self, w).reshape(1, -1)
    prob = estimator.predict_proba(w)[:, 1][0]
    return prob > prob_threshold
def plot_confusion_matrices(estimator, x, y, n_folds=10):
    y_pred = run_cv_estimator(estimator, x, y, n_folds=n_folds)
    class_names = np.unique(y).tolist()
    confusion_matrix = metrics.confusion_matrix(y, y_pred)
    ZLog.info(confusion_matrix)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    cax = ax.matshow(confusion_matrix)
    plt.title('Confusion matrix for %s' % estimator.__class__.__name__)
    fig.colorbar(cax)
    ax.set_xticklabels([''] + class_names)
    ax.set_yticklabels([''] + class_names)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.show()
def check_proxy(self):
    checked_list = list()
    thread_lock = threading.RLock()
    thread_array = []
    for proxy in self.proxy_list:
        # self.do_thread_work(proxy, checked_list, thread_lock)
        t = threading.Thread(target=self.do_thread_work,
                             args=(proxy, checked_list, thread_lock,))
        t.setDaemon(True)
        t.start()
        thread_array.append(t)
    for t in thread_array:
        t.join()
    self.proxy_list = checked_list
    ZLog.info('proxy_list len={}'.format(len(self.proxy_list)))
def do_tf_tt(cls, x, y, n_folds=10, **kwargs):
    """
    If parameters beyond the four init ones are needed, subclasses should
    extend this themselves.
    :param x:
    :param y:
    :param n_folds:
    :return:
    """
    kf = KFold(len(y), n_folds=n_folds, shuffle=True)
    acs = list()
    for i, (train_index, test_index) in enumerate(kf):
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        m_l__t_f = cls(x_train, y_train, x_test, y_test, **kwargs)
        ac = m_l__t_f.fit()
        if ac is not None:
            acs.append(ac)
    if len(acs) > 0:
        ZLog.info('acs mean = {}'.format(np.array(acs).mean()))
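# KFold(len(y), n_folds=...) above is the pre-0.18 sklearn signature. The
# equivalent loop with the current sklearn.model_selection API, for
# reference (synthetic shapes, illustrative only):
import numpy as np
from sklearn.model_selection import KFold

x = np.arange(20).reshape(10, 2)
y = np.arange(10)
for train_index, test_index in KFold(n_splits=5, shuffle=True, random_state=0).split(x):
    x_train, x_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]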
def judge(self, **kwargs):
    for w in MlFiterJumpPd.g_w_col:
        if w not in kwargs:
            ZLog.info('judge jump kwargs error!')
            return
    regex = MlFiterJumpPd.g_regex_d
    # Keep the same column order as the pd built for metrics.
    w_col = MlFiterJumpPd.g_w_col
    pd_class = MlFiterJumpPdClass
    if not hasattr(self, 'dd_threshold'):
        ZLog.info('not dd_threshold')
        return True
    return self.do_judge(w_col, regex, pd_class, **kwargs)
def graphviz_tree(estimator, features, x, y):
    if not hasattr(estimator, 'tree_'):
        ZLog.info('only tree can graphviz!')
        return
    estimator.fit(x, y)
    tree.export_graphviz(estimator.tree_, out_file='graphviz.dot', feature_names=features)
    os.system("dot -T png graphviz.dot -o graphviz.png")
    # `!open $path` in a notebook actually displays better when convenient;
    # the plt.show figure size is hard to adjust.
    # path = ZEnv.shell_cmd_result('pwd') + '/graphviz.png'
    # !open $path
    image_file = cbook.get_sample_data(ZEnv.shell_cmd_result('pwd') + '/graphviz.png')
    image = plt.imread(image_file)
    plt.imshow(image)
    plt.axis('off')  # clear x- and y-axes
    plt.show()
def show_general(self, use_fiter=False):
    # != replaces the long-deprecated Python 2 <> operator.
    order_has_ret_fit = self.fiter.order_has_ret if use_fiter \
        else self.orders_pd[self.orders_pd['result'] != 0]
    ZLog.info('all fit order = ' + str(order_has_ret_fit.shape))
    xt = order_has_ret_fit.result.value_counts()
    ZLog.info('win rate = ' + str(xt[1] / xt.sum()))
    ZLog.info('profit_cg.sum() = ' + str(order_has_ret_fit.profit_cg.sum()))
    order_has_ret_fit.sort_values('buy Date')['profit_cg'].cumsum().plot(
        grid=True, title='profit_cg cumsum')
    profit_cg_win_mean = order_has_ret_fit[order_has_ret_fit['profit_cg'] > 0].profit_cg.mean()
    profit_cg_loss_mean = order_has_ret_fit[order_has_ret_fit['profit_cg'] < 0].profit_cg.mean()
    ZLog.info('win mean = {0} loss_mean = {1} '.format(profit_cg_win_mean, profit_cg_loss_mean))
    plt.show()
def train_test_split_df(self, df=None, test_size=0.1, random_state=0):
    if df is None:
        df = self.df
    train_df, cv_df = train_test_split(df, test_size=test_size, random_state=random_state)
    fiter = self.get_fiter()
    # Column 0 is the label y; the remaining columns are features.
    fiter.fit(train_df.as_matrix()[:, 1:], train_df.as_matrix()[:, 0])
    predictions = fiter.predict(cv_df.as_matrix()[:, 1:])
    ZLog.info("accuracy = %.2f" % (accuracy_score(cv_df.as_matrix()[:, 0], predictions)))
    ZLog.info("precision_score = %.2f" % (metrics.precision_score(cv_df.as_matrix()[:, 0], predictions)))
    ZLog.info("recall_score = %.2f" % (metrics.recall_score(cv_df.as_matrix()[:, 0], predictions)))
    self._confusion_matrix_with_report(cv_df.as_matrix()[:, 0], predictions)
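# The df layout this method assumes: column 0 is the label y and the rest
# are features; .as_matrix() is the old pandas spelling of .values. A tiny
# synthetic illustration of that slicing (names are illustrative only):
import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.DataFrame({'y': [0, 1, 0, 1], 'f1': [1.0, 2.0, 3.0, 4.0],
                   'f2': [4.0, 3.0, 2.0, 1.0]})
train_df, cv_df = train_test_split(df, test_size=0.25, random_state=0)
x_train, y_train = train_df.values[:, 1:], train_df.values[:, 0]
x_cv, y_cv = cv_df.values[:, 1:], cv_df.values[:, 0]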
def verify_process(est_cls, judge_cls, make_x_func, make_order_func, order_pd,
                   only_jd=False, first_local=False, tn_threshold=800):
    """
    :param est_cls:
    :param judge_cls:
    :param make_x_func:
    :param make_order_func:
    :param order_pd:
    :param only_jd: judge with the already-serialized estimator only
    :param first_local: prefer the local classifier
    :param tn_threshold:
    :return:
    """
    if not only_jd:
        _, _, _, _, _, _ = est_cls.dump_process(judge_cls, order_pd, tn_threshold,
                                                True, first_local=first_local)

    def apply_judge(order, p_make_x_func):
        x = p_make_x_func(order)
        # Discrete, without the hidden factor.
        d_ret = est_cls.do_predict_process(judge_cls, True, False, False, **x)
        # Continuous, without the hidden factor.
        v_ret = est_cls.do_predict_process(judge_cls, False, False, False, **x)
        # Discrete, with the hidden factor.
        dm_ret = est_cls.do_predict_process(judge_cls, True, True, False, **x)
        # Continuous, with the hidden factor.
        vm_ret = est_cls.do_predict_process(judge_cls, False, True, False, **x)
        # Discrete, with PCA.
        dp_ret = est_cls.do_predict_process(judge_cls, True, False, True, **x)
        # Continuous, with PCA.
        vp_ret = est_cls.do_predict_process(judge_cls, False, False, True, **x)
        return d_ret, v_ret, dm_ret, vm_ret, dp_ret, vp_ret

    order_has_ret = make_order_func(order_pd)
    jd_ret = order_pd.apply(apply_judge, axis=1, args=(make_x_func,))
    order_has_ret['d_ret'] = [1 if ret[0] else 0 for ret in jd_ret]
    order_has_ret['v_ret'] = [1 if ret[1] else 0 for ret in jd_ret]
    order_has_ret['dm_ret'] = [1 if ret[2] else 0 for ret in jd_ret]
    order_has_ret['vm_ret'] = [1 if ret[3] else 0 for ret in jd_ret]
    order_has_ret['dp_ret'] = [1 if ret[4] else 0 for ret in jd_ret]
    order_has_ret['vp_ret'] = [1 if ret[5] else 0 for ret in jd_ret]

    # Accuracy restricted to the orders each judge predicted as losses (== 0).
    v_ret_result = metrics.accuracy_score(order_has_ret[order_has_ret['v_ret'] == 0]['result'],
                                          order_has_ret[order_has_ret['v_ret'] == 0]['v_ret'])
    ZLog.info('v_ret_result: ' + str(v_ret_result))
    d_ret_result = metrics.accuracy_score(order_has_ret[order_has_ret['d_ret'] == 0]['result'],
                                          order_has_ret[order_has_ret['d_ret'] == 0]['d_ret'])
    ZLog.info('d_ret_result: ' + str(d_ret_result))
    dp_ret_result = metrics.accuracy_score(order_has_ret[order_has_ret['dp_ret'] == 0]['result'],
                                           order_has_ret[order_has_ret['dp_ret'] == 0]['dp_ret'])
    ZLog.info('dp_ret_result: ' + str(dp_ret_result))
    vp_ret_result = metrics.accuracy_score(order_has_ret[order_has_ret['vp_ret'] == 0]['result'],
                                           order_has_ret[order_has_ret['vp_ret'] == 0]['vp_ret'])
    ZLog.info('vp_ret_result: ' + str(vp_ret_result))
    dm_ret_result = metrics.accuracy_score(order_has_ret[order_has_ret['dm_ret'] == 0]['result'],
                                           order_has_ret[order_has_ret['dm_ret'] == 0]['dm_ret'])
    ZLog.info('dm_ret_result: ' + str(dm_ret_result))
    vm_ret_result = metrics.accuracy_score(order_has_ret[order_has_ret['vm_ret'] == 0]['result'],
                                           order_has_ret[order_has_ret['vm_ret'] == 0]['vm_ret'])
    ZLog.info('vm_ret_result: ' + str(vm_ret_result))
    ZLog.newline(fill_cnt=58)

    # Accuracy over all orders, per judge.
    v_ret_result_all = metrics.accuracy_score(order_has_ret['result'], order_has_ret['v_ret'])
    ZLog.info('v_ret_result_all: ' + str(v_ret_result_all))
    d_ret_result_all = metrics.accuracy_score(order_has_ret['result'], order_has_ret['d_ret'])
    ZLog.info('d_ret_result_all: ' + str(d_ret_result_all))
    dp_ret_result_all = metrics.accuracy_score(order_has_ret['result'], order_has_ret['dp_ret'])
    ZLog.info('dp_ret_result_all: ' + str(dp_ret_result_all))
    vp_ret_result_all = metrics.accuracy_score(order_has_ret['result'], order_has_ret['vp_ret'])
    ZLog.info('vp_ret_result_all: ' + str(vp_ret_result_all))
    dm_ret_result_all = metrics.accuracy_score(order_has_ret['result'], order_has_ret['dm_ret'])
    ZLog.info('dm_ret_result_all: ' + str(dm_ret_result_all))
    vm_ret_result_all = metrics.accuracy_score(order_has_ret['result'], order_has_ret['vm_ret'])
    ZLog.info('vm_ret_result_all: ' + str(vm_ret_result_all))
    ZLog.newline(fill_cnt=58)

    # Vote count across four of the judges.
    order_has_ret['vdmret'] = order_has_ret['d_ret'] + order_has_ret['v_ret'] \
        + order_has_ret['dp_ret'] + order_has_ret['vp_ret']
    order_has_ret['vdmret'].value_counts().plot(kind='barh')
    plt.title('vdmret barh')
    plt.show()
    ((order_has_ret['vdmret'] == 1) & (order_has_ret['v_ret'] == 1)).value_counts().plot(kind='bar')
    plt.title('v_ret == 1')
    plt.show()
    ((order_has_ret['vdmret'] == 1) & (order_has_ret['d_ret'] == 1)).value_counts().plot(kind='bar')
    plt.title('d_ret == 1')
    plt.show()
    ((order_has_ret['vdmret'] == 1) & (order_has_ret['vm_ret'] == 1)).value_counts().plot(kind='bar')
    plt.title('vm_ret == 1')
    plt.show()
    ((order_has_ret['vdmret'] == 1) & (order_has_ret['dm_ret'] == 1)).value_counts().plot(kind='bar')
    plt.title('dm_ret == 1')
    plt.show()
    ((order_has_ret['vdmret'] == 1) & (order_has_ret['dp_ret'] == 1)).value_counts().plot(kind='bar')
    plt.title('dp_ret == 1')
    plt.show()
    ((order_has_ret['vdmret'] == 1) & (order_has_ret['vp_ret'] == 1)).value_counts().plot(kind='bar')
    plt.title('vp_ret == 1')
    plt.show()
    final_result = metrics.accuracy_score(order_has_ret[order_has_ret['vdmret'] == 0]['result'],
                                          order_has_ret[order_has_ret['vdmret'] == 0]['vdmret'])
    ZLog.info('final_result: ' + str(final_result))
    order_has_ret['vdmret_one'] = np.where(order_has_ret['vdmret'] == 1, 0, 1)
    final_one_result = metrics.accuracy_score(order_has_ret[order_has_ret['vdmret_one'] == 0]['result'],
                                              order_has_ret[order_has_ret['vdmret_one'] == 0]['vdmret_one'])
    ZLog.info('final_one_result: ' + str(final_one_result))
    order_has_ret['vdmret_two'] = np.where(order_has_ret['vdmret'] == 2, 0, 1)
    final_two_result = metrics.accuracy_score(order_has_ret[order_has_ret['vdmret_two'] == 0]['result'],
                                              order_has_ret[order_has_ret['vdmret_two'] == 0]['vdmret_two'])
    ZLog.info('final_two_result: ' + str(final_two_result))
    order_has_ret['vdmret_three'] = np.where(order_has_ret['vdmret'] == 3, 0, 1)
    final_three_result = metrics.accuracy_score(order_has_ret[order_has_ret['vdmret_three'] == 0]['result'],
                                                order_has_ret[order_has_ret['vdmret_three'] == 0]['vdmret_three'])
    ZLog.info('final_three_result: ' + str(final_three_result))
    order_has_ret['vdmret_four'] = np.where(order_has_ret['vdmret'] == 4, 0, 1)
    final_four_result = metrics.accuracy_score(order_has_ret[order_has_ret['vdmret_four'] == 0]['result'],
                                               order_has_ret[order_has_ret['vdmret_four'] == 0]['vdmret_four'])
    ZLog.info('final_four_result: ' + str(final_four_result))
    return jd_ret, order_has_ret
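# Note on the *_ret_result metrics above: accuracy_score restricted to rows
# where a judge predicted 0 reduces to the precision of the 0 (loss) class —
# the share of predicted losses that really were losses. A tiny check:
import numpy as np
from sklearn import metrics

result = np.array([0, 0, 1, 1, 0])
pred = np.array([0, 1, 1, 0, 0])
mask = pred == 0
print(metrics.accuracy_score(result[mask], pred[mask]))  # 2/3: two real losses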
def echo_info(self, fiter=None):
    if fiter is None:
        fiter = self.get_fiter()
    ZLog.info(format(fiter.__class__.__name__, '*^58s'))
def train_test_split_xy(self, x=None, y=None, test_size=0.1, random_state=0):
    x, y = self.proxy_xy(x, y)
    train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=test_size,
                                                        random_state=random_state)
    # Format the shapes into single strings so each call logs all of them,
    # consistent with the single-message ZLog.info calls elsewhere.
    ZLog.info('{0} {1}'.format(x.shape, y.shape))
    ZLog.info('{0} {1}'.format(train_x.shape, train_y.shape))
    ZLog.info('{0} {1}'.format(test_x.shape, test_y.shape))
    fiter = self.get_fiter()
    clf = fiter.fit(train_x, train_y)
    predictions = clf.predict(test_x)
    ZLog.info("accuracy = %.2f" % (accuracy_score(test_y, predictions)))
    ZLog.info("precision_score = %.2f" % (metrics.precision_score(test_y, predictions)))
    ZLog.info("recall_score = %.2f" % (metrics.recall_score(test_y, predictions)))
    self._confusion_matrix_with_report(test_y, predictions)
def plot_roc_estimator(self, x=None, y=None):
    x, y = self.proxy_xy(x, y)
    fiter = self.get_fiter()
    ZLog.info(fiter.__class__.__name__ + ' :roc')
    MlFiterExcute.plot_roc_estimator(fiter, x, y)