class RIndexIndustryInfo:
    """Store and query daily industry-index quotes in quarterly MySQL tables.

    Redis doubles as a small registry: the set stored under the database name
    holds the tables that were created, and the set stored under each table
    name holds the dates already written to that table.
    """

    def __init__(self, dbinfo=ct.DB_INFO, redis_host=None):
        """Create the redis/MySQL clients and make sure the database exists.

        Args:
            dbinfo: connection settings handed to ``CMySQL``.
            redis_host: optional redis host; the helper's default is used
                when it is None.

        Raises:
            Exception: if the database cannot be created.
        """
        if redis_host is None:
            self.redis = create_redis_obj()
        else:
            self.redis = create_redis_obj(host=redis_host)
        self.dbname = self.get_dbname()
        self.logger = getLogger(__name__)
        self.mysql_client = CMySQL(dbinfo, self.dbname, iredis=self.redis)
        if not self.mysql_client.create_db(self.dbname):
            raise Exception("init rindex stock database failed")

    @staticmethod
    def get_dbname():
        """Return the name of the database used by this class."""
        return ct.RINDEX_INDUSTRY_INFO_DB

    def get_table_name(self, cdate):
        """Return the quarterly table name for a ``YYYY-MM-DD`` date string."""
        year, month = cdate.split('-')[:2]
        quarter = (int(month) - 1) // 3 + 1
        return "rindustry_day_%s_%s" % (year, quarter)

    def is_date_exists(self, table_name, cdate):
        """Return True if ``cdate`` is registered in redis for ``table_name``."""
        # SISMEMBER answers False for a missing key, so no EXISTS round trip
        # and no need to download and decode the whole set.
        return bool(self.redis.sismember(table_name, cdate))

    def is_table_exists(self, table_name):
        """Return True if ``table_name`` is registered in redis for this db."""
        return bool(self.redis.sismember(self.dbname, table_name))

    def create_table(self, table):
        """Create the quote table unless MySQL already has it.

        Returns:
            True if the table already exists, otherwise the result of
            ``mysql_client.create``.
        """
        if table in self.mysql_client.get_all_tables():
            return True
        template = ('create table if not exists %s('
                    'date varchar(10) not null, '
                    'code varchar(10) not null, '
                    'open float, '
                    'high float, '
                    'close float, '
                    'preclose float, '
                    'low float, '
                    'volume bigint, '
                    'amount float, '
                    'preamount float, '
                    'pchange float, '
                    'mchange float, '
                    'PRIMARY KEY (date, code))')
        return self.mysql_client.create(template % table, table)

    def get_k_data_in_range(self, start_date, end_date):
        """Collect the rows of every trading day in the given date range.

        The number of calendar days examined comes from
        ``delta_days(start_date, end_date)``, counted from ``start_date``.

        Returns:
            A DataFrame with the rows of all involved tables; empty when no
            data was found.
        """
        ndays = delta_days(start_date, end_date)
        # A plain comprehension also works for ndays == 0, where the previous
        # np.vectorize call raised on the empty input.
        days = [day.strftime('%Y-%m-%d')
                for day in pd.date_range(start_date, periods=ndays, freq='D')]

        # Group trading days by the quarterly table that stores them.
        dates_by_table = OrderedDict()
        for day in days:
            if CCalendar.is_trading_day(day, redis=self.redis):
                table_name = self.get_table_name(day)
                dates_by_table.setdefault(table_name, []).append(day)

        frames = []
        for table_dates in dates_by_table.values():
            # Days were generated in ascending order, so first/last are the
            # bounds of the range that lives in this table.
            if len(table_dates) == 1:
                df = self.get_k_data(table_dates[0])
            else:
                df = self.get_data_between(table_dates[0], table_dates[-1])
            if df is not None:
                frames.append(df)
        return pd.concat(frames) if frames else pd.DataFrame()

    def get_data_between(self, start_date, end_date):
        """Query one table for all rows between two dates (inclusive).

        Both dates must belong to the same quarterly table; the table is
        derived from ``start_date`` only.
        """
        sql = "select * from %s where date between \"%s\" and \"%s\"" % (
            self.get_table_name(start_date), start_date, end_date)
        return self.mysql_client.get(sql)

    def get_k_data(self, cdate):
        """Query the rows of a single date; ``None`` means today."""
        if cdate is None:
            cdate = datetime.now().strftime('%Y-%m-%d')
        sql = "select * from %s where date=\"%s\"" % (
            self.get_table_name(cdate), cdate)
        return self.mysql_client.get(sql)

    def get_industry_data(self, cdate, code):
        """Return ``(code, data)`` where data is ``CIndex(code).get_k_data(cdate)``."""
        return (code, CIndex(code).get_k_data(cdate))

    def generate_data(self, cdate):
        """Fetch the quotes of every industry index for ``cdate``.

        Codes whose fetch returns None are retried; after more than 10 failed
        rounds the method gives up.

        Returns:
            A DataFrame with a ``code`` column added and a fresh index, or an
            empty DataFrame when giving up or when there is nothing to fetch.
        """
        industry_info = IndustryInfo.get(self.redis)
        pending = industry_info.code.tolist()
        fetch = partial(self.get_industry_data, cdate)
        frames = []
        failed_count = 0
        obj_pool = Pool(500)
        try:
            while pending:
                self.logger.debug("restart failed ip len(%s)" % len(pending))
                still_failed = []
                # Iterate a snapshot: the pending list is rebuilt per round
                # instead of being mutated while the pool is consuming it.
                for code, data in obj_pool.imap_unordered(fetch, list(pending)):
                    if data is None:
                        still_failed.append(code)
                        continue
                    data['code'] = code
                    frames.append(data)
                pending = still_failed
                if pending:
                    failed_count += 1
                    if failed_count > 10:
                        self.logger.info("%s rindustry init failed" % pending)
                        return pd.DataFrame()
                    time.sleep(10)
        finally:
            # Runs on the give-up path too, so the pool is always torn down
            # and the client always points back at this class's database.
            obj_pool.join(timeout=5)
            obj_pool.kill()
            self.mysql_client.changedb(self.get_dbname())
        if not frames:
            return pd.DataFrame()
        all_df = pd.concat(frames)
        if all_df.empty:
            return all_df
        return all_df.reset_index(drop=True)

    def set_data(self, cdate=None):
        """Generate and persist the industry quotes of one trading day.

        Args:
            cdate: ``YYYY-MM-DD`` string; defaults to the current day, resolved
                at call time (a default evaluated at definition time would go
                stale in a long-running process).

        Returns:
            A false value for non-trading days and on any failure, True if the
            date was already stored, otherwise the result of registering the
            date in redis.
        """
        if cdate is None:
            cdate = datetime.now().strftime('%Y-%m-%d')
        if not CCalendar.is_trading_day(cdate, redis=self.redis):
            return False
        table_name = self.get_table_name(cdate)
        if not self.is_table_exists(table_name):
            if not self.create_table(table_name):
                self.logger.error("create rindex table failed")
                return False
            self.redis.sadd(self.dbname, table_name)
        if self.is_date_exists(table_name, cdate):
            self.logger.debug("existed rindex table:%s, date:%s" %
                              (table_name, cdate))
            return True
        df = self.generate_data(cdate)
        if df.empty:
            return False
        # Cache the freshly generated frame in redis for other consumers.
        self.redis.set(ct.TODAY_ALL_INDUSTRY, _pickle.dumps(df, 2))
        if self.mysql_client.set(df, table_name):
            return self.redis.sadd(table_name, cdate)
        return False

    def update(self, end_date=None, num=10):
        """Store the data of every trading day in the ``num`` days up to ``end_date``.

        Args:
            end_date: last day to process (``YYYY-MM-DD``); defaults to today.
            num: how many days to go back from ``end_date`` for the start.

        Returns:
            True if every trading day was stored, False if any of them failed.
        """
        if end_date is None:
            end_date = datetime.now().strftime('%Y-%m-%d')
        start_date = get_day_nday_ago(end_date, num=num, dformat="%Y-%m-%d")
        succeed = True
        for mdate in get_dates_array(start_date, end_date):
            # Skip non-trading days here: set_data returns False for them,
            # which would otherwise be reported as a failure.
            if not CCalendar.is_trading_day(mdate, redis=self.redis):
                continue
            if not self.set_data(mdate):
                self.logger.error("%s rindustry set failed" % mdate)
                succeed = False
        return succeed
class CReivew:
    """Build the daily market review: scores, charts, document and animation."""

    def __init__(self, dbinfo):
        """Set up clients and make sure the emotion table exists.

        Raises:
            Exception: if the emotion table cannot be created.
        """
        self.dbinfo = dbinfo
        self.sdir = '/data/docs/blog/hellobiek.github.io/source/_posts'
        self.doc = CDoc(self.sdir)
        self.stock_objs = dict()
        self.redis = create_redis_obj()
        self.mysql_client = CMySQL(self.dbinfo, iredis=self.redis)
        self.cal_client = ccalendar.CCalendar(without_init=True)
        self.animating = False
        self.emotion_table = ct.EMOTION_TABLE
        if not self.create_emotion():
            raise Exception("create emotion table failed")

    def create_emotion(self):
        """Create the emotion table if MySQL does not have it; return success."""
        if self.emotion_table in self.mysql_client.get_all_tables():
            return True
        sql = 'create table if not exists %s(date varchar(10) not null, score float, PRIMARY KEY (date))' % self.emotion_table
        return bool(self.mysql_client.create(sql, self.emotion_table))

    def get_today_all_stock_data(self, _date):
        """Return the cached all-stock frame filtered to ``_date``, or None."""
        df_byte = self.redis.get(ct.TODAY_ALL_STOCK)
        if df_byte is None:
            return None
        # NOTE(review): unpickling is only safe while this redis key is
        # written exclusively by trusted code - confirm.
        df = _pickle.loads(df_byte)
        return df[df.date == _date]

    def get_industry_data(self, _date):
        """Return the industry-index quotes of ``_date`` sorted by amount.

        Each row carries the industry ``name`` of the code it was fetched for.
        """
        df_info = IndustryInfo.get()
        frames = []
        for code, name in zip(df_info.code, df_info['name']):
            data = CIndex(code).get_k_data(date=_date)
            if data is None:
                logger.warning("no industry data for %s on %s" % (code, _date))
                continue
            # Attach the name before concatenating: assigning the name column
            # by position afterwards mislabels rows as soon as one code
            # yields no row (or more than one).
            frames.append(data.assign(name=name))
        df = pd.concat(frames)
        df = df.sort_values(by='amount', ascending=False)
        return df.reset_index(drop=True)

    def emotion_plot(self, dir_name):
        """Plot the stored emotion scores to ``<dir_name>/emotion.png``."""
        sql = "select * from %s" % self.emotion_table
        df = self.mysql_client.get(sql)
        fig = plt.figure()
        x = df.date.tolist()
        xn = range(len(x))
        y = df.score.tolist()
        plt.plot(xn, y)
        for xi, yi in zip(xn, y):
            plt.plot((xi, ), (yi, ), 'ro')
            plt.text(xi, yi, '%s' % yi)
        plt.scatter(xn, y, label='score', color='k', s=25, marker="o")
        plt.xticks(xn, x)
        plt.xlabel('时间', fontproperties=get_chinese_font())
        plt.ylabel('分数', fontproperties=get_chinese_font())
        plt.title('股市情绪', fontproperties=get_chinese_font())
        fig.autofmt_xdate()
        plt.savefig('%s/emotion.png' % dir_name, dpi=1000)
        # Release the figure; pyplot keeps every open figure alive otherwise.
        plt.close(fig)

    def industry_plot(self, dir_name, industry_info):
        """Plot the ten first industries as a stacked bar to ``industry.png``.

        ``industry_info`` is expected to be ordered already; it is left
        unmodified (the scaled amounts are kept in a local series).
        """
        colors = [
            '#F5DEB3', '#A0522D', '#1E90FF', '#FFE4C4', '#00FFFF', '#DAA520',
            '#3CB371', '#808080', '#ADFF2F', '#4B0082'
        ]
        amounts = industry_info.amount / 10000000000
        total_amount = amounts.sum()
        top_amounts = amounts.iloc[0:10].tolist()
        top_names = industry_info['name'].iloc[0:10].tolist()
        x = date.fromtimestamp(time.time())
        fig = plt.figure()
        base_line = 0
        for color, name, amount in zip(colors, top_names, top_amounts):
            label_name = "%s:%s" % (name, 100 * amount / total_amount)
            plt.bar(x, amount, width=0.1, color=color, bottom=base_line,
                    align='center', label=label_name)
            base_line += amount
        plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%m/%d/%Y'))
        plt.gca().xaxis.set_major_locator(mdates.DayLocator())
        plt.xlabel('x轴', fontproperties=get_chinese_font())
        plt.ylabel('y轴', fontproperties=get_chinese_font())
        plt.title('市值分布', fontproperties=get_chinese_font())
        fig.autofmt_xdate()
        plt.legend(loc='upper right', prop=get_chinese_font())
        plt.savefig('%s/industry.png' % dir_name, dpi=1000)
        plt.close(fig)

    def get_limitup_data(self, date):
        """Return the limit-up/limit-down records of ``date`` from ``CLimit``."""
        return CLimit(self.dbinfo).get_data(date)

    @staticmethod
    def _split_limit_codes(limit_info):
        """Return ``(limit_up_codes, limit_down_codes)`` as two lists."""
        up_mask = (limit_info.pchange > 0) & (limit_info.prange != 0)
        limit_up_list = limit_info[up_mask].code.tolist()
        limit_down_list = limit_info[limit_info.pchange < 0].code.tolist()
        return limit_up_list, limit_down_list

    def gen_market_emotion_score(self, stock_info, limit_info):
        """Compute today's emotion score and store it in the emotion table.

        The score is the mean ``changepercent`` of all stocks, where stocks
        found in the limit-up/limit-down lists count twice.

        Raises:
            ValueError: if ``stock_info`` has no rows.
            Exception: if the score cannot be written to MySQL.
        """
        if len(stock_info) == 0:
            raise ValueError("stock_info is empty, can not compute emotion score")
        limit_up_list, limit_down_list = self._split_limit_codes(limit_info)
        # A set gives O(1) membership tests for the per-stock loop below.
        limit_codes = set(limit_up_list) | set(limit_down_list)
        total = 0
        for code, pchange in zip(stock_info.code, stock_info.changepercent):
            if str(code).zfill(6) in limit_codes:
                total += 2 * pchange
            else:
                total += pchange
        aver = total / len(stock_info)
        data = {
            'date': ["%s" % datetime.now().strftime('%Y-%m-%d')],
            'score': [aver]
        }
        df = pd.DataFrame.from_dict(data)
        if not self.mysql_client.set(df, self.emotion_table):
            raise Exception("set data to emotion failed")

    @staticmethod
    def _count_change_buckets(stock_info, limit_up_list, limit_down_list,
                              thresholds=(9, 7, 5, 3, 1, 0, -1, -3, -5, -7, -9)):
        """Count stocks per ``changepercent`` bucket.

        Returns:
            ``(name_list, num_list)``: limit-up count, the open-ended top
            bucket, one bucket per pair of neighbouring thresholds, the
            open-ended bottom bucket and the limit-down count.
        """
        limit_codes = [str(code).zfill(6)
                       for code in list(limit_up_list) + list(limit_down_list)]
        # Row-wise mask; codes are normalised to six digits the same way
        # gen_market_emotion_score does before comparing.
        codes = stock_info.code.astype(str).str.zfill(6)
        not_limit = ~codes.isin(limit_codes)
        change = stock_info.changepercent

        name_list = ["涨停", ">%s" % thresholds[0]]
        num_list = [len(limit_up_list),
                    int(((change > thresholds[0]) & not_limit).sum())]
        # NOTE(review): bounds are strict on both sides, so a value exactly
        # equal to a threshold is counted in no bucket - confirm intent.
        for upper, lower in zip(thresholds, thresholds[1:]):
            name_list.append("%s-%s" % (lower, upper))
            num_list.append(int(((change > lower) & (change < upper)).sum()))
        name_list.append("<%s" % thresholds[-1])
        num_list.append(int(((change < thresholds[-1]) & not_limit).sum()))
        name_list.append("跌停")
        num_list.append(len(limit_down_list))
        return name_list, num_list

    def static_plot(self, dir_name, stock_info, limit_info):
        """Plot the distribution of price changes to ``<dir_name>/static.png``."""
        colors = ['b', 'r', 'y', 'g', 'm']
        limit_up_list, limit_down_list = self._split_limit_codes(limit_info)
        name_list, num_list = self._count_change_buckets(
            stock_info, limit_up_list, limit_down_list)
        fig = plt.figure()
        for i, num in enumerate(num_list):
            plt.bar(i + 1, num, color=colors[i % len(colors)], width=0.3)
            plt.text(i + 1, 15 + num, num, ha='center',
                     font_properties=get_chinese_font())
        plt.xlabel('x轴', fontproperties=get_chinese_font())
        plt.ylabel('y轴', fontproperties=get_chinese_font())
        plt.title('涨跌分布', fontproperties=get_chinese_font())
        plt.xticks(range(1, len(num_list) + 1), name_list,
                   fontproperties=get_chinese_font())
        fig.autofmt_xdate()
        plt.savefig('%s/static.png' % dir_name, dpi=1000)
        plt.close(fig)

    def is_collecting_time(self):
        """Return True while the local time is strictly between 21:00:00 and 23:59:59."""
        now_time = datetime.now()
        open_time = now_time.replace(hour=21, minute=0, second=0, microsecond=0)
        close_time = now_time.replace(hour=23, minute=59, second=59,
                                      microsecond=0)
        return open_time < now_time < close_time

    def get_index_data(self, _date):
        """Return the ``day`` rows of ``_date`` for every index in ``ct.TDX_INDEX_DICT``."""
        frames = []
        try:
            for code, name in ct.TDX_INDEX_DICT.items():
                self.mysql_client.changedb(CIndex.get_dbname(code))
                data = self.mysql_client.get(
                    "select * from day where date=\"%s\";" % _date)
                data['name'] = name
                frames.append(data)
        finally:
            # Always switch back, even when a query above raised.
            self.mysql_client.changedb()
        return pd.concat(frames) if frames else pd.DataFrame()

    def update(self):
        """Generate today's review unless its directory already exists.

        Every error is logged and swallowed, so the method never raises.
        """
        _date = datetime.now().strftime('%Y-%m-%d')
        dir_name = os.path.join(self.sdir, "%s-StockReView" % _date)
        try:
            if os.path.exists(dir_name):
                return
            logger.info("create daily info")
            # stock analysis
            stock_info = self.get_today_all_stock_data(_date)
            if stock_info is None:
                logger.error("no stock data cached for %s" % _date)
                return
            # keep only the stocks that actually traded
            stock_info = stock_info[stock_info.volume > 0]
            stock_info = stock_info.reset_index(drop=True)
            os.makedirs(dir_name, exist_ok=True)
            # industry analysis
            industry_info = self.get_industry_data(_date)
            # index and total analysis
            index_info = self.get_index_data(_date)
            index_info = index_info.reset_index(drop=True)
            # limit up and down analysis
            limit_info = self.get_limitup_data(_date)
            # emotion analysis
            self.gen_market_emotion_score(stock_info, limit_info)
            self.emotion_plot(dir_name)
            # static analysis
            self.static_plot(dir_name, stock_info, limit_info)
            # gen review file
            self.doc.generate(stock_info, industry_info, index_info)
            # gen review animation
            self.gen_animation()
        except Exception as e:
            logger.error(e)

    def gen_animation(self, sfile=None):
        """Render today's intraday price changes to an mp4 file.

        Args:
            sfile: output path; defaults to
                ``/data/animation/<today>_animation.mp4``.

        Returns:
            None in every case; nothing is written when there is no data.
        """
        style.use('fivethirtyeight')
        Writer = animation.writers['ffmpeg']
        writer = Writer(fps=1, metadata=dict(artist='biek'), bitrate=1800)
        fig = plt.figure()
        try:
            ax = fig.add_subplot(1, 1, 1)
            _today = datetime.now().strftime('%Y-%m-%d')
            cdata = self.mysql_client.get(
                'select * from %s where date = "%s"' % (ct.ANIMATION_INFO,
                                                        _today))
            if cdata is None:
                return None
            cdata = cdata.reset_index(drop=True)
            name_list = cdata.name.unique()
            ctime_list = [
                datetime.strptime(ctime, '%H:%M:%S')
                for ctime in cdata.time.unique()
            ]
            frame_num = len(ctime_list)
            if 0 == frame_num:
                return None

            def animate(i):
                ax.clear()
                ax.xaxis.set_major_formatter(mdates.DateFormatter('%H:%M:%S'))
                ax.xaxis.set_major_locator(mdates.DayLocator())
                ax.set_title('盯盘', fontproperties=get_chinese_font())
                ax.set_xlabel('时间', fontproperties=get_chinese_font())
                ax.set_ylabel('增长', fontproperties=get_chinese_font())
                ax.set_ylim((-6, 6))
                fig.autofmt_xdate()
                for name in name_list:
                    price_list = cdata[cdata.name == name]['price'].tolist()
                    pchange_list = [0]
                    for _index in range(1, len(price_list)):
                        pchange_list.append(
                            10 * (price_list[_index] - price_list[_index - 1])
                            / price_list[0])
                    # A name may have fewer samples than there are frames;
                    # clamp so x and y always have the same length.
                    shown = min(i, len(pchange_list))
                    ax.plot(ctime_list[0:shown], pchange_list[0:shown],
                            label=name, linewidth=1.5)
                    # Frame 0 shows nothing; without this guard index -1
                    # would label the very last sample instead.
                    if shown > 0 and abs(pchange_list[shown - 1]) > 1:
                        ax.text(ctime_list[shown - 1], pchange_list[shown - 1],
                                name, font_properties=get_chinese_font())

            ani = animation.FuncAnimation(fig, animate, frame_num,
                                          interval=60000, repeat=False)
            if sfile is None:
                sfile = '/data/animation/%s_animation.mp4' % _today
            ani.save(sfile, writer)
        finally:
            # Close on every path, including the early returns above.
            plt.close(fig)

    def get_range_data(self, start_date, end_date, code):
        """Return ``(code, rows)`` of the stock's ``day`` table in the date range."""
        sql = "select * from day where date between \"%s\" and \"%s\"" % (
            start_date, end_date)
        # NOTE(review): the client is shared, so concurrent callers switching
        # databases may interleave between changedb and get - confirm that
        # CMySQL serialises these calls.
        self.mysql_client.changedb(CStock.get_dbname(code))
        return (code, self.mysql_client.get(sql))

    def gen_stocks_trends(self, start_date, end_date, stock_info, max_length):
        """Collect the price history of every stock that has ``max_length`` rows.

        Stocks whose query returns None are retried until they succeed.
        The ``start_date`` row is only used to derive ``preclose`` and is
        dropped from the result.
        """
        fetch = partial(self.get_range_data, start_date, end_date)
        pending = stock_info.code.tolist()
        frames = []
        obj_pool = Pool(500)
        try:
            # NOTE(review): there is no retry limit here, so a stock that
            # keeps returning None keeps this loop running.
            while pending:
                logger.info("restart failed ip len(%s)" % len(pending))
                still_failed = []
                # Iterate a snapshot and rebuild the pending list instead of
                # removing items from the list the pool is consuming.
                for code, tem_df in obj_pool.imap_unordered(fetch,
                                                            list(pending)):
                    if tem_df is None:
                        still_failed.append(code)
                        continue
                    if len(tem_df) == max_length:
                        tem_df = tem_df.sort_values(by='date', ascending=True)
                        tem_df['code'] = code
                        tem_df['preclose'] = tem_df['close'].shift(1)
                        tem_df = tem_df[tem_df.date != start_date]
                        frames.append(tem_df)
                pending = still_failed
        finally:
            obj_pool.join(timeout=5)
            obj_pool.kill()
            self.mysql_client.changedb()
        return pd.concat(frames) if frames else pd.DataFrame()

    def relation_plot(self, df, good_list):
        """Cluster the stocks by daily variation and draw ``/tmp/relation.png``."""
        close_prices = np.vstack(
            [df[df.code == code].close.tolist() for code in good_list])
        open_prices = np.vstack(
            [df[df.code == code].open.tolist() for code in good_list])
        # the daily variations of the quotes are what carry most information
        variation = (close_prices - open_prices) * 100 / open_prices
        logger.info("get variation succeed")

        # learn a graphical structure from the correlations
        # scikit-learn renamed GraphLassoCV to GraphicalLassoCV; use whichever
        # the installed version provides.
        lasso_cls = getattr(covariance, 'GraphicalLassoCV', None)
        if lasso_cls is None:
            lasso_cls = covariance.GraphLassoCV
        edge_model = lasso_cls()
        # standardize the time series: using correlations rather than
        # covariance is more efficient for structure recovery
        X = variation.copy().T
        X /= X.std(axis=0)
        edge_model.fit(X)
        logger.info("mode compute succeed")

        # cluster using affinity propagation
        _, labels = cluster.affinity_propagation(edge_model.covariance_)
        n_labels = labels.max()
        code_list = np.array(good_list)

        # map every stock code to the name of the industry that lists it
        industry_dict = dict()
        industry_df_info = IndustryInfo.get()
        for name, content in zip(industry_df_info['name'],
                                 industry_df_info['content']):
            for code in json.loads(content):
                industry_dict[code] = name

        cluster_dict = dict()
        for i in range(n_labels + 1):
            cluster_dict[i] = code_list[labels == i]
            name_list = [
                CStockInfo.get(code, 'name') for code in cluster_dict[i]
            ]
            logger.info('cluster code %i: %s' % ((i + 1),
                                                 ', '.join(name_list)))

        for group, _code_list in cluster_dict.items():
            industries = set(industry_dict[code] for code in _code_list)
            # Log with the group being summarised, not the leftover index of
            # the previous loop.
            logger.info('cluster inustry %i: %s' % ((group + 1),
                                                    ', '.join(industries)))

        # find a low-dimension embedding for visualization: find the best
        # position of the nodes (the stocks) on a 2D plane. A dense
        # eigen_solver is used to achieve reproducibility (arpack is initiated
        # with random vectors that we don't control). In addition, a large
        # number of neighbors is used to capture the large-scale structure.
        node_position_model = manifold.LocallyLinearEmbedding(
            n_components=2, eigen_solver='dense', n_neighbors=6)
        embedding = node_position_model.fit_transform(X.T).T

        # visualization
        fig = plt.figure(1, facecolor='w', figsize=(10, 8))
        plt.clf()
        ax = plt.axes([0., 0., 1., 1.])
        plt.axis('off')

        # display a graph of the partial correlations
        partial_correlations = edge_model.precision_.copy()
        d = 1 / np.sqrt(np.diag(partial_correlations))
        partial_correlations *= d
        partial_correlations *= d[:, np.newaxis]
        non_zero = (np.abs(np.triu(partial_correlations, k=1)) > 0.02)

        # plot the nodes using the coordinates of our embedding
        plt.scatter(embedding[0], embedding[1], s=100 * d**2, c=labels,
                    cmap=plt.cm.nipy_spectral)

        # plot the edges
        start_idx, end_idx = np.where(non_zero)
        # a sequence of (*line0*, *line1*, *line2*), where
        # linen = (x0, y0), (x1, y1), ... (xm, ym)
        segments = [[embedding[:, start], embedding[:, stop]]
                    for start, stop in zip(start_idx, end_idx)]
        values = np.abs(partial_correlations[non_zero])
        lc = LineCollection(segments, zorder=0, cmap=plt.cm.hot_r,
                            norm=plt.Normalize(0, .7 * values.max()))
        lc.set_array(values)
        lc.set_linewidths(15 * values)
        ax.add_collection(lc)

        # add a label to each node. The challenge here is that we want to
        # position the labels to avoid overlap with other labels
        for index, (name, label, (x, y)) in enumerate(
                zip(code_list, labels, embedding.T)):
            dx = x - embedding[0]
            dx[index] = 1
            dy = y - embedding[1]
            dy[index] = 1
            this_dx = dx[np.argmin(np.abs(dy))]
            this_dy = dy[np.argmin(np.abs(dx))]
            if this_dx > 0:
                horizontalalignment = 'left'
                x = x + .002
            else:
                horizontalalignment = 'right'
                x = x - .002
            if this_dy > 0:
                verticalalignment = 'bottom'
                y = y + .002
            else:
                verticalalignment = 'top'
                y = y - .002
            plt.text(x, y, name, size=10,
                     horizontalalignment=horizontalalignment,
                     verticalalignment=verticalalignment,
                     bbox=dict(facecolor='w',
                               edgecolor=plt.cm.nipy_spectral(
                                   label / float(n_labels)),
                               alpha=.6))

        # np.ptp() is used because the ndarray.ptp method is gone in NumPy 2.
        x_span = np.ptp(embedding[0])
        y_span = np.ptp(embedding[1])
        plt.xlim(embedding[0].min() - .15 * x_span,
                 embedding[0].max() + .10 * x_span)
        plt.ylim(embedding[1].min() - .03 * y_span,
                 embedding[1].max() + .03 * y_span)
        plt.savefig('/tmp/relation.png', dpi=1000)
        plt.close(fig)

    def plot_price_series(self, df, ts1, ts2):
        """Plot the close prices of two stocks to ``/tmp/relation/<ts1>_<ts2>.png``."""
        first = df.loc[df.code == ts1]
        second = df.loc[df.code == ts2]
        fig = plt.figure()
        x = first.date.tolist()
        xn = range(len(x))
        y1 = first.close.tolist()
        y2 = second.close.tolist()
        name1 = first.name.values[0]
        name2 = second.name.values[0]
        plt.plot(xn, y1, label=name1, linewidth=1.5)
        plt.plot(xn, y2, label=name2, linewidth=1.5)
        plt.xticks(xn, x)
        plt.xlabel('时间', fontproperties=get_chinese_font())
        plt.ylabel('分数', fontproperties=get_chinese_font())
        plt.title('协整关系', fontproperties=get_chinese_font())
        fig.autofmt_xdate()
        plt.legend(loc='upper right', prop=get_chinese_font())
        plt.savefig('/tmp/relation/%s_%s.png' % (ts1, ts2), dpi=1000)
        plt.close(fig)