def parse_data(self, response):
    """Yield a ``Request`` for every table row whose date is inside the window.

    Each ``//div/table/tbody/tr`` row is flattened into a list of cell
    texts.  Cell 1 is read as the stock code, cell 2 as the stock name and
    cell 11 as the publish date; dashes are removed from the date and the
    result is compared as an integer against ``self.start_day`` (inclusive)
    and ``self.end_day`` (exclusive).

    Rows are skipped when they have fewer than twelve cells, when the date
    is empty or not numeric, when the date is outside the window, or when
    no link was found in the row.

    :param response: response object handed to ``HtmlXPathSelector``.
    :return: generator of ``Request`` objects whose callback is
        ``self.parse_stock``.
    """
    def first_or_empty(nodes):
        # extract() returns a list that can be empty (e.g. an empty cell),
        # so indexing [0] directly would raise IndexError.
        extracted = nodes.extract()
        return extracted[0] if extracted else ""

    hxs = HtmlXPathSelector(response)
    for tr_node in hxs.select("//div/table/tbody/tr"):
        value_list = []
        href = ""
        for td_node in tr_node.select(".//td"):
            if td_node.select('.//a'):
                text = first_or_empty(td_node.select(".//a/text()"))
                # The last non-empty link seen in the row wins.
                href = first_or_empty(td_node.select(".//a/@href")) or href
            else:
                text = first_or_empty(td_node.select(".//text()"))
            value_list.append(text)
        print(value_list)

        # Index 11 is read below, so at least twelve cells are required.
        if len(value_list) < 12:
            continue
        stock_code = value_list[1]
        stock_name = value_list[2]
        publish_date_str = safestr(value_list[11].replace("-", "")).strip()
        if publish_date_str == "":
            continue
        try:
            publish_date = int(publish_date_str)
        except ValueError:
            # Placeholder text instead of a date: nothing to compare.
            continue
        print(publish_date)
        if publish_date < self.start_day or publish_date >= self.end_day:
            continue
        print("%s %s %s" % (safestr(stock_code), safestr(stock_name), safestr(href)))
        if not href:
            # Without a link there is no page to request.
            continue
        yield Request(href, callback=self.parse_stock)
def parse_stock_daily(self, line):
    """Parse one ``<name>="f0~f1~..."`` quote line into a daily-quote dict.

    The text after the first ``=`` is stripped of double quotes and split
    on ``~``.  Field indices up to 44 are read, so at least 45 fields are
    required.

    :param line: one record of the quote response (without the ``;``).
    :return: dict with the keys ``name``, ``code``, ``sid``, ``day``,
        ``last_close_price``, ``open_price``, ``high_price``, ``low_price``,
        ``close_price``, ``time``, ``vary_price``, ``vary_portion``,
        ``volume``, ``predict_volume``, ``amount``, ``exchange_portion``,
        ``pe``, ``swing`` and ``out_capital``; or ``None`` when the line is
        malformed, lacks fields, has a zero open/close price, or a field
        fails to convert.  Failures are logged, never raised.
    """
    # Bound before the try so the outer handler can always log it, even
    # when the line has no "=" and the split below fails.
    content = ""
    try:
        parts = line.split("=")
        content = parts[1].strip('"')
        fields = content.split("~")
        # The highest index read below is 44, hence 45 fields are needed.
        if len(fields) < 45:
            line_str = safestr(line)
            self.logger.error(format_log("daily_lack_fields", {'line': line_str, 'content': content}))
            return None

        # A stock suspended for the day must not be stored.
        open_price = float(fields[5])
        close_price = float(fields[3])
        if open_price == 0.0 or close_price == 0.0:
            return None

        item = dict()
        try:
            item['name'] = safestr(fields[1])
            # NOTE: the original carried disabled code that cut the ".N"
            # style suffix off US codes (e.g. usWUBA.N) when
            # self.location == 3; the code is stored unmodified.
            item['code'] = fields[2]
            item['sid'] = int(self.datamap['code2id'][item['code']])
            item['day'] = self.day
            item['last_close_price'] = float(fields[4])
            item['open_price'] = open_price
            item['high_price'] = float(fields[33])
            item['low_price'] = float(fields[34])
            item['close_price'] = close_price
            # Current time as HHMMSS: field 30 minus its first 8 characters.
            item['time'] = fields[30][8:]
            item['vary_price'] = float(fields[31])
            item['vary_portion'] = float(fields[32])
            # Volume (original note: in lots); stored as the feed gives it.
            item['volume'] = int(fields[36])
            item['predict_volume'] = get_predict_volume(item['volume'], item['time'], self.location)
            # Amount (original note: in units of 10,000 yuan); stored as given.
            item['amount'] = int(fields[37])
            # The remaining values are kept as raw strings.
            item['exchange_portion'] = fields[38]
            item['pe'] = fields[39]
            item['swing'] = fields[43]
            item['out_capital'] = fields[44]
        except Exception:
            self.logger.exception("err=parse_daily_index content=%s", content)
            return None
    except Exception:
        self.logger.exception("err=parse_daily_content line=%s content=%s", line, content)
        return None
    return item
def parse_company(self, response):
    """Copy company details from the page's table cells onto the pending item.

    The item is taken from ``response.meta['item']``.  With at least two
    ``//table/tr/td`` text nodes, the second becomes ``alias``; with at
    least nine, the last becomes ``business``.

    :param response: response carrying the partially built item in its meta.
    :return: the same item, possibly with ``alias`` / ``business`` set.
    """
    selector = HtmlXPathSelector(response)
    item = response.meta['item']
    cell_texts = selector.select('//table/tr/td/text()').extract()

    # (minimum number of cells, item key, index of the cell to copy)
    rules = (
        (2, 'alias', 1),
        (9, 'business', -1),
    )
    for minimum, key, position in rules:
        if len(cell_texts) >= minimum:
            item[key] = safestr(cell_texts[position])
    return item
def parse_json(self, response):
    """Turn a ``<name>=<json>`` listing response into per-stock page requests.

    The body is split at the first ``=``; the remainder is decoded from GBK
    and parsed as JSON.  For every entry of ``data['data']['result']`` a
    ``StockItem`` is created (``location`` 3, ``code`` from element 1,
    ``name`` from element 2) and attached to a ``Request`` for the stock's
    page, whose callback is ``self.parse_data``.

    :param response: response whose ``body`` holds the assignment text.
    :return: generator of ``Request`` objects with the item in ``meta['item']``.
    """
    # Split only once: the JSON payload itself may contain "=" characters,
    # and splitting on all of them would truncate it.
    parts = response.body.split("=", 1)
    content = safestr(parts[1].decode('gbk'))
    data = json.loads(content)
    item_list = data['data']['result']
    print(len(item_list))

    # Code suffix (text after the last ".") -> exchange name.
    suffix_to_exchange = {"N": "NYSE", "OQ": "NASDAQ"}
    for info in item_list:
        item = StockItem()
        item['location'] = 3
        code = info[0]
        code_parts = code.split(".")
        if len(code_parts) >= 2:
            exchange = suffix_to_exchange.get(code_parts[-1])
            if exchange is not None:
                item['ecode'] = exchange
        item['name'] = info[2]
        item['code'] = info[1]
        stock_url = "http://stockhtm.finance.qq.com/astock/ggcx/" + code + ".htm"
        request = Request(stock_url, callback=self.parse_data)
        request.meta['item'] = item
        yield request
def core(self, item):
    """Fetch the quote text for *item* and queue every record that parses.

    The response is decoded from GBK, split on ``;`` and each non-empty
    chunk is handed to ``self.parse_stock_daily``.  Parsed records are
    pushed as JSON onto the ``daily-queue`` list when ``self.conn`` is set.
    HTTP errors, URL errors and read timeouts are logged as warnings and
    end the call without raising.

    :param item: stock code string appended to the request URL.
    :return: ``None``.
    """
    import socket  # local import: only needed for the read-timeout handler

    scode = item
    url = "http://qt.gtimg.cn/r=" + str(random.random()) + "q=" + scode
    try:
        response = urllib2.urlopen(url, timeout=1)
        try:
            content = response.read()
        finally:
            # Release the connection even when read() fails.
            response.close()
    except urllib2.HTTPError as e:
        self.logger.warning("err=get_stock_daily scode=%s code=%s", scode, str(e.code))
        return
    except urllib2.URLError as e:
        self.logger.warning("err=get_stock_daily scode=%s reason=%s", scode, str(e.reason))
        return
    except socket.timeout as e:
        # A timeout during read() is raised as socket.timeout, not URLError.
        self.logger.warning("err=get_stock_daily scode=%s reason=%s", scode, str(e))
        return

    if not content:
        return
    content = safestr(content.decode('gbk'))
    for line in content.strip("\r\n").split(";"):
        # Drop the newline that follows each ";" so blank chunks are skipped.
        line = line.strip()
        if not line:
            continue
        daily_item = self.parse_stock_daily(line)
        if daily_item is None:
            continue
        # Append to the redis queue.
        if self.conn:
            self.conn.rpush("daily-queue", json.dumps(daily_item))
        # NOTE(review): the source's indentation was lost; logging is assumed
        # to happen whether or not a redis connection exists - confirm.
        self.logger.info(format_log("fetch_daily", daily_item))
def name2pinyin(name):
    """Build a short pinyin string for a UTF-8 encoded name.

    The name is decoded, passed to ``pinyin`` with style argument ``4``,
    and the first element of every returned entry is concatenated.
    (Presumably style 4 yields first letters - confirm against the pinyin
    library in use.)

    :param name: UTF-8 encoded byte string.
    :return: the concatenated result passed through ``safestr``.
    """
    decoded = name.decode('utf-8')
    pieces = [entry[0] for entry in pinyin(decoded, 4)]
    return safestr("".join(pieces))
def parse_quotes(self, response):
    """Yield a ``StockItem`` for each NASDAQ/NYSE quote in a JSON response.

    Quotes whose ``flag`` equals 3 (marked as delisted in the original
    comment) and quotes from any other exchange are reported on stdout and
    skipped.  Optional numeric fields are copied only when their source
    string is non-empty.

    :param response: response whose body is a JSON document with a
        ``quotes`` list.
    :return: generator of populated ``StockItem`` objects.
    """
    quotes_data = json.loads(safestr(response.body))

    # Exchange name -> numeric exchange code stored on the item.
    exchange_codes = {"NASDAQ": 4, "NYSE": 5}
    # (source key, item key, divisor or None)
    # NOTE(review): 'out_captial' is spelled as in the original; confirm it
    # matches the StockItem field before renaming it.
    optional_fields = (
        ('totalShares', 'out_captial', 100000000),  # total shares, in units of 1e8
        ('dividend', 'dividend', None),             # dividend
        ('eps', 'profit', None),                    # earnings per share
        ('net_assets', 'assets', None),             # net assets per share
    )

    for quote_info in quotes_data['quotes']:
        # Delisted.
        if 3 == int(quote_info['flag']):
            print("op=stock_quit code=" + safestr(quote_info['symbol']) + " name=" + safestr(quote_info['name']))
            continue

        item = StockItem()
        item['location'] = 3
        item['code'] = quote_info['symbol']
        item['name'] = quote_info['name']

        stock_name = safestr(quote_info['name'])
        exchange = safestr(quote_info['exchange'])
        ecode = exchange_codes.get(exchange)
        if ecode is None:
            # US stocks outside NASDAQ/NYSE are ignored.
            print("op=stock_ignore code=" + safestr(quote_info['symbol']) + " name=" + stock_name + " exchange=" + exchange)
            continue
        item['ecode'] = ecode

        for source_key, item_key, divisor in optional_fields:
            raw_value = quote_info[source_key]
            if len(raw_value) > 0:
                number = float(raw_value)
                item[item_key] = number / divisor if divisor else number
        yield item
def serialize(self, item):
    """Append an item's ticks to redis and track per-minute price extremes.

    Every entry of ``item['items']`` is pushed as JSON onto the list
    ``ts-<sid>-<day>``.  For ticks whose minute (``time`` divided by 100)
    is not older than the newest minute seen for this ``sid``, the
    ``[highest, lowest]`` price pair of that minute in ``self.vary_map``
    is widened.  ``self.time_map[sid]`` is updated to the newest minute.

    :param item: dict with ``sid``, ``day`` and ``items``; each tick has
        ``time`` and ``price``.
    :return: ``None``.
    """
    sid = item['sid']
    redis_key = "-".join(["ts", str(sid), str(item['day'])])

    newest_minute = self.time_map.setdefault(sid, 0)
    # The dict stored in vary_map is the very object mutated below, so the
    # map always reflects the updates (this settles the original TODO).
    minute_ranges = self.vary_map.setdefault(sid, dict())

    for tick in item['items']:
        tick['time'] = int(tick['time'])
        minute = int(tick['time'] / 100)
        price = tick['price']
        self.redis_conn.rpush(redis_key, json.dumps(tick))

        if minute < newest_minute:
            continue
        high_low = minute_ranges.get(minute)
        if high_low is None:
            high_low = [0.0, float("inf")]
        high_low[0] = max(high_low[0], price)
        high_low[1] = min(high_low[1], price)
        newest_minute = max(minute, newest_minute)
        minute_ranges[minute] = high_low

    # Store the newest minute; the per-minute ranges were updated in place.
    self.time_map[sid] = newest_minute
    rendered = "|".join(
        [safestr(k) + "-" + safestr(v) for k, v in minute_ranges.items()])
    self.logger.debug(
        "desc=refresh_ts sid=%d last_time=%d vary_map=%s",
        sid, newest_minute, rendered)
def core(self, item):
    """Fetch the quote text for *item*, then queue and dump each parsed record.

    The response is decoded from GBK, split on ``;`` and each non-empty
    chunk is handed to ``self.parse_stock_daily``.  Each parsed record is
    pushed as JSON onto the ``daily-queue`` list when ``self.conn`` is set,
    and passed to ``self.dump`` together with characters 2-3 of its
    ``time`` value.  HTTP errors, URL errors and read timeouts are logged
    as warnings and end the call without raising.

    :param item: stock code string appended to the request URL.
    :return: ``None``.
    """
    import socket  # local import: only needed for the read-timeout handler

    scode = item
    cur_timestamp = int(time.time() * 1000)
    url = "http://hq.sinajs.cn/rn=" + str(cur_timestamp) + "&list=" + scode
    print(url)
    try:
        response = urllib2.urlopen(url, timeout=5)
        try:
            content = response.read()
        finally:
            # Release the connection even when read() fails.
            response.close()
    except urllib2.HTTPError as e:
        self.logger.warning("err=get_stock_daily scode=%s code=%s", scode, str(e.code))
        return
    except urllib2.URLError as e:
        self.logger.warning("err=get_stock_daily scode=%s reason=%s", scode, str(e.reason))
        return
    except socket.timeout as e:
        # A timeout during read() is raised as socket.timeout, not URLError.
        self.logger.warning("err=get_stock_daily scode=%s reason=%s", scode, str(e))
        return

    if not content:
        return
    content = safestr(content.decode('gbk'))
    self.logger.debug("desc=daily_content scode=%s content=%s", scode, content)
    for line in content.strip("\r\n").split(";"):
        # Drop the newline that follows each ";" so blank chunks are skipped.
        line = line.strip()
        if not line:
            continue
        daily_item = self.parse_stock_daily(line)
        if daily_item is None:
            continue
        # Append to the redis queue.
        json_item = json.dumps(daily_item)
        if self.conn:
            self.conn.rpush("daily-queue", json_item)
        # NOTE(review): the source's indentation was lost; logging is assumed
        # to happen whether or not a redis connection exists - confirm.
        self.logger.info(format_log("fetch_daily", daily_item))
        # When dumping is enabled the data is written to the log; the
        # original note says every 5 minutes for now, configurable.
        self.dump(int(daily_item['time'][2:4]), json_item, "daily")
def core(self, item):
    """Fetch quotes for a code list, then queue and dump each parsed record.

    The response is decoded from GBK, split on ``;`` and each non-empty
    chunk is handed to ``self.parse_stock_daily``.  Each parsed record is
    pushed as JSON onto the ``daily-queue`` list when ``self.conn`` is set,
    and passed to ``self.dump`` together with characters 2-3 of its
    ``time`` value.  HTTP errors, URL errors and read timeouts are logged
    as warnings and end the call without raising.

    :param item: string of stock codes appended to the request URL.
    :return: ``None``.
    """
    import socket  # local import: only needed for the read-timeout handler

    scode_list = item
    url = "http://qt.gtimg.cn/r=" + str(random.random()) + "q=" + scode_list
    print(url)
    try:
        response = urllib2.urlopen(url, timeout=1)
        try:
            content = response.read()
        finally:
            # Release the connection even when read() fails.
            response.close()
    except urllib2.HTTPError as e:
        self.logger.warning("err=get_stock_daily scode_list=%s code=%s", scode_list, str(e.code))
        return
    except urllib2.URLError as e:
        self.logger.warning("err=get_stock_daily scode_list=%s reason=%s", scode_list, str(e.reason))
        return
    except socket.timeout as e:
        # A timeout during read() is raised as socket.timeout, not URLError.
        self.logger.warning("err=get_stock_daily scode_list=%s reason=%s", scode_list, str(e))
        return

    if not content:
        return
    content = safestr(content.decode('gbk'))
    for line in content.strip("\r\n").split(";"):
        # Drop the newline that follows each ";" so blank chunks are skipped.
        line = line.strip()
        if not line:
            continue
        daily_item = self.parse_stock_daily(line)
        if daily_item is None:
            continue
        # Append to the redis queue.
        json_data = json.dumps(daily_item)
        if self.conn:
            self.conn.rpush("daily-queue", json_data)
        # NOTE(review): the source's indentation was lost; logging is assumed
        # to happen whether or not a redis connection exists - confirm.
        self.logger.info(format_log("fetch_daily", daily_item))
        # When dumping is enabled the data is written to the log; the
        # original note says every 5 minutes for now, configurable.
        self.dump(int(daily_item['time'][2:4]), json_data, "daily")
def parse_stock_daily(self, line):
    """Parse one ``var hq_str_gb_<code>="f0,f1,..."`` line into a quote dict.

    The stock code is the text before ``=`` with the ``var hq_str_gb_``
    prefix removed and upper-cased; the quoted part is split on ``,``.
    Field indices up to 26 are read, so at least 27 fields are required.

    :param line: one record of the quote response (without the ``;``).
    :return: dict with the keys ``name``, ``code``, ``sid``, ``day``,
        ``last_close_price``, ``open_price``, ``high_price``, ``low_price``,
        ``close_price``, ``time``, ``vary_price``, ``vary_portion``,
        ``volume``, ``predict_volume``, ``amount``, ``out_capital``,
        ``cap``, ``swing`` and, when ``out_capital`` is positive,
        ``exchange_portion``; or ``None`` when the line is malformed, lacks
        fields, has a zero open/close price, or a field fails to convert.
        Failures are logged, never raised.
    """
    line = line.strip("\r\n")
    parts = line.split("=")
    if len(parts) < 2:
        # No "=": nothing to parse (previously an uncaught IndexError).
        self.logger.error("err=parse_daily_content line=%s", line)
        return None
    stock_code = parts[0].replace("var hq_str_gb_", "").upper()
    content = parts[1].strip('"')
    fields = content.split(",")
    # The highest index read below is 26, hence 27 fields are needed.
    if len(fields) < 27:
        line_str = safestr(line)
        self.logger.error(format_log("daily_lack_fields", {'line': line_str, 'content': content}))
        return None

    # A stock suspended for the day must not be stored.
    try:
        open_price = float(fields[5])
        close_price = float(fields[1])
    except ValueError:
        self.logger.exception("err=parse_daily_ex code=%s line=%s", stock_code, line)
        return None
    if open_price == 0.0 or close_price == 0.0:
        return None

    item = dict()
    try:
        item['name'] = safestr(fields[0])
        item['code'] = stock_code
        item['sid'] = int(self.datamap['code2id'][stock_code])
        item['day'] = self.day
        item['last_close_price'] = float(fields[26])
        item['open_price'] = open_price
        item['high_price'] = float(fields[6])
        item['low_price'] = float(fields[7])
        item['close_price'] = close_price
        # Current time as HHMMSS.  Per the original note the time in the
        # feed (field 3) is wrong, so it is computed locally instead and
        # may lag behind real time.
        item['time'] = str(get_timenumber(3))
        item['vary_price'] = float(fields[4])
        item['vary_portion'] = float(fields[2])
        # Volume, in shares.
        item['volume'] = int(fields[10])
        item['predict_volume'] = get_predict_volume(item['volume'], item['time'], self.location)
        # Amount (original note: in units of 10,000 yuan); always set to 0 here.
        item['amount'] = 0
        # Total share capital.
        item['out_capital'] = float(fields[19])
        # Total market value.
        item['cap'] = float(fields[12])
        # Turnover rate, in percent.
        if item['out_capital'] > 0:
            item['exchange_portion'] = item['volume'] / item['out_capital'] * 100
        # NOTE(review): assumed to sit outside the "if" above, since it does
        # not depend on out_capital - the source's indentation was lost.
        item['swing'] = (item['high_price'] - item['low_price']) / item['last_close_price'] * 100
    except Exception:
        traceback.print_exc()
        self.logger.exception("err=parse_daily_ex code=%s line=%s", stock_code, line)
        return None

    if item['out_capital'] > 0:
        # Emit a ready-made SQL statement at debug level; it is not executed here.
        capital = item['out_capital'] / 10000
        self.logger.debug("op=update_sql sql={update t_stock set capital=%.2f, out_capital=%.2f where id = %d;}", capital, capital, item['sid'])
    return item
# Print, for every listed stock, the close-price change between two days,
# ordered by decline.  argv[2] is the start day and argv[3] the end day.
start_day = int(sys.argv[2])
end_day = int(sys.argv[3])
stock_list = get_stock_list(db_config, 1, 1)
vary_list = []
start_hq_data = get_stock_data(db_config, start_day)
end_hq_data = get_stock_data(db_config, end_day)
for sid in stock_list:
    # Skip stocks without data on either day (suspended).
    if sid not in start_hq_data or sid not in end_hq_data:
        continue
    stock_info = stock_list[sid]
    start_close_price = float(start_hq_data[sid]['close_price'])
    end_close_price = float(end_hq_data[sid]['close_price'])
    # A zero start price would make the percentage undefined and abort
    # the whole report with ZeroDivisionError.
    if start_close_price == 0:
        continue
    # Positive values are declines: (start - end) / start, in percent.
    vary_portion = (start_close_price - end_close_price) / start_close_price * 100
    vary_list.append((sid, stock_info['code'], stock_info['name'], start_close_price, end_close_price, vary_portion))

# Sort by decline, largest first.
vary_list.sort(key=lambda item: item[5], reverse=True)
for item in vary_list:
    # Named output_line (not str) so the builtin str is not shadowed.
    output_line = "\t".join([safestr(v) for v in item])
    print(output_line)
# Print, for every listed stock, the close-price change between two days,
# ordered by decline.  argv[2] is the start day and argv[3] the end day.
# The redis port is normalised to an integer in the loaded configuration.
config_info['REDIS']['port'] = int(config_info['REDIS']['port'])
db_config = config_info['DB']
start_day = int(sys.argv[2])
end_day = int(sys.argv[3])
stock_list = get_stock_list(db_config, 1, 1)
vary_list = []
start_hq_data = get_stock_data(db_config, start_day)
end_hq_data = get_stock_data(db_config, end_day)
for sid in stock_list:
    # Skip stocks without data on either day (suspended).
    if sid not in start_hq_data or sid not in end_hq_data:
        continue
    stock_info = stock_list[sid]
    start_close_price = float(start_hq_data[sid]['close_price'])
    end_close_price = float(end_hq_data[sid]['close_price'])
    # A zero start price would make the percentage undefined and abort
    # the whole report with ZeroDivisionError.
    if start_close_price == 0:
        continue
    # Positive values are declines: (start - end) / start, in percent.
    vary_portion = (start_close_price - end_close_price) / start_close_price * 100
    vary_list.append((sid, stock_info['code'], stock_info['name'], start_close_price, end_close_price, vary_portion))

# Sort by decline, largest first.
vary_list.sort(key=lambda item: item[5], reverse=True)
for item in vary_list:
    # Named output_line (not str) so the builtin str is not shadowed.
    output_line = "\t".join([safestr(v) for v in item])
    print(output_line)
def serialize(self, item):
    """Push an item's ticks to redis and maintain per-minute price ranges.

    Each tick of ``item['items']`` is appended as JSON to the redis list
    ``ts-<sid>-<day>``.  Ticks whose minute (``time`` divided by 100) is at
    least the latest minute recorded for the ``sid`` widen that minute's
    ``[highest, lowest]`` pair in ``self.vary_map[sid]``.  The latest
    minute is written back to ``self.time_map[sid]``.

    :param item: dict with ``sid``, ``day`` and ``items``; each tick has
        ``time`` and ``price``.
    :return: ``None``.
    """
    sid = item['sid']
    list_key = "-".join(("ts", str(sid), str(item['day'])))

    if sid not in self.time_map:
        self.time_map[sid] = 0
    latest_minute = self.time_map[sid]

    if sid not in self.vary_map:
        self.vary_map[sid] = dict()
    # Same dict object as the one held by vary_map, so in-place changes
    # below are visible through the map (this settles the original TODO).
    per_minute = self.vary_map[sid]

    for entry in item['items']:
        entry['time'] = int(entry['time'])
        entry_minute = int(entry['time'] / 100)
        entry_price = entry['price']
        self.redis_conn.rpush(list_key, json.dumps(entry))

        if not entry_minute >= latest_minute:
            continue
        if entry_minute in per_minute:
            bounds = per_minute[entry_minute]
        else:
            bounds = [0.0, float("inf")]
        if entry_price > bounds[0]:
            bounds[0] = entry_price
        if entry_price < bounds[1]:
            bounds[1] = entry_price
        if entry_minute > latest_minute:
            latest_minute = entry_minute
        per_minute[entry_minute] = bounds

    # Record the latest minute; the per-minute ranges are already updated.
    self.time_map[sid] = latest_minute
    summary = "|".join(
        [safestr(minute) + "-" + safestr(pair) for minute, pair in per_minute.items()])
    self.logger.debug("desc=refresh_ts sid=%d last_time=%d vary_map=%s", sid, latest_minute, summary)