Esempio n. 1
0
    def parse_data(self, response):
        hxs = HtmlXPathSelector(response)
        stock_url = "http://stockpage.10jqka.com.cn/"

        for tr_node in hxs.select("//div/table/tbody/tr"):
            td_node_list = tr_node.select(".//td")
            value_list = []
            href = ""

            for index, td_node in enumerate(td_node_list):
                text = ""
                if td_node.select('.//a'):
                    text = td_node.select(".//a/text()").extract()[0]
                    href = td_node.select(".//a/@href").extract()[0]
                else:
                    text = td_node.select(".//text()").extract()[0]
                value_list.append(text)

            print value_list
            stock_code = value_list[1]
            stock_name = value_list[2]
            publish_date_str = safestr(value_list[11].replace("-", "")).strip()
            if publish_date_str == "":
                continue

            publish_date = int(publish_date_str)
            print publish_date
            if publish_date < self.start_day or publish_date >= self.end_day:
                continue

            print safestr(stock_code), safestr(stock_name), safestr(href)
            yield Request(href, callback=self.parse_stock)
Esempio n. 2
0
    def parse_stock_daily(self, line):
        """Parse one ``name="f0~f1~..."`` quote line into a daily-quote dict.

        The text following ``=`` is stripped of double quotes and split on
        ``~``. The resulting fields are read by position (name 1, code 2,
        close 3, previous close 4, open 5, timestamp 30, price change 31,
        change ratio 32, high 33, low 34, volume 36, amount 37, turnover 38,
        pe 39, swing 43, out_capital 44).

        Args:
            line: a single quote statement, without the trailing ``;``.

        Returns:
            dict | None: the parsed quote, or None when the line is
            malformed, has fewer than 45 fields, carries a zero open or
            close price, or any field fails to convert. Failures are logged
            through ``self.logger``; nothing is raised.
        """
        # Bound before the try so the outer handler can always log it, even
        # when the line has no "=" and parts[1] raises.
        content = None
        try:
            parts = line.split("=")
            content = parts[1].strip('"')

            fields = content.split("~")
            # The highest index read below is fields[44], so 45 fields are
            # required.
            if len(fields) < 45:
                line_str = safestr(line)
                self.logger.error(format_log("daily_lack_fields", {'line': line_str, 'content': content}))
                return None

            # A stock suspended for the day must not be stored.
            open_price = float(fields[5])
            close_price = float(fields[3])
            if open_price == 0.0 or close_price == 0.0:
                return None

            item = dict()

            try:
                item['name'] = safestr(fields[1])
                # Disabled code-normalisation step kept for reference
                # (US stocks come back as e.g. usWUBA.N):
                #   stock_code = fields[2]
                #   if self.location == 3:
                #       code_parts = stock_code.split(".")
                #       stock_code = code_parts[0]
                item['code'] = fields[2]
                item['sid'] = int(self.datamap['code2id'][item['code']])
                item['day'] = self.day
                item['last_close_price'] = float(fields[4])
                item['open_price'] = open_price
                item['high_price'] = float(fields[33])
                item['low_price'] = float(fields[34])
                item['close_price'] = close_price
                # Current time in HHMMSS form: the first 8 characters of the
                # timestamp field are dropped.
                item['time'] = fields[30][8:]
                item['vary_price'] = float(fields[31])
                item['vary_portion'] = float(fields[32])
                # Volume, expressed in lots.
                item['volume'] = int(fields[36])
                item['predict_volume'] = get_predict_volume(item['volume'], item['time'], self.location)
                # Amount, expressed in units of 10,000 yuan.
                item['amount'] = int(fields[37])
                item['exchange_portion'] = fields[38]
                item['pe'] = fields[39]
                item['swing'] = fields[43]
                item['out_capital'] = fields[44]
            except Exception:
                self.logger.exception("err=parse_daily_index content=%s", content)
                return None
        except Exception:
            self.logger.exception("err=parse_daily_content line=%s content=%s", line, content)
            return None

        return item
Esempio n. 3
0
    def parse_company(self, response):
        """Complete the item carried in ``response.meta`` with company data.

        All ``//table/tr/td`` text nodes of the page are collected. When at
        least two exist, the second becomes ``item['alias']``; when at least
        nine exist, the last one becomes ``item['business']``.

        Returns:
            The same item object taken from ``response.meta['item']``.
        """
        selector = HtmlXPathSelector(response)
        item = response.meta['item']

        cells = selector.select('//table/tr/td/text()').extract()
        if len(cells) > 1:
            item['alias'] = safestr(cells[1])
        if len(cells) > 8:
            item['business'] = safestr(cells[-1])

        return item
Esempio n. 4
0
    def parse_json(self, response):
        parts = response.body.split("=")
        content = safestr(parts[1].decode('gbk'))
        #print content

        data = json.loads(content)
        item_list = data['data']['result']
        print len(item_list)

        for info in item_list:
            item = StockItem()
            item['location'] = 3

            code = info[0]   
            code_parts = code.split(".")
            if len(code_parts) >= 2:
                ecode = code_parts[-1]
                if "N" == ecode:
                    item['ecode'] = "NYSE"
                elif "OQ" == ecode:
                    item['ecode'] = "NASDAQ" 

            item['name'] = info[2]
            item['code'] = info[1]

            stock_url = "http://stockhtm.finance.qq.com/astock/ggcx/" + code + ".htm"
            #print stock_url
            request = Request(stock_url, callback=self.parse_data)
            request.meta['item'] = item
            yield request
Esempio n. 5
0
    def core(self, item):
        """Fetch the quote text for ``item`` and queue every parsed quote.

        The response body is decoded from GBK, split on ``;`` and each
        non-empty piece is handed to ``self.parse_stock_daily``. Every quote
        that parses is pushed as JSON onto the ``daily-queue`` list of
        ``self.conn`` (when a connection is set) and logged.

        Args:
            item: the stock code string appended to the request URL.

        Returns:
            None. HTTP errors, URL errors and read timeouts are logged as
            warnings and end the call early.
        """
        import socket  # local import: used only by the read-timeout handler

        scode = item
        url = "http://qt.gtimg.cn/r=" + str(random.random()) + "q=" + scode

        try:
            response = urllib2.urlopen(url, timeout=1)
            try:
                content = response.read()
            finally:
                # Release the connection even when the read fails.
                response.close()
        except urllib2.HTTPError as e:
            self.logger.warning("err=get_stock_daily scode=%s code=%s", scode,
                                str(e.code))
            return
        except urllib2.URLError as e:
            self.logger.warning("err=get_stock_daily scode=%s reason=%s",
                                scode, str(e.reason))
            return
        except socket.timeout:
            # A timeout while reading the body is raised directly instead of
            # being wrapped in URLError.
            self.logger.warning("err=get_stock_daily scode=%s reason=timeout",
                                scode)
            return

        if not content:
            return

        content = safestr(content.decode('gbk'))
        for line in content.strip("\r\n").split(";"):
            if not line:
                continue

            daily_item = self.parse_stock_daily(line)
            if daily_item is None:
                continue

            # Append the quote to the redis queue.
            if self.conn:
                self.conn.rpush("daily-queue", json.dumps(daily_item))
            self.logger.info(format_log("fetch_daily", daily_item))
Esempio n. 6
0
def name2pinyin(name):
    """Build a short pinyin key for a UTF-8 encoded name.

    The name is decoded from UTF-8 and passed to ``pinyin`` with style code
    4; the first element of every returned entry is concatenated and the
    result is passed through ``safestr``.
    NOTE(review): style code 4 presumably selects first letters only -
    confirm against the pinyin library in use.
    """
    decoded_name = name.decode('utf-8')
    entries = pinyin(decoded_name, 4)
    return safestr("".join(entry[0] for entry in entries))
Esempio n. 7
0
    def parse_quotes(self, response):
        content = safestr(response.body)
        quotes_data = json.loads(content)

        for quote_info in quotes_data['quotes']:
            # 已退市
            if 3 == int(quote_info['flag']):
                print "op=stock_quit code=" + safestr(quote_info['symbol'])  + " name=" + safestr(quote_info['name'])
                continue

            item = StockItem()
            item['location'] = 3

            item['code'] = quote_info['symbol']
            item['name'] = quote_info['name']
            stock_name = safestr(quote_info['name'])
            exchange = safestr(quote_info['exchange'])

            if exchange == "NASDAQ":
                item['ecode'] = 4
            elif exchange == "NYSE":
                item['ecode'] = 5
            else:   # 非nasdaq/nyse的美股忽略
                #print quote_info
                print "op=stock_ignore code=" + safestr(quote_info['symbol']) + " name=" + stock_name + " exchange=" + exchange
                continue

            # 总股本 
            if len(quote_info['totalShares']) > 0:
                item['out_captial'] = float(quote_info['totalShares']) / 100000000
            # 股息
            if len(quote_info['dividend']) > 0:
                item['dividend'] = float(quote_info['dividend'])   
            # 每股净利润
            if len(quote_info['eps']) > 0:
                item['profit'] = float(quote_info['eps'])
            # 每股净资产
            if len(quote_info['net_assets']) > 0:
                item['assets'] = float(quote_info['net_assets'])

            #print item
            yield item    
Esempio n. 8
0
    def serialize(self, item):
        """Store the ticks of ``item`` in redis and track per-slot price range.

        Every tick in ``item['items']`` has its ``time`` converted to int and
        is appended as JSON to the redis list ``ts-<sid>-<day>``. Ticks whose
        slot (``time / 100``) is not older than the last slot recorded for
        the stock update that slot's ``[highest, lowest]`` price pair in
        ``self.vary_map[sid]``; ``self.time_map[sid]`` ends up holding the
        newest slot seen.

        Args:
            item: mapping with ``sid``, ``day`` and ``items`` (a list of tick
                dicts providing ``time`` and ``price``).
        """
        sid = item['sid']
        key = "-".join(["ts", str(sid), str(item['day'])])

        last_time = self.time_map.setdefault(sid, 0)
        # setdefault stores the very dict that is mutated below, so the entry
        # kept in vary_map always reflects the updates made through ts_map.
        ts_map = self.vary_map.setdefault(sid, dict())

        for ts_item in item['items']:
            ts_item['time'] = int(ts_item['time'])
            item_time = int(ts_item['time'] / 100)
            price = ts_item['price']
            self.redis_conn.rpush(key, json.dumps(ts_item))

            # Ticks belonging to a slot older than the last one are stored in
            # redis but do not touch the price range.
            if item_time < last_time:
                continue

            if item_time in ts_map:
                price_pair = ts_map[item_time]
            else:
                price_pair = [0.0, float("inf")]
            price_pair[0] = max(price_pair[0], price)
            price_pair[1] = min(price_pair[1], price)

            last_time = max(item_time, last_time)
            ts_map[item_time] = price_pair

        # Record the newest slot and log the per-slot price ranges.
        self.time_map[sid] = last_time
        summary = "|".join(
            [safestr(k) + "-" + safestr(v) for k, v in ts_map.items()])
        self.logger.debug(
            "desc=refresh_ts sid=%d last_time=%d vary_map=%s", sid, last_time,
            summary)
Esempio n. 9
0
    def core(self, item):
        scode = item
        cur_timestamp = int(time.time() * 1000)
        url = "http://hq.sinajs.cn/rn=" +  str(cur_timestamp) + "&list=" + scode
        print url

        try:
            response = urllib2.urlopen(url, timeout=5)
            content = response.read()
        except urllib2.HTTPError as e:
            self.logger.warning("err=get_stock_daily scode=%s code=%s", scode, str(e.code))
            return 
        except urllib2.URLError as e:
            self.logger.warning("err=get_stock_daily scode=%s reason=%s", scode, str(e.reason))
            return

        if content:
            content = safestr(content.decode('gbk'))
            self.logger.debug("desc=daily_content scode=%s content=%s", scode, content)
            lines = content.strip("\r\n").split(";")

            for line in lines:
                if 0 == len(line):
                    continue

                daily_item = self.parse_stock_daily(line)
                if daily_item is None:
                    continue

                # 追加到redis队列中
                json_item = json.dumps(daily_item)
                if self.conn:
                    self.conn.rpush("daily-queue", json_item)
                self.logger.info(format_log("fetch_daily", daily_item))
                #print format_log("fetch_daily", daily_item)

                # 设置dump则把数据dump到日志中, 暂定每5mindump一次, 可配置
                self.dump(int(daily_item['time'][2:4]), json_item, "daily")
Esempio n. 10
0
    def core(self, item):
        scode_list = item
        url = "http://qt.gtimg.cn/r=" + str(random.random()) + "q=" + scode_list
        print url

        try:
            response = urllib2.urlopen(url, timeout=1)
            content = response.read()
        except urllib2.HTTPError as e:
            self.logger.warning("err=get_stock_daily scode_list=%s code=%s", scode_list, str(e.code))
            return 
        except urllib2.URLError as e:
            self.logger.warning("err=get_stock_daily scode_list=%s reason=%s", scode_list, str(e.reason))
            return

        if content:
            content = safestr(content.decode('gbk'))
            #self.logger.info("desc=daily_content content=%s", content)
            lines = content.strip("\r\n").split(";")

            for line in lines:
                if 0 == len(line):
                    continue

                daily_item = self.parse_stock_daily(line)
                if daily_item is None:
                    continue

                # 追加到redis队列中
                json_data = json.dumps(daily_item)
                if self.conn:
                    self.conn.rpush("daily-queue", json_data)
                self.logger.info(format_log("fetch_daily", daily_item))
                
                # 设置dump则把数据dump到日志中, 暂定每5mindump一次, 可配置
                self.dump(int(daily_item['time'][2:4]), json_data, "daily")
Esempio n. 11
0
    def parse_stock_daily(self, line):
        """Parse one ``var hq_str_gb_<code>="f0,f1,..."`` line into a quote dict.

        The stock code is the upper-cased text before ``=`` with the
        ``var hq_str_gb_`` prefix removed. The text after ``=`` is stripped
        of double quotes and split on ``,``; fields are then read by position
        (name 0, close 1, change ratio 2, price change 4, open 5, high 6,
        low 7, volume 10, market cap 12, total shares 19, previous close 26).
        ``exchange_portion`` is only set when ``out_capital`` is positive.

        Args:
            line: a single quote statement, without the trailing ``;``.

        Returns:
            dict | None: the parsed quote, or None when the line has no
            ``=``, has fewer than 27 fields, carries a zero open or close
            price, or any field fails to convert. Failures are logged through
            ``self.logger``; nothing is raised.
        """
        line = line.strip("\r\n")
        parts = line.split("=")
        if len(parts) < 2:
            # Without "=" there is no quoted payload to parse.
            self.logger.error("err=parse_daily_format line=%s", line)
            return None

        stock_code = parts[0].replace("var hq_str_gb_", "").upper()
        content = parts[1].strip('"')

        fields = content.split(",")
        # The highest index read below is fields[26], so 27 fields are
        # required.
        if len(fields) < 27:
            line_str = safestr(line)
            self.logger.error(format_log("daily_lack_fields", {'line': line_str, 'content': content}))
            return None

        item = dict()

        try:
            # A stock suspended for the day must not be stored. The price
            # parsing sits inside the try so a malformed number is logged
            # instead of propagating to the caller.
            open_price = float(fields[5])
            close_price = float(fields[1])
            if open_price == 0.0 or close_price == 0.0:
                return None

            item['name'] = safestr(fields[0])
            item['code'] = stock_code
            item['sid'] = int(self.datamap['code2id'][stock_code])
            item['day'] = self.day
            item['last_close_price'] = float(fields[26])
            item['open_price'] = open_price
            item['high_price'] = float(fields[6])
            item['low_price'] = float(fields[7])
            item['close_price'] = close_price

            # Current time in HHMMSS form. The time returned by the source is
            # wrong, so it is computed locally instead; the value may
            # therefore not be strictly real-time.
            item['time'] = str(get_timenumber(3))

            item['vary_price'] = float(fields[4])
            item['vary_portion'] = float(fields[2])
            # Volume, expressed in shares.
            item['volume'] = int(fields[10])
            item['predict_volume'] = get_predict_volume(item['volume'], item['time'], self.location)
            # No amount is read for this source; it is stored as 0.
            item['amount'] = 0
            # Total share capital.
            item['out_capital'] = float(fields[19])
            # Total market capitalisation.
            item['cap'] = float(fields[12])
            # Turnover ratio, in percent.
            if item['out_capital'] > 0:
                item['exchange_portion'] = item['volume'] / item['out_capital'] * 100
            item['swing'] = (item['high_price'] - item['low_price']) / item['last_close_price'] * 100
        except Exception:
            # logger.exception already records the traceback.
            self.logger.exception("err=parse_daily_ex code=%s line=%s", stock_code, line)
            return None

        if item['out_capital'] > 0:
            capital = item['out_capital'] / 10000
            self.logger.debug("op=update_sql sql={update t_stock set capital=%.2f, out_capital=%.2f where id = %d;}", capital, capital, item['sid'])
        return item
Esempio n. 12
0
    # Compare closing prices between two trading days and print one
    # tab-separated row per stock, ordered by vary_portion descending.
    # The two days are taken from the 2nd and 3rd command-line arguments.
    start_day = int(sys.argv[2])
    end_day = int(sys.argv[3])

    stock_list = get_stock_list(db_config, 1, 1)
    vary_list = []
    start_hq_data = get_stock_data(db_config, start_day)
    end_hq_data = get_stock_data(db_config, end_day)

    for sid in stock_list.keys():
        # Ignore stocks that were suspended, i.e. have no quote on either day.
        if sid not in start_hq_data or sid not in end_hq_data:
            continue

        stock_info = stock_list[sid]
        start_close_price = float(start_hq_data[sid]['close_price'])
        end_close_price = float(end_hq_data[sid]['close_price'])
        # Percentage drop from the start-day close to the end-day close
        # (positive when the price fell).
        # NOTE(review): raises ZeroDivisionError if start_close_price is 0 -
        # confirm that stored quotes never carry a zero close price.
        vary_portion = (start_close_price -
                        end_close_price) / start_close_price * 100

        #print format_log("vary_stock", {'sid': sid, 'code': stock_info['code'], 'name': stock_info['name'], 'start_close_price': start_close_price, 'end_close_price': end_close_price, 'vary_portion': vary_portion})
        vary_list.append((sid, stock_info['code'], stock_info['name'],
                          start_close_price, end_close_price, vary_portion))

    # Sort by size of the drop, largest first.
    vary_list.sort(key=lambda item: item[5], reverse=True)
    for item in vary_list:
        #print item
        # NOTE(review): this name shadows the builtin `str` for the rest of
        # the enclosing scope.
        str = "\t".join([safestr(v) for v in item])
        print str
Esempio n. 13
0
    # Normalise the redis port to int and pick the database settings, then
    # compare closing prices between two trading days and print one
    # tab-separated row per stock, ordered by vary_portion descending.
    config_info['REDIS']['port'] = int(config_info['REDIS']['port'])
    db_config = config_info['DB']

    # The two days are taken from the 2nd and 3rd command-line arguments.
    start_day = int(sys.argv[2])
    end_day = int(sys.argv[3])

    stock_list = get_stock_list(db_config, 1, 1)
    vary_list = []
    start_hq_data = get_stock_data(db_config, start_day)
    end_hq_data = get_stock_data(db_config, end_day)

    for sid in stock_list.keys():
        # Ignore stocks that were suspended, i.e. have no quote on either day.
        if sid not in start_hq_data or sid not in end_hq_data:
            continue

        stock_info = stock_list[sid]
        start_close_price = float(start_hq_data[sid]['close_price'])
        end_close_price = float(end_hq_data[sid]['close_price'])
        # Percentage drop from the start-day close to the end-day close
        # (positive when the price fell).
        # NOTE(review): raises ZeroDivisionError if start_close_price is 0 -
        # confirm that stored quotes never carry a zero close price.
        vary_portion = (start_close_price - end_close_price) / start_close_price * 100

        #print format_log("vary_stock", {'sid': sid, 'code': stock_info['code'], 'name': stock_info['name'], 'start_close_price': start_close_price, 'end_close_price': end_close_price, 'vary_portion': vary_portion})
        vary_list.append((sid, stock_info['code'], stock_info['name'], start_close_price, end_close_price, vary_portion))

    # Sort by size of the drop, largest first.
    vary_list.sort(key = lambda item :item[5], reverse=True)
    for item in vary_list:
        #print item
        # NOTE(review): this name shadows the builtin `str` for the rest of
        # the enclosing scope.
        str = "\t".join([safestr(v) for v in item])
        print str
Esempio n. 14
0
    def serialize(self, item):
        """Append the ticks of ``item`` to redis and refresh the price ranges.

        Each tick in ``item['items']`` gets its ``time`` converted to int and
        is pushed as JSON onto the redis list ``ts-<sid>-<day>``. For ticks
        whose slot (``time / 100``) is at least the last slot recorded for
        the stock, the slot's ``[highest, lowest]`` price pair in
        ``self.vary_map[sid]`` is widened; ``self.time_map[sid]`` is finally
        set to the newest slot seen.

        Args:
            item: mapping with ``sid``, ``day`` and ``items`` (a list of tick
                dicts providing ``time`` and ``price``).
        """
        sid = item['sid']
        key = "-".join(["ts", str(sid), str(item['day'])])

        last_time = self.time_map.setdefault(sid, 0)
        # The dict returned here is the one stored in vary_map, so changes
        # made through ts_map are visible in vary_map as well.
        ts_map = self.vary_map.setdefault(sid, dict())

        for ts_item in item['items']:
            ts_item['time'] = int(ts_item['time'])
            item_time = int(ts_item['time'] / 100)
            price = ts_item['price']
            self.redis_conn.rpush(key, json.dumps(ts_item))

            # Older slots are still pushed to redis above, but they no longer
            # influence the tracked price range.
            if item_time < last_time:
                continue

            if item_time in ts_map:
                price_pair = ts_map[item_time]
            else:
                price_pair = [0.0, float("inf")]
            price_pair[0] = max(price_pair[0], price)
            price_pair[1] = min(price_pair[1], price)

            last_time = max(item_time, last_time)
            ts_map[item_time] = price_pair

        # Record the newest slot and log the per-slot price ranges.
        self.time_map[sid] = last_time
        range_summary = "|".join([safestr(k) + "-" + safestr(v) for k, v in ts_map.items()])
        self.logger.debug("desc=refresh_ts sid=%d last_time=%d vary_map=%s", sid, last_time, range_summary)