Code example #1
File: JqkaSpider.py Project: minhpascal/stockcat
    def parse_stock(self, response):
        hxs = HtmlXPathSelector(response)
        #log.msg(response.body)
        item = StockItem()

        #print hxs.select('//div/h1')
        #print hxs.select('//div/h1/a/text()').extract()
        #print hxs.select('//div/h1/a/strong/text()').extract()
        item['name'] = hxs.select('//div/h1/a/strong/text()').extract()[0]
        item['code'] = hxs.select('//div/h1/a/text()').extract()[1].strip(
            " \t\n")
        #print item

        company_node = hxs.select('//dl[contains(@class, "company_details")]')
        strong_list = company_node.select('.//dd/strong/text()').extract()
        #print strong_list

        item['captial'] = float(strong_list[0])
        item['out_captial'] = float(strong_list[1])
        item['profit'] = float(strong_list[4])
        item['assets'] = float(strong_list[5])
        #print item

        company_url = "http://stockpage.10jqka.com.cn/" + item[
            'code'] + "/company/"
        request = Request(company_url, callback=self.parse_company)
        request.meta['item'] = item
        yield request
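A note on this and the following snippets: the StockItem class itself is never shown on this page. Below is a minimal sketch of what it could look like, with the field list inferred from the examples here (including the original "captial" spelling); this is an assumption, not the project's actual definition:

import scrapy

class StockItem(scrapy.Item):
    # Fields inferred from the snippets on this page; the original
    # spelling "captial" is kept so the code above runs unchanged.
    name = scrapy.Field()
    code = scrapy.Field()
    captial = scrapy.Field()      # total share capital
    out_captial = scrapy.Field()  # tradable share capital
    profit = scrapy.Field()       # earnings per share
    assets = scrapy.Field()       # net assets per share
    dividend = scrapy.Field()
    ecode = scrapy.Field()        # exchange code
    location = scrapy.Field()     # market/region flag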
Code example #2
    def parse_json(self, response):
        parts = response.body.split("=")
        content = safestr(parts[1].decode('gbk'))
        #print content

        data = json.loads(content)
        item_list = data['data']['result']
        print len(item_list)

        for info in item_list:
            item = StockItem()
            item['location'] = 3

            code = info[0]   
            code_parts = code.split(".")
            if len(code_parts) >= 2:
                ecode = code_parts[-1]
                if "N" == ecode:
                    item['ecode'] = "NYSE"
                elif "OQ" == ecode:
                    item['ecode'] = "NASDAQ" 

            item['name'] = info[2]
            item['code'] = info[1]

            stock_url = "http://stockhtm.finance.qq.com/astock/ggcx/" + code + ".htm"
            #print stock_url
            request = Request(stock_url, callback=self.parse_data)
            request.meta['item'] = item
            yield request
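safestr is called here and again in code example #10, but it is not defined anywhere on this page. A plausible Python 2-era sketch, assuming it merely normalizes a value to a UTF-8 byte string:

# Hypothetical helper; the real safestr is not shown in the source.
def safestr(value):
    if isinstance(value, unicode):  # Python 2 text type
        return value.encode('utf-8')
    return str(value)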
Code example #3
    def parse(self, response):

        #for sel in response.xpath('//div[@id="base_info"]//ul//li'):
        #        print sel.extract()
        #       # e.g. fetch the dividend yield and P/E ratio here
        #        item = StockItem()
        #        yield item
        res_td = r'<td>(.*?)</td>'
        for sel in response.xpath('//table[@id="financial_analysis"]//tr'):
            self.index = 0
            item = StockItem()
            link = response.url
            item["link"] = link
            item["num"] = link.split("/")[-1][:-5]
            for trinfo in sel.xpath(
                    'td[not(contains(@class, "showRedTips"))]'):
                self.index += 1
                if self.index == 6:
                    line = trinfo.extract()
                    m_td = re.findall(res_td, line, re.S | re.M)
                    item["peratio"] = m_td
                elif self.index == 10:
                    line = trinfo.extract()
                    m_td = re.findall(res_td, line, re.S | re.M)
                    item["roe"] = m_td
                elif self.index == 11:
                    line = trinfo.extract()
                    m_td = re.findall(res_td, line, re.S | re.M)
                    item["dividend"] = m_td
            if self.index == 11:
                yield item
            else:
                continue
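Keeping the column counter on the spider instance (self.index) works only because each row is processed synchronously inside one callback. A sketch of the same loop with a local counter instead, assuming the same imports (re) and the same StockItem as above:

    def parse(self, response):
        res_td = r'<td>(.*?)</td>'
        fields = {6: 'peratio', 10: 'roe', 11: 'dividend'}
        for tr in response.xpath('//table[@id="financial_analysis"]//tr'):
            item = StockItem()
            item['link'] = response.url
            item['num'] = response.url.split('/')[-1][:-5]
            cells = tr.xpath('td[not(contains(@class, "showRedTips"))]')
            for index, cell in enumerate(cells, start=1):
                if index in fields:
                    item[fields[index]] = re.findall(res_td, cell.extract(), re.S | re.M)
            if len(cells) == 11:  # same condition as self.index == 11 above
                yield item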
Code example #4
    def parse_page(self, response):
        try:
            item = StockItem()

            # stock name
            item['name'] = response.xpath(
                '//a[@class="bets-name"]/text()').extract()[0].strip()[:4]
            # closing price
            item['close_price'] = response.xpath(
                '//div[@class="line2"]//dl[1]/dd/text()').extract()[0]
            # opening price
            item['open_price'] = response.xpath(
                '//div[@class="line1"]//dl[1]/dd/text()').extract()[0]
            # current price
            item['cur_price'] = response.xpath('//strong/text()').extract()[0]
            # highest price
            item['highest_price'] = response.xpath(
                '//div[@class="line1"]/dl[3]/dd/text()').extract()[0]
            # lowest price
            item['lowest_price'] = response.xpath(
                '//div[@class="line2"]/dl[3]/dd/text()').extract()[0]
            # trading volume
            item['volume'] = response.xpath(
                '//div[@class="line1"]/dl[2]/dd/text()').extract()[0]
            # turnover rate
            item['change_rate'] = response.xpath(
                '//div[@class="line2"]/dl[2]/dd/text()').extract()[0]

            yield item
        except Exception:
            # Silently skip pages whose layout does not match the XPaths above.
            pass
Code example #5
    def handlePage(self, response):
        print("处理列表页面")
        trs = response.xpath("//tr")
        #open("test.html" , "wb+").writelines(trs).close()
        
        for tr in trs[1:]:
            # 股票代码 简称
            number = tr.xpath(".//a[@target='_blank']")[0].xpath('text()').extract()[0]
            name = tr.xpath(".//a[@target='_blank']")[1].xpath('text()').extract()[0]
            # 股票价格
            price = tr.xpath("./td[4]").xpath('text()').extract()[0]
            # 股票涨幅
            gains = tr.xpath("./td[5]").xpath('text()').extract()[0]
            # 股票换手率
            rate = tr.xpath("./td[8]").xpath('text()').extract()[0]
            # 股票量比
            thanCarrie = tr.xpath("./td[9]").xpath('text()').extract()[0]
            # 股票振幅
            amplitude = tr.xpath("./td[10]").xpath('text()').extract()[0]

            print(name)

            stock = StockItem()

            stock['stock_id'] = number
            stock['stock_name'] = name
            stock['stock_price'] = price
            stock['stock_gains'] = gains
            stock['stock_rate'] = rate
            stock['stock_thanCarrie'] = thanCarrie
            stock['stock_amplitude'] = amplitude
            stock['today'] = time.strftime("%Y-%m-%d", time.localtime())

            yield stock
Code example #6
    def parse(self, response):
        stock_codes = response.xpath('//*[@id="quotesearch"]/ul[1]/li')

        for title in stock_codes:
            # Create a fresh item per row: yielding one shared StockItem
            # instance would pass the same mutable object to the pipeline
            # for every stock code.
            item = StockItem()
            item['stock_code'] = title.xpath('.//a[@target="_blank"]/text()').extract()

            yield item
Code example #7
File: stockmysql.py Project: fc2009/papa
    def down_gu(self, response):
        # table = []
        print(response.url)
        tr_list = response.xpath('//table[@class="m-table"]/tbody/tr')
        for tr in tr_list:
            # Create a fresh item per row; a single StockItem built outside
            # the loop would be mutated and re-yielded on every iteration.
            stock_item = StockItem()
            content_list = tr.xpath('./td/text()').extract()
            content_list[1] = content_list[1].strip()
            # print(content_list)
            stock_item['xuhao'] = content_list[0]
            stock_item['jysj'] = content_list[1]
            stock_item['rz_ye'] = self.str2num(content_list[2])
            stock_item['rz_mre'] = self.str2num(content_list[3])
            stock_item['rz_che'] = self.str2num(content_list[4])
            stock_item['rz_rzjmr'] = self.str2num(content_list[5])
            stock_item['rq_ye'] = self.str2num(content_list[6])
            stock_item['rq_mre'] = self.str2num(content_list[7])
            stock_item['rq_che'] = self.str2num(content_list[8])
            stock_item['rq_rzjmr'] = self.str2num(content_list[9])
            stock_item['rzrqye'] = self.str2num(content_list[10])
            print(stock_item)
            yield stock_item

            # print('】,【'.join(content_list))
        #     table.append('|,|'.join(content_list))
        #     print(response.url)
        #     print(table)
        # with open('C:\\Users\\Administrator\\Desktop\\papa\\stock\\gupiao\\'+response.meta["gu_name"]+'.txt', 'a') as f:
        #     f.write('\n'.join(table)+'\n')
        #     f.close()
        # f.write('\n'.join(table)+'\n')
        # Many more pages are loaded dynamically; only the first three are taken here.
        if response.meta['index'] > 3:
            return

        response.meta['index'] += 1
        # Pattern: http://data.10jqka.com.cn/market/rzrqgg/code/000725/order/desc/page/2/ajax/1/
        page_url = 'http://data.10jqka.com.cn/market/rzrqgg/code/' + str(
            response.meta['gu_hao']) + '/order/desc/page/' + str(
                response.meta['index']) + '/ajax/1/'
        yield scrapy.Request(url=page_url,
                             callback=self.down_gu,
                             meta={
                                 "gu_name": response.meta['gu_name'],
                                 "gu_hao": response.meta['gu_hao'],
                                 "index": response.meta['index'],
                             })
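str2num is referenced above but not shown. A purely hypothetical sketch, assuming it strips thousands separators and common Chinese magnitude suffixes (亿 = 1e8, 万 = 1e4) before converting to float:

    def str2num(self, text):
        # Hypothetical helper; the project's real implementation is not shown.
        text = text.strip().replace(',', '')
        if text.endswith(u'亿'):  # hundreds of millions
            return float(text[:-1]) * 100000000
        if text.endswith(u'万'):  # tens of thousands
            return float(text[:-1]) * 10000
        try:
            return float(text)
        except ValueError:
            return 0.0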
Code example #8
    def parse(self, response):
        url = 'https://smart.tdcc.com.tw/opendata/getOD.ashx?id=1-5'
        data = pd.read_csv(url)

        self.copy2Hist()   # before the run, move current data to the history table
        self.clearTable()  # clear the working table
        # csv = data[data['資料日期'] == 15][:1]
        for index, row in data.iterrows():
            item = StockItem()
            item['data_date'] = row[0]          # data date (資料日期)
            item['stock_no'] = row[1].zfill(6)  # security code (證券代號)
            item['stock_num'] = row[4]          # number of shares (股數)
            item['level'] = row[2]              # shareholding level (持股分級)
            item['holder_num'] = row[3]         # number of holders (人數)
            item['percent'] = row[5]            # % of total deposited shares

            yield item
Code example #9
    def parse(self, response):

        td = datetime.date.today().strftime('%Y%m%d')
        f = open("holding_data/" + td + ".csv", "w")
        f.write("stock_id ;stock_name; holding_num; holding_perc \n")
        for sel in response.xpath("//tr").extract():

            # The original instantiated an unused StockItem here; the parsed
            # fields are written straight to the CSV file instead.
            info = Selector(text=sel).xpath("//td/text()").extract()
            if len(info) != 4:
                continue

            a = info[0].split("\r\n")[1]
            b = info[1].split("\r\n")[1]
            c = info[2].split("\r\n")[1]
            d = info[3].split("\r\n")[1]

            f.write(a + ";" + b + ";" + c + ";" + d + "\n")
        f.close()
Code example #10
    def parse_quotes(self, response):
        content = safestr(response.body)
        quotes_data = json.loads(content)

        for quote_info in quotes_data['quotes']:
            # already delisted
            if 3 == int(quote_info['flag']):
                print "op=stock_quit code=" + safestr(quote_info['symbol']) + " name=" + safestr(quote_info['name'])
                continue

            item = StockItem()
            item['location'] = 3

            item['code'] = quote_info['symbol']
            item['name'] = quote_info['name']
            stock_name = safestr(quote_info['name'])
            exchange = safestr(quote_info['exchange'])

            if exchange == "NASDAQ":
                item['ecode'] = 4
            elif exchange == "NYSE":
                item['ecode'] = 5
            else:   # ignore US stocks not listed on NASDAQ/NYSE
                #print quote_info
                print "op=stock_ignore code=" + safestr(quote_info['symbol']) + " name=" + stock_name + " exchange=" + exchange
                continue

            # total shares outstanding (stored in units of 100 million)
            if len(quote_info['totalShares']) > 0:
                item['out_captial'] = float(quote_info['totalShares']) / 100000000
            # dividend
            if len(quote_info['dividend']) > 0:
                item['dividend'] = float(quote_info['dividend'])
            # earnings per share
            if len(quote_info['eps']) > 0:
                item['profit'] = float(quote_info['eps'])
            # net assets per share
            if len(quote_info['net_assets']) > 0:
                item['assets'] = float(quote_info['net_assets'])

            #print item
            yield item    
Code example #11
File: ths.py Project: czj21212/stock-2
 def parse(self, response):
     print(response.url)
     div_list = response.xpath("//div[contains(@class,'category')]")  # list of top-level categories
     for div in div_list[1:2]:
         item = StockItem()
         item["b_cate"] = div.xpath("./div[@class='c_title']//h2/text()").extract_first()
         a_list = div.xpath(".//div[@class='option_group clearfix']/div")  # list of sub-categories
         for a in a_list:
             item["s_href"] = a.xpath("./a/@href").extract_first()
             item["s_cate"] = a.xpath("./a/text()").extract_first()
             if item["s_href"] is not None:
                 item["s_href"] = urljoin(response.url,item["s_href"])
                 yield scrapy.Request(
                     item["s_href"],
                     callback=self.parse_stock_list,
                     meta={"item":deepcopy(item),
                           'download_timeout': 10,
                           },
                     dont_filter=True
                 )
Code example #12
    def parse_stock(self, response):
        sItem = StockItem()
        try:
            stockInfo = response.css('.stock-bets')
            betsname = stockInfo.css('.bets-name')
            sItem['code'] = betsname.css('span::text').extract_first()
            sItem['name'] = betsname.css(
                '::text').extract_first()[1:-1].strip()
            state = stockInfo.css('span')[1]
            sItem['trade_date'] = state.re_first(r"\d{4}-\d{2}-\d{2}")
            sItem['time'] = state.re_first(r"\d{2}:\d{2}:\d{2}")
            #txtState = state.re_first("\">(.*?) ")
            #print(txtState)
            sItem['close'] = stockInfo.css('._close::text').extract_first(
            ) if sItem['time'][0:2] == '15' else None  # hour 15 = after the close
            sItem['last'] = stockInfo.css('strong::text').extract_first()
            valueList = stockInfo.css('dd')
            sItem['open'] = valueList[0].css('::text').extract_first()
            sItem['volume'] = valueList[1].css('::text').extract_first()[0:-2]
            sItem['high'] = valueList[2].css('::text').extract_first()
            sItem['limit_up'] = valueList[3].css('::text').extract_first()
            sItem['turnover'] = valueList[5].css(
                '::text').extract_first()[0:-1]
            sItem['pe'] = valueList[8].css('::text').extract_first()
            sItem['total_equity'] = valueList[10].css(
                '::text').extract_first()[0:-1]
            sItem['preclose'] = valueList[11].css('::text').extract_first()
            sItem['turnover_rate'] = valueList[12].css(
                '::text').extract_first()[0:-1]
            sItem['low'] = valueList[13].css('::text').extract_first()
            sItem['limit_down'] = valueList[14].css(
                '::text').extract_first()[1:].lstrip()
            sItem['volume_ratio'] = valueList[17].css(
                '::text').extract_first()[0:-1]
            sItem['flow_equity'] = valueList[21].css(
                '::text').extract_first()[0:-1]

        except Exception:
            # A layout mismatch leaves sItem partially filled; it is still yielded below.
            pass

        yield sItem
Code example #13
    def parse(self, response):
        self.count = 0

        for sel in response.xpath(
                '//li//a[contains(@href, "http://quote.eastmoney.com/hk")]/@href'
        ):
            #debug
            if self.count >= 10000:
                continue
            #debug
            link = sel.extract()
            num = link.split("/")[-1].split(".")[0]
            if num.isdigit():
                num = int(num)
            else:
                continue
            item = StockItem()
            item['num'] = num
            item['link'] = link
            self.count += 1
            yield item
Code example #14
 def parse(self, response):
     sel = Selector(response)
     stocks = sel.xpath("//tbody/tr")
     items = []
     for stock in stocks:
         item = StockItem()
         item['code'] = stock.xpath("td[2]/a/text()")[0].extract()
         item['link'] = stock.xpath("td[2]/a[1]/@href")[0].extract()
         item['name'] = stock.xpath("td[3]/a/text()")[0].extract()
         item['curprice'] = stock.xpath("td[4]/text()")[0].extract()
         item['change_rate'] = stock.xpath("td[5]/text()")[0].extract()
         item['change_price'] = stock.xpath("td[6]/text()")[0].extract()
         item['up_speed'] = stock.xpath("td[7]/text()")[0].extract()
         item['turnover_rate'] = stock.xpath("td[8]/text()")[0].extract()
         item['ratio'] = stock.xpath("td[9]/text()")[0].extract()
         item['amplitude'] = stock.xpath("td[10]/text()")[0].extract()
         item['turnover_volume'] = stock.xpath("td[11]/text()")[0].extract()
         item['tradable_shares'] = stock.xpath("td[12]/text()")[0].extract()
         item['circulation_market_value'] = stock.xpath(
             "td[13]/text()")[0].extract()
         item['P_E_ratio'] = stock.xpath("td[14]/text()")[0].extract()
         items.append(item)
     return items
Code example #15
File: flush.py Project: NeoGeon/baseSpider
 def parse(self, response):
     text = response.text
     soup = BeautifulSoup(text, 'lxml')
     print 'cur', self.curr_stock_code
     start_time = soup.find('input', {
         'name': 'date_start_type'
     }).get('value').replace('-', '')
     end_time = soup.find('input', {
         'name': 'date_end_type'
     }).get('value').replace('-', '')
     file_item = StockItem()
     if len(self.curr_stock_code) > 0:
         stock_code_a = str(self.curr_stock_code)
         new_stock_code = None
         if int(stock_code_a[0]) in (0, 2, 3, 6, 9):
             # The 163.com quote service prefixes Shanghai codes (leading 6/9)
             # with '0' and Shenzhen codes (leading 0/2/3) with '1'.
             if int(stock_code_a[0]) in [6, 9]:
                 new_stock_code = '0' + stock_code_a
             if int(stock_code_a[0]) in [0, 2, 3]:
                 if not int(stock_code_a[0:3]) in (201, 202, 203, 204):
                     new_stock_code = '1' + stock_code_a
         if new_stock_code:  # guard: the original left this name unbound for unmatched codes
             download_url = 'http://quotes.money.163.com/service/chddata.html?code={}&start={}&end={}&fields=TCLOSE;HIGH;LOW;TOPEN;LCLOSE;CHG;PCHG;TURNOVER;VOTURNOVER;VATURNOVER;TCAP;MCAP'.format(
                 new_stock_code, start_time, end_time)
             file_item['file_urls'] = [download_url]
     yield file_item
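file_urls is the field consumed by Scrapy's built-in FilesPipeline, so this snippet only downloads anything if the project enables that pipeline. A minimal settings sketch (the FILES_STORE path is a placeholder, and StockItem is assumed to declare both file_urls and files fields):

# settings.py
ITEM_PIPELINES = {
    'scrapy.pipelines.files.FilesPipeline': 1,
}
FILES_STORE = './downloads'  # placeholder directory for the fetched CSV files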
Code example #16
File: stockAnaly.py Project: github4n/stock-2
    def parse(self, response):
        #with open("stock.txt", "w") as f:
        #f.write(response.text)
        #//div[@class ='em graph alignCenter']/a
        #a_list = response.xpath('//*[@id="tableWrap"]/div[2]/div/div[2]/div/table/tr[1]/td[3]/div')
        '''
        te = open("D:/Works/PycharmProjects/stock/view-source.txt", 'rt', encoding='UTF8')
        file_text = te.read()        
        te.close()
        response = response.replace(body = file_text)
        '''
        item = StockItem()
        nodeList = response.xpath(
            '//*[@id="tableWrap"]/div[2]/div/div[1]/div/div/div[1]/ul/li')

        retryCount = 0
        if (len(nodeList) == 0):  # no data was scraped
            time.sleep(1)
            retryCount = retryCount + 1
            '''
            yield scrapy.FormRequest(
                url = self.start_urls[0], 
                headers = self.headers, 
                cookies = self.cookies,
                method = 'GET',
                meta={},
                formdata = self.querystring, 
                callback = self.parse,
                errback = self.error,
                dont_filter = True
            )
            '''
        else:
            #attrList = response.xpath('//*[@id="tableWrap"]/div[2]/div/div[1]/div/div/div[1]/ul/li/div[starts-with(@class,"em")]').extract()
            #//*[@id="tableWrap"]/div[2]/div/div[1]/div/div/div[1]/ul/li[30]/dl/dt/div/span[1]
            #item['code'] = response.xpath('//*[@id="tableWrap"]/div[2]/div/div[2]/div/div/ul/li[3]/div/text()').extract()
            #item['name'] = response.xpath('//*[@id="tableWrap"]/div[2]/div/div[2]/div/div/ul/li[4]/div/text()').extract()

            start = time.clock()
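            # Note: time.clock() was removed in Python 3.8; on current Python
            # these timing probes would use time.perf_counter() instead.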

            #testList = response.xpath('//*[@id="tableWrap"]/div[2]/div/div[1]/div/div/div[2]/table/tbody/tr[*]')
            #text = '/td[{1}]/div/'
            #test11 = testList.xpath(text+'a/text()' + '|' + text+'text()' + '|' + text+'span[1]/text()').extract()

            end = time.clock()
            print(str(end - start) + '\n')
            start = end

            item['minLen'] = 10000
            item['code'] = response.xpath(
                '//div[@id="tableWrap"]/div[2]/div/div[2]/div/table/tbody/tr[*]/td[3]/div/text()'
            ).extract()
            end = time.clock()
            print(str(end - start) + '\n')
            start = end
            item['name'] = response.xpath(
                '//div[@id="tableWrap"]/div[2]/div/div[2]/div/table/tbody/tr[*]/td[4]/div/a/text()'
            ).extract()
            end = time.clock()
            print(str(end - start) + '\n')
            start = end

            col = 1
            for node in nodeList:
                colNum = 0
                if (len(node.xpath('./dl'))):  # this attribute is split across sub-columns
                    colNum = len(node.xpath('./dl/dd')) - 1
                    if (len(node.xpath('./dl/dt/div/span'))):
                        attr = node.xpath(
                            './dl/dt/div/span[1]/text()').extract()[0]
                    else:
                        attr = node.xpath('./dl/dt/div/text()').extract()[0]
                else:
                    if (len(node.xpath('./div[1]/span'))):  # has an explanation tooltip
                        attr = node.xpath(
                            './div[1]/span[1]/text()').extract()[0]
                    else:
                        attr = node.xpath('./div[1]/text()').extract()[0]

                end = time.clock()
                print(str(end - start) + '\n')
                start = end

                string = attrStrCmp(attr)
                print(attr + ' ')
                print(string + '\n')
                #print(type(attr), len(attr))
                #print(type(string), len(string))

                text = '//div[@class="scroll_tbody_con"]/table/tbody/tr[*]/td[{:d}]/div/'.format(
                    col)
                '''
                ddd  = response.xpath(text+'a')
                if(len(response.xpath(text+'a'))):#有链接
                    item[string] = response.xpath(text+'a/text()').extract()
                else:
                    item[string] = response.xpath(text+'text()').extract()
                '''
                item[string] = response.xpath(text + 'a/text()' + '|' + text +
                                              'text()' + '|' + text +
                                              'span[1]/text()').extract()
                col = col + 1 + colNum
                end = time.clock()
                print(str(end - start) + '\n')
                start = end

                item['minLen'] = min(item['minLen'], len(item[string]))

            self.f = open("D:/share/自由流通市值.txt", "w")
            for i in range(len(item['code'])):
                self.f.write(item['code'][i] + ' ' + item['name'][i] + ' ' +
                             item['自由流通股'][i] + ' ' + item['股性评分'][i] + '\n')
            self.f.write('\n')
            self.f.close()
            '''
            self.f = open("D:/share/result11.txt", "a+")
            print(len(item['code']))
            for i in range(len(item['code'])):
                self.f.write(item['code'][i] + '\n')
            print(len(item['name']))
            for i in range(len(item['name'])):
                self.f.write(item['name'][i] + '\n')
            print(len(item['liuTongShiZhi']))
            for i in range(len(item['liuTongShiZhi'])):
                self.f.write(item['liuTongShiZhi'][i] + '\n')
            self.f.close()

            lists = []
            for i in range(len(item['code'])):
                
                #list = [item['code'][i], item['name'][i], item['股流通市值'][i], item['大单净量'][i], item['换手率'][i], item['涨跌幅'][i], item['量比'][i], \
                #item['现价'][i], item['振幅'][i], item['机构动向'][i], item['股性评分'][i], item['大单净额'][i], item['主力资金流向'][i], item['中单净额'][i], \
                #item['小单净额'][i], item['净利润同比增长率'][i], item['净利润'][i]]
                
                list = []
                list.append(item['code'][i])
                list.append(item['name'][i])
                list.append(item['liuTongShiZhi'][i])
                list.append(item['大单净量'][i])
                list.append(item['换手率'][i])
                list.append(item['涨跌幅'][i])
                list.append(item['量比'][i])
                list.append(item['现价'][i])
                list.append(float(item['主动买入比'][i]) - float(item['主动卖出比'][i]))
                list.append(float(item['大单买入比'][i]) - float(item['大单卖出比'][i]))
                list.append(item['振幅'][i]) #10
                list.append(item['机构动向'][i])
                list.append(item['股性评分'][i])
                list.append(item['大单净额'][i])
                list.append(item['主力资金流向'][i])
                list.append(item['中单净额'][i])
                list.append(item['小单净额'][i])
                #list.append(item['净利润同比增长率'][i])            
                list.append(item['净利润'][i])
                #
                lists.append(list)
            lists.sort(key = choose_zhuDongMaiBi, reverse = True)    
            
            print(str(len(item['code']))+'\n')
            print(str(len(item['name']))+'\n')
            print(str(len(item['a股流通市值']))+'\n')
            print(str(len(item['dde大单净量']))+'\n')
            print(str(len(item['换手率']))+'\n')
            print(str(len(item['涨跌幅前复权']))+'\n')
            print(str(len(item['量比']))+'\n')
            #print(str(len(item['现价']))+'\n')
            print(str(len(item['主动买入比']))+'\n')
            print(str(len(item['主动卖出比']))+'\n')
            print(str(len(item['振幅']))+'\n')
            print(str(len(item['机构动向']))+'\n')
            print(str(len(item['股性评分']))+'\n')
            print(str(len(item['dde大单净额']))+'\n')
            print(str(len(item['主力资金流向']))+'\n')
            print(str(len(item['中单净额']))+'\n')
            print(str(len(item['小单净额']))+'\n')
            print(str(len(item['净利润同比增长率']))+'\n')
            print(str(len(item['净利润']))+'\nyield\n')
            '''
            yield item

            i = 1
Code example #17
    def parse(self, response):
        page = response.body
        testCss = response.css('#datalist tr')
        for t in testCss:
            # create the item
            item = StockItem()
            # stock code
            stockId = t.css('td a::text').extract()[0]
            # short name
            Abbreviation = t.css('td a::text').extract()[1]
            # latest price
            latestPrice = t.css('td span::text').extract()[0]
            latestPrice = float(latestPrice)
            # change (%)
            quoteChange = t.css('td span::text').extract()[1]
            quoteChange = round(float(quoteChange.strip('%')) / 100, 4)
            # change (absolute)
            amountChange = t.css('td span::text').extract()[2]
            amountChange = float(amountChange)
            # 5-minute gain
            increase = t.css('td span::text').extract()[3]
            increase = round(float(increase.strip('%')) / 100, 4)
            # trading volume
            volume = t.css('td::text').extract()[0]
            volume = float(volume)
            # turnover amount
            turnover = t.css('td::text').extract()[1]
            turnover = float(turnover)
            # turnover rate
            handTurnoverRate = t.css('td::text').extract()[2]
            handTurnoverRate = round(
                float(handTurnoverRate.strip('%')) / 100, 4)
            # amplitude
            amplitude = t.css('td::text').extract()[3]
            amplitude = round(float(amplitude.strip('%')) / 100, 4)
            # volume ratio
            volumeRatio = t.css('td::text').extract()[4]
            volumeRatio = float(volumeRatio)
            # committee ratio (委比)
            commission = t.css('td::text').extract()[5]
            commission = float(commission)
            # P/E ratio
            PERatio = t.css('td::text').extract()[6]
            if (PERatio == '--'):
                PERatio = float("0")
            else:
                PERatio = float(PERatio)

            item['stockId'] = stockId
            item['Abbreviation'] = Abbreviation
            item['latestPrice'] = latestPrice
            item['quoteChange'] = quoteChange
            item['amountChange'] = amountChange
            item['increase'] = increase
            item['volume'] = volume
            item['turnover'] = turnover
            item['handTurnoverRate'] = handTurnoverRate
            item['amplitude'] = amplitude
            item['volumeRatio'] = volumeRatio
            item['commission'] = commission
            item['PERatio'] = PERatio

            # text = stockId+"   "+Abbreviation
            # print(text)
            yield item
Code example #18
    def aNewPage(self, response):
        print('Crawling info from ' + response.url)
        # At this point the scraped data is stored into the item.
        # Import it (from news.items import NewsItem) and create a new item object.
        item = StockItem()
        # Inspect the XPath of the needed fields on the page, then store them
        # into the corresponding item fields.
        item['url'] = [response.url]
        a = item['url']
        # print(item['url'][:1])
        # b='stock.10'
        # c='yicai'
        # d='21jingji'
        e = 'weixin'
        # f='jiemian'
        # g='wabei'
        # h='sohu'
        # if b in a[0]:
        #     print(a[0])
        #     print('nih')
        #     item['title'] = response.xpath('//h2[@class="main-title"]/text()').extract()
        #     print(item['title'])
        #     content = response.xpath('//div[@class="main-text atc-content"]/p/text()').extract()
        #     # Note: content comes as multiple <p> segments and must be merged.
        #     item['content'] = ["\n".join(content)][0].replace(u'\u3000', u' ')
        #     print(item['content'] )

        # elif c in a[0]:
        #     print(c)
        #     item['title'] = response.xpath('//div[@class="m-list7 f-white"]/h1/text()').extract()
        #     print(item['title'])
        #     content = response.xpath('//div[@class="txt"]//p/text()').extract()
        #     # Note: content comes as multiple <p> segments and must be merged.
        #     item['content'] = ["\n".join(content)][0].replace(u'\u3000', u' ')
        #     print(item['content'] )
        #
        # elif d in a[0]:
        #     print(d)
        #     item['title'] = response.xpath('//div[@class="titlehead"]/h1/text()').extract()
        #     print(item['title'])
        #     content = response.xpath('//div[@class="txtContent"]//p/text()').extract()
        #     # Note: content comes as multiple <p> segments and must be merged.
        #     item['content'] = ["\n".join(content)][0].replace(u'\u3000', u' ')
        #     print(item['content'] )

        if e in a[0]:
            print(e)
            item['title'] = response.xpath(
                '//h2[@class="rich_media_title"]/text()').extract()
            print(item['title'])
            content = response.xpath(
                '//div[@class="rich_media_content"]/span/text()').extract()
            # Note: content comes as multiple <p> segments and must be merged.
            item['content'] = ["\n".join(content)][0].replace(u'\u3000', u' ')
            print(item['content'])
        #
        # elif f in a[0]:
        #     print(f)
        #     item['title'] = response.xpath('//div[@class="article-header"]/h1/text()').extract()
        #     print(item['title'])
        #     content = response.xpath('//div[@class="article-content"]/p/text()').extract()
        #     # Note: content comes as multiple <p> segments and must be merged.
        #     item['content'] = ["\n".join(content)][0].replace(u'\u3000', u' ')
        #     print(item['content'] )
        #
        # elif g in a[0]:
        #     print(g)
        #     item['title'] = response.xpath('//div[@class="subject"]/h1/text()').extract()
        #     print(item['title'])
        #     content = response.xpath('//div[@class="subject-content"]/p/text()').extract()
        #     # Note: content comes as multiple <p> segments and must be merged.
        #     item['content'] = ["\n".join(content)][0].replace(u'\u3000', u' ')
        #     print(item['content'] )
        #
        # elif h in a[0]:
        #     print(h)
        #     item['title'] = response.xpath('//div[@class="text-title"]/h1/text()').extract()
        #     print(item['title'])
        #     content = response.xpath('//article[@class="article"]/p/text()').extract()
        #     # Note: content comes as multiple <p> segments and must be merged.
        #     item['content'] = ["\n".join(content)][0].replace(u'\u3000', u' ')
        #     print(item['content'] )

        # item['time'] = response.xpath('//div[@class="artical-info"]/span/a/span/text()').extract()
        # item['source'] = response.xpath('//div[@class="artical-info"]/span/span/a/text()').extract()
        # item['img'] = response.xpath('//div[@class="artical-importantPic"]/img/@src').extract()
        # content = response.xpath('//div[@class="main-text atc-content"]/p/text()').extract()
        # Note: content comes as multiple <p> segments and must be merged.
        # item['content'] = ["\n".join(content)][0].replace(u'\u3000',u' ')
        # The scraped data is handed to the pipelines for processing.
        yield item
Code example #19
    def parse_taichinh(self, response):
        print(11111)
        item = StockItem()
        if response.body:
            re_mack = re.match(r'.*&scode=(.*)&bizType.*', response.url)
            item['mack'] = re_mack.group(1)
            i = 1
            for sel in response.xpath("//tr[@class='BR_rowHeader']/td[position() > 1]"):
                item['date'] = sel.xpath('./text()').extract_first('').strip()

                item['dthuthuanve_bh_va_ccdv'] = response.xpath(
                    "//tr[@id='2216']/td[{}]/text()".format(i + 2)).extract_first('').strip()
                item['giavon_bh'] = response.xpath("//tr[@id='2207']/td[{}]/text()".format(i + 2)).extract_first('').strip()
                item['loinhuangopve_bh_va_ccdv'] = response.xpath(
                    "//tr[@id='2217']/td[{}]/text()".format(i + 2)).extract_first('').strip()
                item['dthuhoatdong_tc'] = response.xpath("//tr[@id='2221']/td[{}]/text()".format(i + 2)).extract_first(
                    '').strip()
                item['chiphi_tc'] = response.xpath("//tr[@id='2222']/td[{}]/text()".format(i + 2)).extract_first('').strip()
                item['chiphi_bh'] = response.xpath("//tr[@id='2227']/td[{}]/text()".format(i + 2)).extract_first('').strip()
                item['chiphi_qldn'] = response.xpath("//tr[@id='2224']/td[{}]/text()".format(i + 2)).extract_first(
                    '').strip()
                item['loinhuanthuanve_hdkd'] = response.xpath("//tr[@id='2208']/td[{}]/text()".format(i + 2)).extract_first(
                    '').strip()
                item['loinhuankhac'] = response.xpath("//tr[@id='2209']/td[{}]/text()".format(i + 2)).extract_first(
                    '').strip()
                item['phanloinhuantucty_lkkd'] = response.xpath(
                    "//tr[@id='2210']/td[{}]/text()".format(i + 2)).extract_first('').strip()
                item['tongloinhuanketoantrcthue'] = response.xpath(
                    "//tr[@id='2211']/td[{}]/text()".format(i + 2)).extract_first('').strip()
                item['loinhuansauthue_tndn'] = response.xpath("//tr[@id='2212']/td[{}]/text()".format(i + 2)).extract_first(
                    '').strip()
                item['loinhuansauthue_cuacongdongctyme'] = response.xpath(
                    "//tr[@id='2214']/td[{}]/text()".format(i + 2)).extract_first('').strip()
                item['lai_cb_tren_cp'] = response.xpath("//tr[@id='2215']/td[{}]/text()".format(i + 2)).extract_first(
                    '').strip()

                item['ts_nganhan'] = response.xpath("//tr[@id='3000']/td[{}]/text()".format(i + 2)).extract_first(
                    '').strip()
                item['tienvacackhoan_td_tien'] = response.xpath(
                    "//tr[@id='3003']/td[{}]/text()".format(i + 2)).extract_first('').strip()
                item['cackhoandautu_tcnh'] = response.xpath("//tr[@id='3004']/td[{}]/text()".format(i + 2)).extract_first(
                    '').strip()
                item['cackhoanphaithunganhan'] = response.xpath(
                    "//tr[@id='3005']/td[{}]/text()".format(i + 2)).extract_first('').strip()
                item['hangtonkho'] = response.xpath("//tr[@id='3006']/td[{}]/text()".format(i + 2)).extract_first(
                    '').strip()
                item['taisannganhankhac'] = response.xpath("//tr[@id='3007']/td[{}]/text()".format(i + 2)).extract_first(
                    '').strip()
                item['taisandaihan'] = response.xpath("//tr[@id='3001']/td[{}]/text()".format(i + 2)).extract_first(
                    '').strip()
                item['taisancodinh'] = response.xpath("//tr[@id='3009']/td[{}]/text()".format(i + 2)).extract_first(
                    '').strip()
                item['bds_dautu'] = response.xpath("//tr[@id='3010']/td[{}]/text()".format(i + 2)).extract_first('').strip()
                item['cackhoandautu_tcdh'] = response.xpath("//tr[@id='3011']/td[{}]/text()".format(i + 2)).extract_first(
                    '').strip()
                item['tongcongtaisan'] = response.xpath("//tr[@id='2996']/td[{}]/text()".format(i + 2)).extract_first(
                    '').strip()
                item['nophaitra'] = response.xpath("//tr[@id='2997']/td[{}]/text()".format(i + 2)).extract_first('').strip()
                item['nonganhan'] = response.xpath("//tr[@id='3014']/td[{}]/text()".format(i + 2)).extract_first('').strip()
                item['nodaihan'] = response.xpath("//tr[@id='3017']/td[{}]/text()".format(i + 2)).extract_first('').strip()
                item['vochusohuu'] = response.xpath("//tr[@id='2998']/td[{}]/text()".format(i + 2)).extract_first(
                    '').strip()
                item['vondautucua_csh'] = response.xpath("//tr[@id='3063']/td[{}]/text()".format(i + 2)).extract_first(
                    '').strip()
                item['thangduvon_cp'] = response.xpath("//tr[@id='3064']/td[{}]/text()".format(i + 2)).extract_first(
                    '').strip()
                item['loinhuansauthue_chuapp'] = response.xpath(
                    "//tr[@id='3072']/td[{}]/text()".format(i + 2)).extract_first('').strip()
                item['loiichcuacongdongthieuso'] = response.xpath(
                    "//tr[@id='3002']/td[{}]/text()".format(i + 2)).extract_first('').strip()
                item['tongcongnguonvon'] = response.xpath("//tr[@id='2999']/td[{}]/text()".format(i + 2)).extract_first(
                    '').strip()

                item['eps'] = response.xpath("//tr[@id='53']/td[{}]/text()".format(i + 3)).extract_first('').strip()
                item['bvps'] = response.xpath("//tr[@id='54']/td[{}]/text()".format(i + 3)).extract_first('').strip()
                item['p_e'] = response.xpath("//tr[@id='55']/td[{}]/text()".format(i + 3)).extract_first('').strip()
                item['p_b'] = response.xpath("//tr[@id='57']/td[{}]/text()".format(i + 3)).extract_first('').strip()
                item['tisuatloinhuangopbien'] = response.xpath("//tr[@id='41']/td[{}]/text()".format(i + 3)).extract_first(
                    '').strip()
                item['tisuatsinhloitrendoanhthuthuan'] = response.xpath(
                    "//tr[@id='44']/td[{}]/text()".format(i + 3)).extract_first('').strip()
                item['roea'] = response.xpath("//tr[@id='45']/td[{}]/text()".format(i + 3)).extract_first('').strip()
                item['roaa'] = response.xpath("//tr[@id='47']/td[{}]/text()".format(i + 3)).extract_first('').strip()
                item['tisothanhtoanhienhanh'] = response.xpath("//tr[@id='4']/td[{}]/text()".format(i + 3)).extract_first(
                    '').strip()
                item['khanangthanhtoanvaylai'] = response.xpath("//tr[@id='5']/td[{}]/text()".format(i + 3)).extract_first(
                    '').strip()
                item['tisonotrentrongtaisan'] = response.xpath("//tr[@id='8']/td[{}]/text()".format(i + 3)).extract_first(
                    '').strip()
                item['tisonotrenvonchusohuu'] = response.xpath("//tr[@id='11']/td[{}]/text()".format(i + 3)).extract_first(
                    '').strip()

                yield item
                i += 1  # advance to the next date column