# Shared imports for the spider methods below; the items module path is
# project-specific and may need adjusting.
import re
from datetime import datetime

from scrapy import Request
from scrapy.selector import Selector

from guba.items import GubaItem  # assumption: adjust to this project's items module


def parse(self, response):
    guba_type = response.meta['type']
    hxs = Selector(response)
    # Individual-stock forums
    if guba_type == 1:
        stocks = hxs.xpath(
            '//div[@class="ngbggulbody list clearfix"]//li/a').extract()
        # Scrape each stock forum's URL and name
        for stock in stocks:
            m_stocks = re.search(r'href="(.+)">(.+)</a', stock)
            if m_stocks:
                item = GubaItem()
                item['content'] = {}
                url_stock = "http://guba.eastmoney.com/" + m_stocks.group(1)
                item['content']['guba_url'] = url_stock
                item['content']['guba_name'] = m_stocks.group(2)
                # url_stock looks like http://guba.eastmoney.com/list,000766.html
                yield Request(url=url_stock,
                              meta={'item': item},
                              callback=self.parse_page_num)
    # Topic forums (2), industry forums (3) and concept forums (4) share the
    # same anchor structure; only the list container differs.
    elif guba_type in (2, 3, 4):
        xpaths = {
            2: '//div[@class="allzhutilistb"]/ul/li/a',
            3: '//ul[@class="ngblistitemul"]/li/a',
            4: '//ul[@class="ngblistitemul"]/li/a',
        }
        stocks = hxs.xpath(xpaths[guba_type]).extract()
        for stock in stocks:
            m_stocks = re.search(r'href="(.+)">(.+)</a', stock)
            if m_stocks:  # guard added: skip anchors the regex cannot parse
                item = GubaItem()
                item['content'] = {}
                url_stock = "http://guba.eastmoney.com/" + m_stocks.group(1)
                item['content']['guba_url'] = url_stock
                item['content']['guba_name'] = m_stocks.group(2)
                yield Request(url=url_stock,
                              meta={'item': item},
                              callback=self.parse_page_num)
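
# Standalone sketch (not part of the spider): how the href regex above pulls
# the relative URL and the display name out of an extracted <a> element. The
# sample anchor string is hypothetical.
import re

sample_anchor = '<li><a href="list,000766.html">通化金马吧</a></li>'
m = re.search(r'href="(.+)">(.+)</a', sample_anchor)
if m:
    print(m.group(1))  # list,000766.html
    print(m.group(2))  # 通化金马吧
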
def parse(self, response):
    hxs = Selector(response)
    posts = hxs.xpath('//div[@class="articleh"]').extract()
    for post in posts:
        item = GubaItem()
        item['content'] = {}
        readnum = Selector(
            text=post).xpath('//span[@class="l1"]/text()').extract()
        if readnum:
            readnum = readnum[0]
        replynum = Selector(
            text=post).xpath('//span[@class="l2"]/text()').extract()
        if replynum:
            replynum = replynum[0]
        url = Selector(
            text=post).xpath('//span[@class="l3"]/a/@href').extract()
        if url:
            url = url[0]
        # The forum id is embedded in list URLs such as .../list,000766.html
        guba_id = re.search(r',(.+)\.html', response.url).group(1)
        # Keep only posts that belong to this forum
        if str(guba_id) in str(url):
            m_stock = re.search(r'(^/.+)', url)
            if m_stock:
                post_url = "http://guba.eastmoney.com" + m_stock.group(1)
                post_id = re.search(r'/(n.+)\.html', url).group(1)
                item['content']['readnum'] = readnum
                item['content']['replynum'] = replynum
                item['content']['post_id'] = post_id
                yield Request(url=post_url,
                              meta={'item': item, 'replynum': replynum},
                              callback=self.parse_post)
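
# Standalone sketch (not part of the spider): the two URL regexes above,
# applied to hypothetical list/post URLs of the shapes this method expects.
import re

list_url = 'http://guba.eastmoney.com/list,000766.html'
guba_id = re.search(r',(.+)\.html', list_url).group(1)
print(guba_id)  # 000766

post_href = '/news,000766,761123456.html'  # hypothetical relative post link
post_url = 'http://guba.eastmoney.com' + re.search(r'(^/.+)', post_href).group(1)
post_id = re.search(r'/(n.+)\.html', post_href).group(1)
print(post_url)  # http://guba.eastmoney.com/news,000766,761123456.html
print(post_id)   # news,000766,761123456
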
def parse(self, response):
    try:
        if response.status == 200:
            hxs = Selector(response)
            reply_author_url = response.meta['reply_author_url']
            item = GubaItem()
            item['content'] = {}
            reply_author_name = hxs.xpath(
                '//div[@class="taname"]/text()').extract()[0]
            item['content']['reply_author_name'] = reply_author_name.strip()
            # The sign-up date sits inside the "influence" block, wrapped in
            # a grey (#999) span as "(YYYY-MM-DD)"
            sign_up_time = hxs.xpath('//div[@id="influence"]').extract()[0]
            sign_up_time = re.search(r'999;">\((.+)\)</span',
                                     sign_up_time).group(1).strip()
            sign_up_time = datetime.strptime(sign_up_time, "%Y-%m-%d")
            item['content']['sign_up_time'] = sign_up_time
            item['content']['reply_author_url'] = reply_author_url
            yield item
    except Exception as ex:
        self.logger.warning('Parse Exception: %s %s' % (str(ex), response.url))
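
# Standalone sketch (not part of the spider): extracting and parsing the
# sign-up date from the "influence" block. The HTML fragment is hypothetical,
# reconstructed from the regex the method uses.
import re
from datetime import datetime

fragment = '<div id="influence">影响力 <span style="color:#999;">(2015-06-01)</span></div>'
raw = re.search(r'999;">\((.+)\)</span', fragment).group(1).strip()
print(datetime.strptime(raw, "%Y-%m-%d"))  # 2015-06-01 00:00:00
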
def parse(self, response):
    try:
        if response.status == 200:
            # Post pages come in several encodings; try them in order.
            filter_body = None
            for encoding in ('utf8', 'gbk', 'gb2312'):
                try:
                    filter_body = response.body.decode(encoding)
                    break
                except UnicodeDecodeError:
                    continue
            if filter_body is None:
                print("Decode webpage failed: " + response.url)
                return
            # Strip malformed upper-case tags that break the selector.
            filter_body = re.sub('<[A-Z]+[0-9]*[^>]*>|</[A-Z]+[^>]*>', '',
                                 filter_body)
            response = response.replace(body=filter_body)
            hxs = Selector(response)
            item = GubaItem()
            dt = hxs.xpath('//div[@class="zwfbtime"]/text()').extract()[0]
            dt = re.search(r'\D+(\d{4}-\d{2}-.+:\d{2}).+', dt).group(1)
            create_time = datetime.strptime(dt, "%Y-%m-%d %H:%M:%S")
            item['content'] = {}
            item['content']['create_time'] = create_time
            try:
                # Poster is a registered member: a profile link is present
                author_url = hxs.xpath(
                    '//div[@id="zwconttbn"]/strong/a/@href').extract()[0]
                item['content']['author_url'] = author_url
            except Exception:
                # Poster is not a registered member: only a grey name span
                author = hxs.xpath(
                    '//div[@id="zwconttbn"]//span').extract()[0]
                author = re.search(r'gray">(.+)</span', author).group(1)
                item['content']['author'] = author
            try:
                # Regular posts
                postcontent = hxs.xpath(
                    '//div[@id="zwconbody"]/div[@class="stockcodec"]/text()'
                ).extract()[0].strip()
                if postcontent:
                    item['content']['content'] = postcontent
                postitle = hxs.xpath(
                    '//div[@class="zwcontentmain"]/div[@id="zwconttbt"]/text()'
                ).extract()[0].strip()
                item['content']['title'] = postitle
            except Exception:
                # Q&A posts
                try:
                    postcontent = hxs.xpath(
                        '//div[@class="qa"]//div[contains(@class,"content")]/text()'
                    ).extract()
                    postquestion = postcontent[0]
                    postanswer = postcontent[2].strip() + postcontent[3].strip()
                    item['content']['content'] = postquestion
                    item['content']['answer'] = postanswer
                    try:
                        postanswer_time = hxs.xpath(
                            '//div[@class="sign"]/text()').extract()
                        postanswer_time = re.search(
                            r'\D+(\d{4}-\d{2}-.+:\d{2})',
                            postanswer_time[1].strip()).group(1)
                        answer_time = datetime.strptime(
                            postanswer_time, "%Y-%m-%d %H:%M:%S")
                        item['content']['answer_time'] = answer_time
                    except Exception:
                        item['content']['answer_time'] = None
                    item['content']['title'] = "Q&A"
                except Exception:
                    print("Decode webpage content failed: " + response.url)
                    return
            replynum = response.meta['replynum']
            item['content']['replynum'] = replynum
            item['content']['reply'] = []
            # Replies are paginated 30 per page; round up to the page count.
            if int(replynum) % 30 == 0:
                rptotal = int(replynum) // 30
            else:
                rptotal = int(replynum) // 30 + 1
            if rptotal > 0:
                head = re.search(r'(.+)\.html', response.url).group(1)
                reply_url = head + "_1.html"
                yield Request(url=reply_url,
                              meta={'item': item, 'page': 1,
                                    'rptotal': rptotal, 'head': head},
                              callback=self.parse_reply)
            else:
                yield item
    except Exception as ex:
        self.logger.warning('Parse Exception: %s %s' % (str(ex), response.url))
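
# Standalone sketch (not part of the spider): the reply pagination above is a
# ceiling division at 30 replies per page; math.ceil gives the same counts.
import math

for replynum in (0, 29, 30, 31, 60):
    rptotal = replynum // 30 if replynum % 30 == 0 else replynum // 30 + 1
    assert rptotal == math.ceil(replynum / 30)
    print(replynum, rptotal)  # 0->0, 29->1, 30->1, 31->2, 60->2
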
def parse(self, response):
    guba_type = response.meta['type']
    hxs = Selector(response)
    # Individual-stock forums (this listing also carries fund forums)
    if guba_type == 1:
        stocks = hxs.xpath(
            '//div[@class="ngbglistdiv"]/ul[@class="ngblistul2"]/li/a'
        ).extract()
        fund_orgs = hxs.xpath(
            '//div[@class="ngbglistdiv"]/ul[@class="ngblistul2"]/div[@class="ngbglistjjt"]/a'
        ).extract()
        funds = hxs.xpath(
            '//div[@class="ngbglistdiv"]/ul[@class="ngblistul2"]/ul[@class="ngblistul3"]/li/a'
        ).extract()
        # Scrape each stock forum's URL and name (relative links)
        for stock in stocks:
            m_stocks = re.search(r'href="(.+)">(.+)</a', stock)
            if m_stocks:
                item = GubaItem()
                item['content'] = {}
                url_stock = "http://guba.eastmoney.com/" + m_stocks.group(1)
                item['content']['guba_url'] = url_stock
                item['content']['guba_name'] = m_stocks.group(2)
                yield Request(url=url_stock,
                              meta={'item': item},
                              callback=self.parse_page_num)
        # Scrape each parent fund forum's URL and name (absolute links)
        for fund_org in fund_orgs:
            m_fund_orgs = re.search(r'href="(.+)">(.+)</a', fund_org)
            if m_fund_orgs:
                item = GubaItem()
                item['content'] = {}
                url_fund_org = m_fund_orgs.group(1)
                item['content']['guba_url'] = url_fund_org
                item['content']['guba_name'] = m_fund_orgs.group(2)
                yield Request(url=url_fund_org,
                              meta={'item': item},
                              callback=self.parse_page_num)
        # Scrape each child fund forum's URL and name (absolute links)
        for fund in funds:
            m_funds = re.search(r'href="(.+)">(.+)</a', fund)
            if m_funds:
                item = GubaItem()
                item['content'] = {}
                url_fund = m_funds.group(1)
                item['content']['guba_url'] = url_fund
                item['content']['guba_name'] = m_funds.group(2)
                yield Request(url=url_fund,
                              meta={'item': item},
                              callback=self.parse_page_num)
    # Topic forums (2), industry forums (3) and concept forums (4) share the
    # same anchor structure; only the list container differs.
    elif guba_type in (2, 3, 4):
        xpaths = {
            2: '//div[@class="allzhutilistb"]/ul/li/a',
            3: '//ul[@class="ngblistitemul"]/li/a',
            4: '//ul[@class="ngblistitemul"]/li/a',
        }
        stocks = hxs.xpath(xpaths[guba_type]).extract()
        for stock in stocks:
            m_stocks = re.search(r'href="(.+)">(.+)</a', stock)
            if m_stocks:  # guard added: skip anchors the regex cannot parse
                item = GubaItem()
                item['content'] = {}
                url_stock = "http://guba.eastmoney.com/" + m_stocks.group(1)
                item['content']['guba_url'] = url_stock
                item['content']['guba_name'] = m_stocks.group(2)
                yield Request(url=url_stock,
                              meta={'item': item},
                              callback=self.parse_page_num)
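
# Hedged sketch (not in the original): every branch above hands off to
# self.parse_page_num, which is not shown in this excerpt. A minimal version
# might look like the following, assuming the list page carries a pager span
# whose data-pager attribute has the form "list,<id>_|<total>|<per_page>|...".
# The selector, attribute name, and format are assumptions, not confirmed by
# the source.
import math

def parse_page_num(self, response):
    item = response.meta['item']
    hxs = Selector(response)
    pager = hxs.xpath('//span[@class="pagernums"]/@data-pager').extract()
    if pager:
        head, total, per_page = pager[0].split('|')[:3]
        pages = math.ceil(int(total) / int(per_page))
        for page in range(1, pages + 1):
            url = "http://guba.eastmoney.com/" + head + str(page) + ".html"
            yield Request(url=url, meta={'item': item}, callback=self.parse)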