def parse_news(self, response):
    classify, e_name = response.meta.get('info')
    articles = response.xpath(
        "//div[@class='largeTitle']//article[contains(@class,'js-article-item')]"
    )
    for article in articles:
        link_url = article.xpath(".//div[@class='textDiv']/a/@href").get()
        link_url = response.urljoin(link_url)
        # Skip articles already seen; otherwise mark them in the Redis set.
        if xredis.sismember('flashnews:' + e_name, get_md5(link_url)):
            continue
        xredis.sadd('flashnews:' + e_name, get_md5(link_url))
        title = article.xpath(".//div[@class='textDiv']/a/text()").get()
        source = article.xpath(
            ".//div[@class='textDiv']/span/span[1]/text()").get()
        # Drop the '提供者' ("provided by") label from the source field.
        source = source.replace('提供者', '').strip()
        description = article.xpath(
            ".//div[@class='textDiv']/p/text()").getall()
        description = list_to_str(description)
        yield scrapy.Request(url=link_url,
                             callback=self.parse_time,
                             dont_filter=True,
                             meta={
                                 "info": (title, source, description,
                                          link_url, classify)
                             })
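# Helper sketch (not part of the original file): get_md5 and list_to_str are
# imported from the project's shared utils, and xredis is assumed to be a
# module-level Redis client (e.g. redis.StrictRedis). Based only on how they
# are called above, a minimal, hedged reconstruction might look like this;
# the real implementations may differ.
import hashlib

def get_md5(value):
    # Hex MD5 digest of a string; used as a compact member for Redis dedup sets.
    return hashlib.md5(value.encode('utf-8')).hexdigest()

def list_to_str(parts):
    # Join the text fragments that .getall() returns into one cleaned string.
    return ''.join(p.strip() for p in parts if p and p.strip())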
def parse_list(self, response):
    classify = response.meta.get("info")
    articles = response.xpath("//div[@id='newslist']/a")
    for article in articles:
        article_url = response.urljoin(article.xpath("./@href").get())
        if xredis.sismember('flashnews:yicai_news', get_md5(article_url)):
            continue
        xredis.sadd('flashnews:yicai_news', get_md5(article_url))
        yield scrapy.Request(url=article_url,
                             callback=self.parse_detail,
                             dont_filter=True,
                             meta={"info": classify})
def parse(self, response):
    lis = response.xpath("//ul[@id='dfx-scrolled-block']//li")
    for li in lis:
        link_url = li.xpath(".//div[2]/h3/a/@href").get().strip()
        if xredis.sismember('flashnews:forex_news', get_md5(link_url)):
            continue
        xredis.sadd('flashnews:forex_news', get_md5(link_url))
        title = li.xpath(".//div[2]/h3/a/text()").get().strip()
        pub_time = li.xpath(".//div[1]/h3/text()").get()
        pub_time = parse_pub_time(pub_time)
        description = li.xpath(".//div[2]/div/text()").get().strip()
        yield scrapy.Request(
            url=link_url,
            callback=self.parse_content,
            dont_filter=True,
            meta={"info": (title, link_url, pub_time, description)})
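# Helper sketch (assumption): parse_pub_time is imported from shared utils.
# From its use above it appears to normalize the raw timestamp shown on the
# list page into a full datetime string; the source page's exact time format
# is not visible here, so this version simply assumes an 'HH:MM' fragment.
from datetime import datetime

def parse_pub_time(raw):
    # Prefix today's date so downstream consumers get a full timestamp.
    raw = (raw or '').strip()
    if not raw:
        return ''
    today = datetime.now().strftime('%Y-%m-%d')
    return '{} {}:00'.format(today, raw)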
def parse_next(self, response):
    stock_code, stock_market, stock_name = response.meta.get('info')
    time_list = timelist_conversion(
        response.xpath("//div[@class='datelist']/ul/text()").getall())
    articles = response.xpath("//div[@class='datelist']/ul//a")
    for index, article in enumerate(articles):
        title = article.xpath("./text()").get()
        link_url = response.urljoin(article.xpath("./@href").get())
        pub_time = time_list[index]
        link_url_md5 = get_md5(stock_code + link_url)
        # if filter_url('china_notice', link_url_md5):
        #     continue
        yield scrapy.Request(url=link_url,
                             callback=self.parse_vip_stock,
                             # dont_filter=True,
                             meta={
                                 "info": (stock_code, stock_market,
                                          stock_name, title, link_url,
                                          pub_time, link_url_md5)
                             })
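# Helper sketch (assumption): timelist_conversion is defined elsewhere in the
# repo. The datelist <ul> interleaves bare text nodes (dates/times) with <a>
# tags, so the minimal reading is that it strips whitespace and drops blank
# nodes, leaving one timestamp per article to pair with the links by index.
def timelist_conversion(raw_nodes):
    # Keep only non-empty, stripped text nodes from the datelist.
    return [t.strip() for t in raw_nodes if t and t.strip()]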
def parse_next(self, response):
    stock_code, stock_market, stock_name = response.meta.get('info')
    lis = response.xpath("//ul[@id='js_ggzx']/li")
    for li in lis:
        title = li.xpath("./a/text()").get()
        if title is None:
            continue
        link_url = li.xpath("./a/@href").get()
        if filter_urlkey(link_url):  # skip unsupported link types
            continue
        pub_time = li.xpath("./span/text()").get()
        link_url_md5 = get_md5(stock_code + title)
        if filter_url('hk_news', link_url_md5):  # deduplicate
            continue
        # Route each link to the detail parser that matches its host.
        if 'vip.stock.finance.sina.com.cn' in link_url:
            yield scrapy.Request(url=link_url,
                                 callback=self.parse_vip_stock,
                                 dont_filter=True,
                                 meta={
                                     "info": (stock_code, stock_market,
                                              stock_name, title, link_url,
                                              pub_time, link_url_md5)
                                 })
            continue
        if 'stock.finance.sina.com.cn' in link_url:
            yield scrapy.Request(url=link_url,
                                 callback=self.parse_stock_finance,
                                 dont_filter=True,
                                 meta={
                                     "info": (stock_code, stock_market,
                                              stock_name, title, link_url,
                                              pub_time, link_url_md5)
                                 })
            continue
        if ('finance.sina.com.cn' in link_url or 'cj.sina.com.cn' in link_url
                or 'tech.sina.com.cn' in link_url):
            yield scrapy.Request(url=link_url,
                                 callback=self.parse_finance_cj,
                                 dont_filter=True,
                                 meta={
                                     "info": (stock_code, stock_market,
                                              stock_name, title, link_url,
                                              pub_time, link_url_md5)
                                 })
            continue
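# Helper sketch (assumption): filter_url and filter_urlkey come from the
# shared utils module. Hedged reconstructions based on their call sites:
# filter_url looks like the same Redis mark-and-skip pattern used inline in
# the other spiders (the 'flashnews:' key prefix is a guess), and
# filter_urlkey rejects hrefs the detail parsers cannot follow.
def filter_url(spider_key, url_md5):
    # True if already crawled; otherwise record it and let the caller proceed.
    if xredis.sismember('flashnews:' + spider_key, url_md5):
        return True
    xredis.sadd('flashnews:' + spider_key, url_md5)
    return False

def filter_urlkey(link_url):
    # True for links to skip, e.g. missing hrefs or javascript: pseudo-URLs.
    return not link_url or link_url.startswith('javascript')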
def parse_next(self, response):
    stock_code, stock_market, stock_name = response.meta.get('info')
    time_list = timelist_conversion(
        response.xpath("//div[@class='datelist']/ul/text()").getall())
    articles = response.xpath("//div[@class='datelist']/ul//a")
    for index, article in enumerate(articles):
        title = article.xpath("./text()").get()
        link_url = article.xpath("./@href").get()
        if filter_urlkey(link_url):  # drop special/unsupported URLs
            continue
        pub_time = time_list[index]
        link_url_md5 = get_md5(stock_code + title)
        if filter_url('china_news', link_url_md5):  # deduplicate
            continue
        # Route each link to the detail parser that matches its host.
        if 'vip.stock.finance.sina.com.cn' in link_url:
            yield scrapy.Request(url=link_url,
                                 callback=self.parse_vip_stock,
                                 dont_filter=True,
                                 meta={
                                     "info": (stock_code, stock_market,
                                              stock_name, title, link_url,
                                              pub_time, link_url_md5)
                                 })
            continue
        if 'stock.finance.sina.com.cn' in link_url:
            yield scrapy.Request(url=link_url,
                                 callback=self.parse_stock_finance,
                                 dont_filter=True,
                                 meta={
                                     "info": (stock_code, stock_market,
                                              stock_name, title, link_url,
                                              pub_time, link_url_md5)
                                 })
            continue
        if ('finance.sina.com.cn' in link_url or 'cj.sina.com.cn' in link_url
                or 'tech.sina.com.cn' in link_url):
            yield scrapy.Request(url=link_url,
                                 callback=self.parse_finance_cj,
                                 dont_filter=True,
                                 meta={
                                     "info": (stock_code, stock_market,
                                              stock_name, title, link_url,
                                              pub_time, link_url_md5)
                                 })
            continue
def parse_next(self, response):
    stock_code, stock_market, stock_name = response.meta.get('info')
    lis = response.xpath("//ul[@class='list01']/li")
    for li in lis:
        title = li.xpath("./a/text()").get()
        if title is None:
            continue
        pub_time = li.xpath("./span/text()").get()
        link_url = li.xpath("./a/@href").get()
        link_url_md5 = get_md5(stock_code + link_url)
        # if filter_url('hk_notice', link_url_md5):
        #     continue
        yield scrapy.Request(
            url=link_url,
            callback=self.parse_stock_finance,
            # dont_filter=True,
            meta={
                "info": (stock_code, stock_market, stock_name, title,
                         link_url, pub_time, link_url_md5)
            })