def parse_items(self, response):
    """Build one ``HyxhItem`` from an article page (spider ``zhejaing_Aship2``).

    The title is read from ``#title`` and the raw publish time from
    ``#info``; the plain text comes from ``GeneralNewsExtractor`` and the
    HTML body from the first body selector that matches anything.

    Args:
        response: The downloaded article page.

    Yields:
        The populated ``HyxhItem``.
    """
    page_url = response.url
    extracted = GeneralNewsExtractor().extract(
        response.text, with_body_html=False)
    headline = response.css('#title::text').extract_first()
    plain_text = extracted['content']
    published_at = get_times(response.css('#info::text').extract_first())
    item = HyxhItem()

    # Keep the HTML of the first selector that matches; '' when none do.
    body_selectors = ['#maininfo']
    candidates = (
        ''.join(response.css(sel).extract()) for sel in body_selectors
    )
    body_html = next((html for html in candidates if html), '')
    if not body_html:
        logging.warning(f'{response.url}当前url无 css 适配未提取 centent')

    attachments, attachment_names = get_attachments(response)
    # Assigned key by key, in the same order as before, so the item's own
    # __setitem__ handling still applies to every field.
    fields = {
        'title': headline,
        'appendix': attachments,
        'source': '浙江省船舶行业协会',
        'website': '浙江省船舶行业协会',
        'link': page_url,
        'appendix_name': attachment_names,
        'type': 2,
        'tags': '',
        'time': published_at,
        'content': body_html,
        'txt': plain_text,
        'spider_name': 'zhejaing_Aship2',
        'module_name': '行业协会',
    }
    for key, value in fields.items():
        item[key] = value
    yield item
def parse_items(self, response):
    """Build one ``HyxhItem`` from an article page (spider ``chinaFeature1``).

    The title is the text of the first ``h1``; plain text and the raw
    publish time both come from ``GeneralNewsExtractor``. The HTML body is
    taken from the first body selector that matches anything.

    Args:
        response: The downloaded article page.

    Yields:
        The populated ``HyxhItem``.
    """
    extracted = GeneralNewsExtractor().extract(
        response.text, with_body_html=False)
    headline = response.css('h1::text').extract_first()
    plain_text = extracted['content']
    published_at = get_times(extracted['publish_time'])
    item = HyxhItem()
    page_url = response.url

    # Keep the HTML of the first selector that matches; '' when none do.
    body_selectors = ['.wof']
    candidates = (
        ''.join(response.css(sel).extract()) for sel in body_selectors
    )
    body_html = next((html for html in candidates if html), '')
    if not body_html:
        logging.warning(f'{response.url}当前url无 css 适配未提取 centent')

    attachments, attachment_names = get_attachments(response)
    # Assigned key by key, in the same order as before, so the item's own
    # __setitem__ handling still applies to every field.
    fields = {
        'title': headline,
        'appendix': attachments,
        'source': '中国机械工业联合会',
        'website': '中国机械工业联合会',
        'link': page_url,
        'appendix_name': attachment_names,
        'type': 1,
        'tags': '',
        'time': published_at,
        'content': body_html,
        'txt': plain_text,
        'spider_name': 'chinaFeature1',
        'module_name': '行业协会',
    }
    for key, value in fields.items():
        item[key] = value
    yield item
def parse_items(self, response):
    """Build one ``HyxhItem`` from an article page (spider ``zhejiang_clxh``).

    Only the markup from the ``<div class="wrapper">`` marker onwards is
    parsed. Title, publish time, HTML body, plain text and attachments are
    all read from that trimmed fragment.

    Unlike the previous version, a page without the wrapper marker or with
    fewer than two ``.titles .fl`` text nodes no longer aborts the callback
    with ``ValueError`` / ``IndexError``: a warning is logged and parsing
    continues with a fallback.

    Args:
        response: The downloaded article page.

    Yields:
        The populated ``HyxhItem``.
    """
    page_url = response.url
    raw_html = response.text

    # str.find returns -1 instead of raising when the marker is missing, so
    # an unexpected layout falls back to parsing the whole document.
    wrapper_marker = '<div class="wrapper">'
    start = raw_html.find(wrapper_marker)
    if start < 0:
        logging.warning(
            '%s: marker %r not found, parsing the whole page',
            page_url, wrapper_marker)
        start = 0
    selector = Selector(text=raw_html[start:])

    title = selector.css('.titles h1::text').extract_first()

    # The publish time is the second text node under ``.titles .fl``.
    publish_times = selector.css('.titles .fl::text').extract()
    if len(publish_times) > 1:
        publish_time = publish_times[1]
    else:
        # NOTE(review): None is passed on to get_times here; confirm that
        # get_times accepts None.
        publish_time = None
        logging.warning(
            '%s: publish time not found in ".titles .fl"', page_url)
    time = get_times(publish_time)

    item = HyxhItem()

    # Keep the HTML of the first selector that matches; '' when none do.
    content = ''
    for content_selector in ['.inf_arct']:
        content = ''.join(selector.css(content_selector).extract())
        if content:
            break
    if not content:
        logging.warning(f'{response.url}当前url无 css 适配未提取 centent')

    txt = ''.join(selector.xpath('//div[@id="article"]//text()').extract())

    item['title'] = title
    appendix, appendix_name = get_attachments(selector)
    item['appendix'] = appendix
    item['source'] = '浙江省新材料产业协会'
    item['website'] = '浙江省新材料产业协会'
    item['link'] = page_url
    item['appendix_name'] = appendix_name
    item['type'] = 3
    item['tags'] = ''
    item['time'] = time
    item['content'] = content
    item['txt'] = txt
    item['spider_name'] = 'zhejiang_clxh'
    item['module_name'] = '行业协会'
    yield item
def parse_items(self, response):
    """Build one ``HyxhItem`` from an article page (spider ``china_ocean``).

    The title is read from ``.nrBt01`` and the raw publish time from a table
    cell under ``#ctl00_main_panel3``; the plain text comes from
    ``GeneralNewsExtractor`` and the HTML body from the first body selector
    that matches anything.

    Args:
        response: The downloaded article page.

    Yields:
        The populated ``HyxhItem``.
    """
    page_url = response.url
    extracted = GeneralNewsExtractor().extract(
        response.text, with_body_html=False)
    headline = response.css('.nrBt01::text').extract_first()
    plain_text = extracted['content']
    raw_time = response.xpath(
        '//*[@id="ctl00_main_panel3"]/table[2]/tr/td[1]/text()'
    ).extract_first()
    published_at = get_times(raw_time)
    item = HyxhItem()

    # Keep the HTML of the first selector that matches; '' when none do.
    body_selectors = ['.nrTxt02']
    candidates = (
        ''.join(response.css(sel).extract()) for sel in body_selectors
    )
    body_html = next((html for html in candidates if html), '')
    if not body_html:
        logging.warning(f'{response.url}当前url无 css 适配未提取 centent')

    attachments, attachment_names = get_attachments(response)
    # Assigned key by key, in the same order as before, so the item's own
    # __setitem__ handling still applies to every field.
    fields = {
        'title': headline,
        'appendix': attachments,
        'source': '中国海洋工程咨询协会',
        'website': '中国海洋工程咨询协会',
        'link': page_url,
        'appendix_name': attachment_names,
        'type': 1,
        'tags': '',
        'time': published_at,
        'content': body_html,
        'txt': plain_text,
        'spider_name': 'china_ocean',
        'module_name': '行业协会',
    }
    for key, value in fields.items():
        item[key] = value
    yield item
def parse_items(self, response):
    """Build one ``HyxhItem`` from an article page (spider ``jiangsu``).

    Responses whose URL contains the paginated index pattern are skipped and
    yield nothing. For every other page the title is the first ``h2`` text,
    plain text and raw publish time come from ``GeneralNewsExtractor``, and
    the HTML body is taken from the first XPath that matches anything.

    Args:
        response: The downloaded page.

    Yields:
        The populated ``HyxhItem`` (nothing for index pages).
    """
    page_url = response.url
    index_pattern = (
        'http://jamia.org.cn/index.php?g=&m=contents&a=index&term_id=31&page='
    )
    if index_pattern in page_url:
        # Index pages carry no article, so no item is produced for them.
        return

    extracted = GeneralNewsExtractor().extract(
        response.text, with_body_html=False)
    headline = response.css('h2::text').extract_first()
    plain_text = extracted['content']
    published_at = get_times(extracted['publish_time'])
    item = HyxhItem()

    # These are XPath expressions (not CSS); keep the HTML of the first one
    # that matches, '' when none do.
    body_xpaths = [
        '/html/body/div[1]/div[4]/table/tbody/tr/td[2]/table/tbody/tr[2]/td/table/tbody/tr[4]'
    ]
    candidates = (
        ''.join(response.xpath(path).extract()) for path in body_xpaths
    )
    body_html = next((html for html in candidates if html), '')
    if not body_html:
        logging.warning(f'{response.url}当前url无 css 适配未提取 centent')

    attachments, attachment_names = get_attachments(response)
    # Assigned key by key, in the same order as before, so the item's own
    # __setitem__ handling still applies to every field.
    fields = {
        'title': headline,
        'appendix': attachments,
        'source': '江苏省新材料产业协会',
        'website': '江苏省新材料产业协会',
        'link': page_url,
        'appendix_name': attachment_names,
        'type': 1,
        'tags': '',
        'time': published_at,
        'content': body_html,
        'txt': plain_text,
        'spider_name': 'jiangsu',
        'module_name': '行业协会',
    }
    for key, value in fields.items():
        item[key] = value
    yield item
def parse_items(self, response):
    """Build one ``HyxhItem`` from an article page (spider ``shanghai_sbia``).

    Title, publish time and HTML body are located with absolute XPaths; the
    publish time is the concatenation of every text node found at its path.
    The plain text comes from ``GeneralNewsExtractor``.

    Args:
        response: The downloaded article page.

    Yields:
        The populated ``HyxhItem``.
    """
    extracted = GeneralNewsExtractor().extract(
        response.text, with_body_html=False)
    headline = response.xpath(
        '/html/body/div[4]/div/div/div[2]/div[3]/div[1]/span/text()'
    ).extract_first()
    plain_text = extracted['content']
    time_nodes = response.xpath(
        '/html/body/div[4]/div/div/div[2]/div[3]/div[2]/text()'
    ).extract()
    published_at = get_times(''.join(time_nodes))
    item = HyxhItem()
    page_url = response.url

    # These are XPath expressions (not CSS); keep the HTML of the first one
    # that matches, '' when none do.
    body_xpaths = ['/html/body/div[4]/div/div/div[2]/div[3]/div[4]']
    candidates = (
        ''.join(response.xpath(path).extract()) for path in body_xpaths
    )
    body_html = next((html for html in candidates if html), '')
    if not body_html:
        logging.warning(f'{response.url}当前url无 css 适配未提取 centent')

    attachments, attachment_names = get_attachments(response)
    # Assigned key by key, in the same order as before, so the item's own
    # __setitem__ handling still applies to every field.
    fields = {
        'title': headline,
        'appendix': attachments,
        'source': '上海市生物医药协会',
        'website': '上海市生物医药协会',
        'link': page_url,
        'appendix_name': attachment_names,
        'type': 1,
        'tags': '',
        'time': published_at,
        'content': body_html,
        'txt': plain_text,
        'spider_name': 'shanghai_sbia',
        'module_name': '行业协会',
    }
    for key, value in fields.items():
        item[key] = value
    yield item
def parse_items(self, response):
    """Build one ``HyxhItem`` from an article page (spider ``shenzhen``).

    Title, plain text and raw publish time all come from
    ``GeneralNewsExtractor``. The HTML body is taken from the first of the
    candidate CSS selectors that matches anything.

    The URL being parsed is reported through ``logging.debug`` rather than
    a bare ``print``, so it follows the configured log level and handlers.

    Args:
        response: The downloaded article page.

    Yields:
        The populated ``HyxhItem``.
    """
    extractor = GeneralNewsExtractor()
    result = extractor.extract(response.text, with_body_html=False)
    title = result['title']
    txt = result['content']
    time = get_times(result['publish_time'])
    item = HyxhItem()
    lyurl = response.url
    logging.debug('%s', lyurl)

    # Selectors are tried in order; the first one yielding markup wins and
    # ``content`` stays '' when none of them match.
    content_css = [
        '.MsoNormal',
        '#rightcol p',
    ]
    content = ''
    for css_selector in content_css:
        content = ''.join(response.css(css_selector).extract())
        if content:
            break
    if not content:
        logging.warning(f'{response.url}当前url无 css 适配未提取 centent')

    item['title'] = title
    appendix, appendix_name = get_attachments(response)
    item['appendix'] = appendix
    item['source'] = '深圳市医疗器械行业协会'
    item['website'] = '深圳市医疗器械行业协会'
    item['link'] = lyurl
    item['appendix_name'] = appendix_name
    item['type'] = 1
    item['tags'] = ''
    item['time'] = time
    item['content'] = content
    item['txt'] = txt
    item['spider_name'] = 'shenzhen'
    item['module_name'] = '行业协会'
    yield item
def parse_items(self, response):
    """Build one ``HyxhItem`` from a JSON article response (spider ``chinaisa1``).

    The response body is JSON whose ``article_content`` value is an HTML
    fragment; title, publish time, HTML body, plain text and attachments are
    all read from that fragment. The item link is taken from
    ``response.meta['url']`` rather than from the response URL.

    Args:
        response: The downloaded JSON response.

    Yields:
        The populated ``HyxhItem``.
    """
    payload = json.loads(response.text)
    fragment = Selector(text=payload['article_content'])

    headline = ''.join(fragment.css('.article_title::text').extract())
    raw_time = fragment.css('.article_title p::text').extract_first()
    published_at = get_times(raw_time)
    item = HyxhItem()
    body_html = ''.join(fragment.css('.article_main').extract())
    plain_text = ''.join(fragment.css('.article_main *::text').extract())

    attachments, attachment_names = get_attachments(fragment)
    # Assigned key by key, in the same order as before, so the item's own
    # __setitem__ handling still applies to every field.
    fields = {
        'title': headline,
        'appendix': attachments,
        'source': '中国钢铁工业协会-新闻动态',
        'website': '中国钢铁工业协会',
        'link': response.meta['url'],
        'appendix_name': attachment_names,
        'type': 1,
        'tags': '',
        'time': published_at,
        'content': body_html,
        'txt': plain_text,
        'spider_name': 'chinaisa1',
        'module_name': '行业协会',
    }
    for key, value in fields.items():
        item[key] = value
    yield item
def parse_items(self, response):
    """Build one ``HyxhItem`` from an article page (spider ``shanghai_Aship``).

    The title is the first ``h2`` text and the raw publish time is the first
    ``span.riqi_1`` inside ``div.main2.ma.clear``. The plain text comes from
    ``GeneralNewsExtractor`` and the HTML body from the first body selector
    that matches anything.

    The extracted publish time is reported through ``logging.debug`` rather
    than a bare ``print``, so it follows the configured log level and
    handlers.

    Args:
        response: The downloaded article page.

    Yields:
        The populated ``HyxhItem``.
    """
    lyurl = response.url
    extractor = GeneralNewsExtractor()
    result = extractor.extract(response.text, with_body_html=False)
    title = response.css('h2::text').extract_first()
    txt = result['content']
    publish_time = response.xpath(
        '//div[@class="main2 ma clear"]/span[@class="riqi_1"][1]/text()'
    ).extract_first()
    logging.debug('publish_time:%s', publish_time)
    time = get_times(publish_time)
    item = HyxhItem()

    # Selectors are tried in order; the first one yielding markup wins and
    # ``content`` stays '' when none of them match.
    content = ''
    for css_selector in ['.para.ma']:
        content = ''.join(response.css(css_selector).extract())
        if content:
            break
    if not content:
        logging.warning(f'{response.url}当前url无 css 适配未提取 centent')

    item['title'] = title
    appendix, appendix_name = get_attachments(response)
    item['appendix'] = appendix
    item['source'] = '上海市船舶与海洋工程学会'
    item['website'] = '上海市船舶与海洋工程学会'
    item['link'] = lyurl
    item['appendix_name'] = appendix_name
    item['type'] = 1
    item['tags'] = ''
    item['time'] = time
    item['content'] = content
    item['txt'] = txt
    item['spider_name'] = 'shanghai_Aship'
    item['module_name'] = '行业协会'
    yield item