def __init__(self):
    self.mongo_col = get_col(MONGODB_COLLECTION)
    self.mysql_Client = MysqlClient(
        host=MYSQL_SERVER,
        port=MYSQL_PORT,
        database=MYSQL_DB,
        user=MYSQL_USER,
        password=MYSQL_PASSWORD
    )
def update_handle_task(self, task, status, retry=5):
    while retry > 0:
        try:
            self.mongo_col.update({'_id': task['_id']},
                                  {'$set': {'handled': status}})
        except Exception as err:
            logging.error(f'{str(err)}, {retry} times to retry')
            retry -= 1
            self.mongo_col = get_col(MONGODB_COLLECTION)
        else:
            break
        time.sleep(1)
    else:
        logging.error(f'update handle task failed, {task["_id"]}--{status}, max retries exceeded')
def get_handle_task(self, retry=5):
    task = None
    while retry > 0:
        try:
            task = self.mongo_col.find_one_and_update(
                {'$and': [{'crawled': 1}, {'handled': 0}]},
                {'$set': {'handled': 2}})
        except PyMongoError as err:
            logging.error(f'{str(err)}, {retry} times to retry')
            retry -= 1
            self.mongo_col = get_col(MONGODB_COLLECTION)
        else:
            break
        time.sleep(1)
    else:
        # `err` is not in scope once the loop is exhausted, so log a plain message
        logging.error('get handle task failed, max retries exceeded')
    return task
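
# A minimal usage sketch for the two helpers above. Hedged: the enclosing
# class header lies outside this excerpt, so `Handler` and `process` below
# are hypothetical names used only for illustration:
#
#   handler = Handler()
#   task = handler.get_handle_task()       # atomically claims a task (handled: 0 -> 2)
#   if task is not None:
#       try:
#           process(task)                  # hypothetical processing step
#       except Exception:
#           handler.update_handle_task(task, 0)   # release for a later retry
#       else:
#           handler.update_handle_task(task, 1)   # mark as handled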
class CdmbcSpider(scrapy.Spider):
    name = 'cdmbc'
    download_delay = 5
    max_page = 5
    mongo_col = get_col(MONGODB_COLLECTION)
    mongo_col.create_index([("unique_id", pymongo.DESCENDING),
                            ('origin', pymongo.DESCENDING)], unique=True)
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Pragma': 'no-cache',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
    }
    custom_settings = {
        'LOG_FILE': f'logs/{name}.log',
        'ITEM_PIPELINES': {
            'gov_info.pipelines.GovInfoPipeline': 100,
        },
    }

    def start_requests(self):
        headers = copy.deepcopy(self.headers)
        headers.update({
            'X-Requested-With': 'XMLHttpRequest',
            'Host': 'swgl.cdmbc.gov.cn'
        })
        url = 'http://swgl.cdmbc.gov.cn/egrantweb/notice/noticeList?flag=grid&noticeType=3'
        yield scrapy.FormRequest(url, method='GET', headers=headers)

    def parse(self, response):
        json_data = json.loads(json.dumps(xmltodict.parse(response.body)))
        page_count = json_data['rows'].get('total', None)
        if page_count is None:
            logging.error('get page_count failed')
            return
        form_data = {
            '_search': 'false',
            'nd': str(int(time.time() * 1000)),
            'rows': '10',
            'sidx': '',
            'sord': 'desc',
            'searchString': '',
        }
        headers = copy.deepcopy(self.headers)
        headers.update({
            'X-Requested-With': 'XMLHttpRequest',
            'Host': 'swgl.cdmbc.gov.cn'
        })
        page_count = min(int(page_count), self.max_page)
        url = 'http://swgl.cdmbc.gov.cn/egrantweb/notice/noticeList?flag=grid&noticeType=3'
        for page in range(1, page_count + 1):
            form_data.update({'page': str(page)})
            yield scrapy.FormRequest(url, method='POST', headers=headers,
                                     formdata=form_data, callback=self.parse_page)

    def parse_page(self, response):
        headers = copy.deepcopy(self.headers)
        headers.update({'Host': 'www.cdmbc.gov.cn'})
        json_data = json.loads(json.dumps(xmltodict.parse(response.body)))
        for row in json_data['rows']['row']:
            url = re.findall(r'href="(.*?)"|$', row['cell'][0])[0]
            if url == '' or 'cdmbc' not in url:
                logging.warning(f'{response.url}--{url}: get data failed')
                continue
            date = row['cell'][1]
            unique_id = get_md5(url)
            if self.mongo_col.find_one({'$and': [{'unique_id': unique_id},
                                                 {'origin': f'{self.name}'}]}):
                logging.warning(f'{url} is downloaded already, unique_id: {unique_id}')
                continue
            date = date.strip('[').strip(']')
            if len(date) == 10:
                now = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
                date += ' ' + now.split(' ')[-1]
            item = GovInfoItem()
            item['url'] = url
            item['unique_id'] = unique_id
            item['source'] = '成都市商务委'
            item['date'] = date
            item['origin'] = self.name
            item['type'] = 'web'
            item['location'] = '成都市'
            item['crawled'] = 1
            yield scrapy.FormRequest(url, method='GET', headers=headers,
                                     meta={'item': item}, callback=self.parse_item)

    def parse_item(self, response):
        item = response.meta['item']
        selector = etree.HTML(response.body)
        regex = r'//div[@id="detail"]'
        title = response.xpath(r'//div[@class="detailBox"]/h2/text()').extract_first(default='').strip()
        content = response.xpath(regex).xpath('string(.)').extract_first(default='').strip()
        if (title == '') and (content == ''):
            logging.warning(f'{item["url"]}: title and content are empty')
            return
        item['summary'] = content[:100] if (content != '') else title
        try:
            content = etree.tostring(selector.xpath(regex)[0], encoding='utf-8')
        except Exception as err:
            logging.error(f'{item["url"]}: get content failed')
            return
        item['content'] = content.decode('utf-8').replace('\xa0', '')
        item['title'] = title
        yield item
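
# The json.loads(json.dumps(xmltodict.parse(...))) round-trip used by
# CdmbcSpider converts the OrderedDict tree returned by xmltodict into plain
# dicts and lists. A standalone illustration (not part of the spider):
#
#   import json
#   import xmltodict
#
#   doc = xmltodict.parse('<rows><total>3</total></rows>')
#   plain = json.loads(json.dumps(doc))
#   assert plain['rows']['total'] == '3'   # xmltodict returns leaf values as strings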
class CsidcSpider(scrapy.Spider):
    name = 'csidc'
    download_delay = 5
    max_page = 5
    mongo_col = get_col(MONGODB_COLLECTION)
    mongo_col.create_index([("unique_id", pymongo.DESCENDING),
                            ('origin', pymongo.DESCENDING)], unique=True)
    base_url = 'http://125.70.9.164:1250/csidc/web/list.jsp?lm_id=notice'
    start_urls = [
        base_url,
    ]
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        # 'Host': '125.70.9.164:1250',
        'Pragma': 'no-cache',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
    }
    custom_settings = {
        'LOG_FILE': f'logs/{name}.log',
        'ITEM_PIPELINES': {
            'gov_info.pipelines.GovInfoPipeline': 100,
        },
    }

    def parse(self, response):
        page_count = re.findall(r'\d+/(\d+)\s*页|$', response.text)[0]
        if page_count in [b'', '']:
            raise Exception('get page count failed')
        page_count = min(int(page_count), self.max_page)
        for page in range(2, int(page_count) + 1):
            url = self.base_url
            form_data = {'pageid': str(page)}
            yield scrapy.FormRequest(url, method='POST', formdata=form_data,
                                     headers=self.headers, callback=self.parse_page)
        for request in self.parse_page(response):
            yield request

    def parse_page(self, response):
        base_url = 'http://125.70.9.164:1250/csidc/csidc2/site/noticeView.jsp?id='
        regex = r'//tr[@class="btd"]'
        for sel in response.xpath(regex):
            link = sel.xpath(r'td[1]/a/@href').re(
                r'javascript:display\(\'(.*?)\'\)')
            title = sel.xpath(r'td[1]/a/text()').extract_first(default='').strip()
            date = sel.xpath(r'td[3]/text()').extract_first(default='').strip()
            lst = [link, date]
            if not all(lst):
                logging.warning(f'{response.url}.{link}: get data failed')
                continue
            url = base_url + link[0]
            unique_id = get_md5(url)
            if self.mongo_col.find_one({'$and': [{'unique_id': unique_id},
                                                 {'origin': f'{self.name}'}]}):
                logging.warning(f'{url} is downloaded already, unique_id: {unique_id}')
                continue
            date = time.strftime("%Y-%m-%d %H:%M:%S",
                                 time.strptime(date, "%Y-%m-%d %H:%M"))
            item = GovInfoItem()
            item['url'] = url
            item['unique_id'] = unique_id
            item['date'] = date
            item['title'] = title
            item['origin'] = self.name
            item['source'] = '成都市经济和信息化委员会'
            item['type'] = 'web'
            item['location'] = '成都市'
            item['crawled'] = 1
            yield scrapy.FormRequest(url, method='GET', headers=self.headers,
                                     meta={'item': item}, callback=self.parse_item)

    def parse_item(self, response):
        selector = etree.HTML(response.body)
        item = response.meta['item']
        regex = r'/html/body/table[2]/tr[3]/td/table/tr[3]/td'
        content = response.xpath(regex).xpath('string(.)').extract_first(default='').strip()
        # if content == '':
        #     logging.warning(f'{item["url"]}: content is none')
        #     return
        item['summary'] = content[:100] if (content != '') else item['title']
        try:
            content = etree.tostring(selector.xpath(regex)[0], encoding='utf-8')
        except Exception as err:
            logging.error(f'{item["url"]}: get content failed')
            return
        item['content'] = content.decode('utf-8').replace('\xa0', '')
        yield item
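
# get_md5, used by every spider here to derive unique_id, is presumably a thin
# hashlib wrapper; a minimal sketch, assuming it hex-digests the UTF-8 encoding
# of its argument:
#
#   import hashlib
#
#   def get_md5(text):
#       return hashlib.md5(text.encode('utf-8')).hexdigest()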
class CdsmeSpider(scrapy.Spider):
    name = 'cdsme'
    download_delay = 5
    max_page = 5
    mongo_col = get_col(MONGODB_COLLECTION)
    mongo_col.create_index([("unique_id", pymongo.DESCENDING),
                            ('origin', pymongo.DESCENDING)], unique=True)
    start_urls = [
        'http://www.cdsme.com/list.aspx?id=79&Model_id=9&page={}',
        'http://www.cdsme.com/search.aspx?m=zhengcexinxi&page={}',
    ]
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,und;q=0.7',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Host': 'www.cdsme.com',
        'Pragma': 'no-cache',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
    }
    custom_settings = {
        'LOG_FILE': f'logs/{name}.log',
        'ITEM_PIPELINES': {
            'gov_info.pipelines.GovInfoPipeline': 100,
        },
    }

    def start_requests(self):
        for url in self.start_urls:
            for page in range(1, self.max_page + 1):
                yield scrapy.FormRequest(url.format(page), method='GET', headers=self.headers)

    def parse(self, response):
        base_url = 'http://www.cdsme.com'
        regex = '//div[@class="clearFix MN_A1_box"]'
        for sel in response.xpath(regex):
            link = sel.xpath(r'div[1]/a/@href').extract_first(default='').strip()
            summary = sel.xpath(r'div[2]/div/p/text()').extract_first(default='').strip()
            date = sel.xpath(r'div[1]/p/text()').extract_first(default='').strip()
            lst = [link, date]
            if not all(lst):
                logging.warning(f'{response.url}--{link}: get data failed')
                continue
            url = base_url + link
            unique_id = get_md5(url)
            if self.mongo_col.find_one({'$and': [{'unique_id': unique_id},
                                                 {'origin': f'{self.name}'}]}):
                logging.warning(f'{url} is downloaded already, unique_id: {unique_id}')
                continue
            date = date.strip('[').strip(']')
            if len(date) == 10:
                now = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
                date += ' ' + now.split(' ')[-1]
            item = GovInfoItem()
            item['url'] = url
            item['unique_id'] = unique_id
            item['summary'] = summary
            item['source'] = '成都市中小企业网'
            item['date'] = date
            item['origin'] = self.name
            item['type'] = 'web'
            item['location'] = '成都市'
            item['crawled'] = 1
            yield scrapy.FormRequest(url, method='GET', headers=self.headers,
                                     meta={'item': item}, callback=self.parse_item)

    def parse_item(self, response):
        selector = etree.HTML(response.body)
        item = response.meta['item']
        regex = r'//div[@id="neirong"]'
        title = response.xpath(r'//div[@class="JR_A1_box"]/p/text()').extract_first(default='').strip()
        content = response.xpath(regex).xpath('string(.)').extract_first(default='').strip()
        if title == '' and content == '':
            logging.warning(f'{item["url"]}: title and content are empty')
            return
        if item['summary'] == '':
            # fall back to the local `title`; item['title'] is not set until below
            item['summary'] = content[:100] if (content != '') else title
        try:
            content = etree.tostring(selector.xpath(regex)[0], encoding='utf-8')
        except Exception as err:
            logging.error(f'{item["url"]}: get content failed')
            return
        item['content'] = content.decode('utf-8').replace('\xa0', '')
        item['title'] = title
        yield item
class CdstSpider(scrapy.Spider):
    name = 'cdst'
    download_delay = 5
    max_page = 5
    mongo_col = get_col(MONGODB_COLLECTION)
    mongo_col.create_index([("unique_id", pymongo.DESCENDING),
                            ('origin', pymongo.DESCENDING)], unique=True)
    type_lst = [
        ('22', '41'),
        ('22', '190'),
    ]
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Host': 'www.cdst.gov.cn',
        'Pragma': 'no-cache',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
    }
    custom_settings = {
        'LOG_FILE': f'logs/{name}.log',
        'ITEM_PIPELINES': {
            'gov_info.pipelines.GovInfoPipeline': 100,
        },
    }

    def start_requests(self):
        for type_ in self.type_lst:
            for request in self.create_request(type_):
                yield request

    def parse(self, response):
        base_url = 'http://www.cdst.gov.cn/Readnews.asp?NewsID={}'
        for sel in response.xpath(r'//div[@class="listline"]/li'):
            title = sel.xpath(r'a/@title').extract_first(default='').strip()
            unique_id = sel.xpath('a/@href').extract_first(default='').strip()
            date = sel.xpath(r'span/text()').extract_first(default='').strip()
            lst = [title, unique_id, date]
            if not all(lst):
                logging.warning(f'{response.url}: get data failed')
                continue
            url = base_url.format(unique_id.split('=')[-1])
            unique_id = get_md5(url)
            if self.mongo_col.find_one({'$and': [{'unique_id': unique_id},
                                                 {'origin': f'{self.name}'}]}):
                logging.warning(f'{url} is downloaded already, unique_id: {unique_id}')
                continue
            date = date.strip('[').strip(']')
            if len(date) == 10:
                now = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
                date += ' ' + now.split(' ')[-1]
            item = GovInfoItem()
            item['url'] = url
            item['unique_id'] = unique_id
            item['title'] = title
            item['source'] = '成都市科学技术局'
            item['date'] = date
            item['origin'] = self.name
            item['type'] = 'web'
            item['location'] = '成都市'
            item['crawled'] = 1
            yield scrapy.FormRequest(url, method='GET', headers=self.headers,
                                     meta={'item': item}, callback=self.parse_item)

    def parse_item(self, response):
        item = response.meta['item']
        selector = etree.HTML(response.body)
        regex = r'//div[@class="news_content"]'
        content = response.xpath(regex).xpath('string(.)').extract_first(default='').strip()
        # if content == '':
        #     logging.warning(f'{item["url"]}: content is none')
        #     return
        item['summary'] = content[:100] if (content != '') else item['title']
        try:
            content = etree.tostring(selector.xpath(regex)[0], encoding='utf-8')
        except Exception as err:
            logging.error(f'{item["url"]}: get content failed')
            return
        item['content'] = content.decode('utf-8').replace('\xa0', '')
        yield item

    def create_request(self, type_):
        params = {
            'TypeID': type_[0],
            'BigClassID': type_[1],
        }
        url = 'http://www.cdst.gov.cn/Type.asp'
        for page in range(1, self.max_page + 1):
            params.update({'page': str(page)})
            yield scrapy.FormRequest(url, method='GET', formdata=params, headers=self.headers)
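
# Note: scrapy.FormRequest with method='GET' urlencodes `formdata` into the
# URL's query string, so CdstSpider.create_request effectively fetches URLs of
# the form http://www.cdst.gov.cn/Type.asp?TypeID=22&BigClassID=41&page=1.
# Several of the other spiders rely on the same idiom.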
class CdhtSpider(scrapy.Spider):
    name = 'cdht'
    download_delay = 5
    max_page = 5
    mongo_col = get_col(MONGODB_COLLECTION)
    mongo_col.create_index([("unique_id", pymongo.DESCENDING),
                            ('origin', pymongo.DESCENDING)], unique=True)
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Host': 'www.cdht.gov.cn',
        'Pragma': 'no-cache',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36',
    }
    custom_settings = {
        'LOG_FILE': f'logs/{name}.log',
        'ITEM_PIPELINES': {
            'gov_info.pipelines.GovInfoPipeline': 100,
        },
    }

    def start_requests(self):
        url = 'http://www.cdht.gov.cn/zwgktzgg/index.jhtml'
        yield scrapy.FormRequest(url, method='GET', headers=self.headers)

    def parse(self, response):
        page_count = re.findall(r'共\d+条记录\s*\d+/(\d+)\s*页|$'.encode('utf-8'),
                                response.body)[0]
        if page_count == b'':
            raise Exception('get page count failed')
        base_url = 'http://www.cdht.gov.cn/zwgktzgg/index_{}.jhtml'
        page_count = min(int(page_count), self.max_page)
        for i in range(1, int(page_count) + 1):
            url = base_url.format(i)
            yield scrapy.FormRequest(url, method='GET', headers=self.headers,
                                     callback=self.parse_page)

    def parse_page(self, response):
        regex = '//div[@class="news-list-list"]/table[@class="table"]/tbody/tr'
        for sel in response.xpath(regex):
            link = sel.xpath(r'td[1]/a/@href').extract_first(default='').strip()
            title = sel.xpath(r'td[1]/a/text()').extract_first(default='').strip()
            source = sel.xpath(r'td[2]/text()').extract_first(default='').strip()
            date = sel.xpath(r'td[3]/span/text()').extract_first(default='').strip()
            lst = [link, title, source, date]
            if not all(lst):
                logging.warning(f'{response.url}: get data failed')
                continue
            url = link
            unique_id = get_md5(url)
            if self.mongo_col.find_one({'$and': [{'unique_id': unique_id},
                                                 {'origin': f'{self.name}'}]}):
                logging.warning(f'{url} is downloaded already, unique_id: {unique_id}')
                continue
            date = date.strip('[').strip(']')
            if len(date) == 10:
                now = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
                date += ' ' + now.split(' ')[-1]
            item = GovInfoItem()
            item['url'] = url
            item['unique_id'] = unique_id
            item['source'] = source
            item['date'] = date
            item['origin'] = self.name
            item['type'] = 'web'
            item['location'] = '高新区'
            item['crawled'] = 1
            yield scrapy.FormRequest(url, method='GET', headers=self.headers,
                                     meta={'item': item}, callback=self.parse_item)

    def parse_item(self, response):
        selector = etree.HTML(response.body)
        item = response.meta['item']
        regex = r'//div[@id="d_content"]'
        title = response.xpath(r'//div[@class="page"]/h1/text()').extract_first(default='').strip()
        content = response.xpath(regex).xpath('string(.)').extract_first(default='').strip()
        if (title == '') and (content == ''):
            logging.warning(f'{item["url"]}: title and content are empty')
            return
        item['summary'] = content[:100] if (content != '') else title
        try:
            content = etree.tostring(selector.xpath(regex)[0], encoding='utf-8')
        except Exception as err:
            logging.error(f'{item["url"]}: get content failed')
            return
        item['content'] = content.decode('utf-8').replace('\xa0', '')
        item['title'] = title
        yield item
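
# Note on the pattern in CdhtSpider.parse: the regex is UTF-8-encoded and
# matched against response.body (bytes) rather than response.text, which
# sidesteps decoding; the trailing `|$` alternative guarantees re.findall
# always yields at least one (possibly empty) match, so the [0] index is safe.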
class ZgzzscxdSpider(scrapy.Spider):
    name = 'zgzzscxd'
    download_delay = 5
    max_page = 5
    mongo_col = get_col(MONGODB_COLLECTION)
    mongo_col.create_index([("unique_id", pymongo.DESCENDING),
                            ('origin', pymongo.DESCENDING)], unique=True)
    ntys = ['1', '3']
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Host': 'www.zgzzscxd.com',
        'Origin': 'http://www.zgzzscxd.com',
        'Pragma': 'no-cache',
        'Referer': 'http://www.zgzzscxd.com/NewsList.aspx?NTY=1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
    }
    custom_settings = {
        'LOG_FILE': f'logs/{name}.log',
        'ITEM_PIPELINES': {
            'gov_info.pipelines.GovInfoPipeline': 100,
        },
    }

    def start_requests(self):
        url = 'http://www.zgzzscxd.com/NewsList.aspx?NTY={}'
        for nty in self.ntys:
            yield scrapy.FormRequest(url.format(nty), method='GET', headers=self.headers)

    def parse(self, response):
        page_count = max([
            int(sel.xpath('text()').extract_first(default=''))
            for sel in response.xpath(r'//div[@id="AspNetPager1"]/a')
            if sel.xpath('text()').extract_first(default='').isdigit()
        ])
        page_count = min([page_count, self.max_page])
        for page in range(2, page_count + 1):
            # post back to the listing we came from so the NTY value is preserved
            url = response.url
            form_data = {
                '__VIEWSTATE': response.xpath(r'//*[@id="__VIEWSTATE"]/@value').extract()[0],
                '__EVENTTARGET': 'AspNetPager1',
                '__EVENTARGUMENT': str(page),
                'AspNetPager1_input': '2',
            }
            yield scrapy.FormRequest(url, method='POST', headers=self.headers,
                                     formdata=form_data, callback=self.parse_page)
        for request in self.parse_page(response):
            yield request

    def parse_page(self, response):
        base_url = 'http://www.zgzzscxd.com/'
        for sel in response.xpath('//li[@class="clearfix"]'):
            item = GovInfoItem()
            link = sel.xpath('a/@href').extract_first(default='').strip()
            title = sel.xpath('a/@title').extract_first(default='').strip()
            lst = [link, title]
            if not all(lst):
                logging.warning(f'{response.url}: get data failed')
                continue
            url = base_url + link
            unique_id = get_md5(url)
            if self.mongo_col.find_one({'$and': [{'unique_id': unique_id},
                                                 {'origin': f'{self.name}'}]}):
                logging.warning(f'{url} is downloaded already, unique_id: {unique_id}')
                continue
            item['url'] = url
            item['unique_id'] = unique_id
            item['title'] = title
            item['source'] = '成都市经济和信息化委员会'
            item['origin'] = self.name
            item['type'] = 'web'
            item['location'] = '成都市'
            item['crawled'] = 1
            yield scrapy.FormRequest(url, method='GET', headers=self.headers,
                                     meta={'item': item}, callback=self.parse_item)

    def parse_item(self, response):
        item = response.meta['item']
        selector = etree.HTML(response.body)
        regex = r'//div[@class="aa5"]'
        date = response.xpath(r'//div[@class="aa2"]/span[2]/text()').extract_first(default='').strip()
        content = response.xpath(regex).xpath('string(.)').extract_first(default='').strip()
        if date == '':
            logging.warning(f'{item["url"]}: date is empty')
            return
        date = date.replace('：', ':').split(':', 1)[-1].replace('/', '-')
        date = time.strftime("%Y-%m-%d %H:%M:%S",
                             time.strptime(date, "%Y-%m-%d %H:%M:%S"))
        item['summary'] = content[:100] if (content != '') else item['title']
        try:
            content = etree.tostring(selector.xpath(regex)[0], encoding='utf-8')
        except Exception as err:
            logging.error(f'{item["url"]}: get content failed')
            return
        item['content'] = content.decode('utf-8').replace('\xa0', '')
        item['date'] = date
        yield item
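
# The POST in ZgzzscxdSpider.parse emulates an ASP.NET WebForms pager
# postback: __EVENTTARGET names the AspNetPager1 control, __EVENTARGUMENT
# carries the requested page number, and __VIEWSTATE echoes the state blob
# scraped from the page just received, which the server expects back verbatim.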
class SczwfwSpider(scrapy.Spider):
    name = 'sczwfw'
    download_delay = 5
    max_page = 5
    mongo_col = get_col(MONGODB_COLLECTION)
    mongo_col.create_index([("unique_id", pymongo.DESCENDING),
                            ('origin', pymongo.DESCENDING)], unique=True)
    start_urls = [
        'http://www.sczwfw.gov.cn:82/10000/10006/10008/index.shtml',
        'http://www.sczwfw.gov.cn:82/10000/10006/10002/index.shtml',
        'http://www.sczwfw.gov.cn:82/10000/10010/index.shtml',
        'http://www.sczwfw.gov.cn:82/10000/10003/index.shtml',
    ]
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Host': 'www.sczwfw.gov.cn:82',
        'Pragma': 'no-cache',
        'Referer': 'http://www.sczwfw.gov.cn/policylist.aspx?news=tzgg',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
    }
    custom_settings = {
        'LOG_FILE': f'logs/{name}.log',
        'ITEM_PIPELINES': {
            'gov_info.pipelines.GovInfoPipeline': 100,
        },
    }

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.FormRequest(url, method='GET', headers=self.headers,
                                     meta={'base_url': url})

    def parse(self, response):
        page_count = response.xpath(r'//input[@id="hPageCount"]/@value').extract_first(default='').strip()
        if page_count in [b'', '']:
            raise Exception('get page count failed')
        base_url = response.meta['base_url']
        page_count = min([int(page_count), self.max_page])
        for page in range(1, page_count):
            url = base_url.replace('index', 'index_{}').format(page)
            yield scrapy.FormRequest(url, method='GET', headers=self.headers,
                                     callback=self.parse_page)
        for request in self.parse_page(response):
            yield request

    def parse_page(self, response):
        base_url = 'http://www.sczwfw.gov.cn:82'
        for sel in response.xpath('//div[@class="news_r"]//li'):
            item = GovInfoItem()
            link = sel.xpath('.//a/@href').extract_first(default='').strip()
            title = sel.xpath('.//a/@title').extract_first(default='').strip()
            date = sel.xpath('.//em/text()').extract_first(default='').strip()
            lst = [link, title, date]
            if not all(lst):
                logging.warning(f'{response.url}: get data failed')
                continue
            url = base_url + link
            unique_id = get_md5(url)
            if self.mongo_col.find_one({'$and': [{'unique_id': unique_id},
                                                 {'origin': f'{self.name}'}]}):
                logging.warning(f'{url} is downloaded already, unique_id: {unique_id}')
                continue
            if len(date) == 10:
                now = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
                date += ' ' + now.split(' ')[-1]
            item['url'] = url
            item['unique_id'] = unique_id
            item['title'] = title
            item['source'] = '四川省人民政府办公厅'
            item['origin'] = self.name
            item['date'] = date
            item['type'] = 'web'
            item['location'] = '四川省'
            item['crawled'] = 1
            yield scrapy.FormRequest(url, method='GET', headers=self.headers,
                                     meta={'item': item}, callback=self.parse_item)

    def parse_item(self, response):
        item = response.meta['item']
        selector = etree.HTML(response.body)
        regex = r'//div[@class="deta_ct"]'
        content = response.xpath(regex).xpath('string(.)').extract_first(default='').strip()
        # if content == '':
        #     logging.warning(f'{item["url"]}: date or content is none')
        #     return
        summary = content[:100] if (content != '') else item['title']
        try:
            content = etree.tostring(selector.xpath(regex)[0], encoding='utf-8')
        except Exception as err:
            logging.error(f'{item["url"]}: get content failed')
            return
        item['summary'] = summary.replace('\xa0', '').replace('em{font-style:normal;}', '')
        item['content'] = content.decode('utf-8').replace('\xa0', '').replace('em{font-style:normal;}', '')
        yield item
class CdsjxwSpider(scrapy.Spider):
    name = 'cdsjxw'
    download_delay = 5
    max_page = 5
    mongo_col = get_col(MONGODB_COLLECTION)
    mongo_col.create_index([("unique_id", pymongo.DESCENDING),
                            ('origin', pymongo.DESCENDING)], unique=True)
    base_url = 'http://www.cdgy.gov.cn'
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Host': 'www.cdgy.gov.cn',
        'Pragma': 'no-cache',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36',
    }
    custom_settings = {
        'COOKIES_ENABLED': True,
        'LOG_FILE': f'logs/{name}.log',
        'ITEM_PIPELINES': {
            'gov_info.pipelines.GovInfoPipeline': 100,
        },
    }

    def start_requests(self):
        url = 'http://www.cdgy.gov.cn/cdsjxw/c132946/zwxx.shtml'
        yield scrapy.FormRequest(url, method='GET', headers=self.headers)

    def parse(self, response):
        page_count = re.findall(
            r'createPageHTML\(\'page_div\',(\d+),.*?\)|$'.encode('utf-8'),
            response.body)[0]
        if page_count == b'':
            raise Exception('get page count failed')
        referer = response.url
        page_count = min(int(page_count), self.max_page)
        base_url = 'http://www.cdgy.gov.cn/cdsjxw/c132946/zwxx_{}.shtml'
        for i in range(2, int(page_count) + 1):
            headers = copy.deepcopy(self.headers)
            headers.update({'referer': referer})
            url = base_url.format(i)
            referer = url
            # pass the per-request headers (with the rolling referer), not self.headers
            yield scrapy.FormRequest(url, method='GET', headers=headers,
                                     callback=self.parse_page)
        for request in self.parse_page(response):
            yield request

    def parse_page(self, response):
        regex = '//div[@class="newlist_left_cont"]/ul'
        for sel in response.xpath(regex):
            link = sel.xpath(r'li[1]/a/@href').extract_first(default='').strip()
            text = sel.xpath(r'li[2]/text()').extract_first(default='').strip()
            title = sel.xpath(r'li[1]/a/@title').extract_first(default='').strip()
            lst = [link, title, text]
            if not all(lst):
                logging.warning(f'{response.url}.{link}: get data failed')
                continue
            if 'http' not in link:
                url = self.base_url + link
            else:
                url = link
            if 'cdgy.gov.cn' not in url:
                logging.warning(f'{url} is outside the domain')
                continue
            unique_id = get_md5(url)
            if self.mongo_col.find_one({'$and': [{'unique_id': unique_id},
                                                 {'origin': f'{self.name}'}]}):
                logging.warning(f'{url} is downloaded already, unique_id: {unique_id}')
                continue
            text = ''.join(text.split())
            text = re.sub(r'\s|:|：', '', text)
            promulgator, date = re.findall(r'来源(.*?)发布时间(\d{4}-\d{2}-\d{2})', text)[0]
            if len(date) == 10:
                now = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
                date += ' ' + now.split(' ')[-1]
            item = GovInfoItem()
            item['url'] = url
            item['unique_id'] = unique_id
            item['title'] = title.strip()
            item['source'] = promulgator
            item['date'] = date
            item['origin'] = self.name
            item['type'] = 'web'
            item['location'] = '成都市'
            item['crawled'] = 1
            yield scrapy.FormRequest(url, method='GET', headers=self.headers,
                                     meta={'item': item}, callback=self.parse_item)

    def parse_item(self, response):
        selector = etree.HTML(response.body)
        item = response.meta['item']
        regexs = [r'//div[@id="top"]', r'//div[@class="main-show-txt"]']
        for regex in regexs:
            content = response.xpath(regex).xpath('string(.)').extract_first(default='').strip()
            item['summary'] = content[:100] if (content != '') else item['title']
            try:
                content = etree.tostring(selector.xpath(regex)[0], encoding='utf-8')
            except Exception as err:
                continue
            else:
                break
        else:
            logging.error(f'{item["url"]}: get content failed')
            return
        item['content'] = content.decode('utf-8').replace('\xa0', '')
        yield item
class CdkjfwSpider(scrapy.Spider):
    name = 'cdkjfw'
    download_delay = 5
    max_page = 5
    mongo_col = get_col(MONGODB_COLLECTION)
    mongo_col.create_index([("unique_id", pymongo.DESCENDING),
                            ('origin', pymongo.DESCENDING)], unique=True)
    start_urls = [
        'http://www.cdkjfw.com/list/dynamic.html?Id=@bGMZdxP2kmJSJ/qulgXZw==',
        'http://www.cdkjfw.com/list/dynamic.html?Id=0eBSLw868g7MG9PJmfLY0w=='
    ]
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,und;q=0.7',
        'Cache-Control': 'no-cache',
        'Host': 'www.cdkjfw.com',
        'Pragma': 'no-cache',
        'Proxy-Connection': 'keep-alive',
        'Referer': 'http://www.cdkjfw.com/list/dynamic.html?Id=@bGMZdxP2kmJSJ/qulgXZw==&page=2',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
    }
    custom_settings = {
        'LOG_FILE': f'logs/{name}.log',
        'ITEM_PIPELINES': {
            'gov_info.pipelines.GovInfoPipeline': 100,
        },
    }

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url, method='GET', headers=self.headers, meta={'url': url})

    def parse(self, response):
        page_count = max([
            int(sel.xpath('text()').extract_first(default=''))
            for sel in response.xpath(r'//div[@id="Pages"]/a')
            if sel.xpath('text()').extract_first(default='').isdigit()
        ])
        page_count = min([page_count, self.max_page])
        base_url = response.meta['url']
        for page in range(2, page_count + 1):
            url = base_url + f'&page={page}'
            yield scrapy.FormRequest(url, method='GET', headers=self.headers,
                                     callback=self.parse_page)
        for request in self.parse_page(response):
            yield request

    def parse_page(self, response):
        base_url = 'http://www.cdkjfw.com'
        regex = '//ul[@class="point-list"]/li'
        for sel in response.xpath(regex):
            link = sel.xpath(r'div/a/@href').extract_first(default='').strip()
            title = sel.xpath(r'div/a/text()').extract_first(default='').strip()
            date = sel.xpath(r'div/span/text()').extract_first(default='').strip()
            lst = [link, title, date]
            if not all(lst):
                logging.warning(f'{response.url}--{link}: get data failed')
                continue
            url = base_url + link
            unique_id = get_md5(url)
            if self.mongo_col.find_one({'$and': [{'unique_id': unique_id},
                                                 {'origin': f'{self.name}'}]}):
                logging.warning(f'{url} is downloaded already, unique_id: {unique_id}')
                continue
            date = date.strip('[').strip(']')
            if len(date) == 10:
                now = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
                date += ' ' + now.split(' ')[-1]
            item = GovInfoItem()
            item['url'] = url
            item['unique_id'] = unique_id
            item['title'] = title
            item['source'] = '成都生产力促进中心'
            item['date'] = date
            item['origin'] = self.name
            item['type'] = 'web'
            item['location'] = '成都市'
            item['crawled'] = 1
            yield scrapy.FormRequest(url, method='GET', headers=self.headers,
                                     meta={'item': item}, callback=self.parse_item)

    def parse_item(self, response):
        selector = etree.HTML(response.body)
        item = response.meta['item']
        regex = r'//div[@class="artical_content"]'
        content = response.xpath(regex).xpath('string(.)').extract_first(default='').strip()
        # if content == '':
        #     logging.warning(f'{item["url"]}: content is none')
        #     return
        item['summary'] = content[:100] if (content != '') else item['title']
        try:
            content = etree.tostring(selector.xpath(regex)[0], encoding='utf-8')
        except Exception as err:
            logging.error(f'{item["url"]}: get content failed')
            return
        item['content'] = content.decode('utf-8').replace('\xa0', '')
        yield item
class InnocomSpider(scrapy.Spider):
    name = 'innocom'
    download_delay = 5
    max_page = 5
    mongo_col = get_col(MONGODB_COLLECTION)
    mongo_col.create_index([("unique_id", pymongo.DESCENDING),
                            ('origin', pymongo.DESCENDING)], unique=True)
    start_urls = [
        'http://www.innocom.gov.cn/gxjsqyrdw/xxtg/list.shtml',
    ]
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,und;q=0.7',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Host': 'www.innocom.gov.cn',
        'Pragma': 'no-cache',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
    }
    custom_settings = {
        'LOG_FILE': f'logs/{name}.log',
        'ITEM_PIPELINES': {
            'gov_info.pipelines.GovInfoPipeline': 100,
        },
    }

    def parse(self, response):
        base_url = 'http://www.innocom.gov.cn/gxjsqyrdw/xxtg/list_{}.shtml'
        page_count = re.findall(
            r'createPageHTML\(\'page_div\',(\d+),.*?\)|$'.encode('utf-8'),
            response.body)[0]
        if page_count in [b'', '']:
            raise Exception('get page count failed')
        page_count = min(int(page_count), self.max_page)
        for page in range(2, page_count + 1):
            url = base_url.format(page)
            yield scrapy.FormRequest(url, method='GET', headers=self.headers,
                                     callback=self.parse_page)
        for request in self.parse_page(response):
            yield request

    def parse_page(self, response):
        base_url = 'http://www.innocom.gov.cn'
        regex = '//div[@class="list7"]/ul/li'
        for sel in response.xpath(regex):
            link = sel.xpath(r'a/@href').extract_first(default='').strip()
            title = sel.xpath(r'a/text()').extract_first(default='').strip()
            date = sel.xpath(r'span/text()').extract_first(default='').strip()
            lst = [link, title, date]
            if not all(lst):
                logging.warning(f'{response.url}--{link}: get data failed')
                continue
            url = base_url + link
            unique_id = get_md5(url)
            if self.mongo_col.find_one({'$and': [{'unique_id': unique_id},
                                                 {'origin': f'{self.name}'}]}):
                logging.warning(f'{url} is downloaded already, unique_id: {unique_id}')
                continue
            date = date.strip('[').strip(']')
            if len(date) == 10:
                now = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
                date += ' ' + now.split(' ')[-1]
            item = GovInfoItem()
            item['url'] = url
            item['unique_id'] = unique_id
            item['title'] = title
            item['source'] = '科技部火炬高技术产业开发中心'
            item['date'] = date
            item['origin'] = self.name
            item['type'] = 'web'
            item['location'] = '高新区'
            item['crawled'] = 1
            yield scrapy.FormRequest(url, method='GET', headers=self.headers,
                                     meta={'item': item}, callback=self.parse_item)

    def parse_item(self, response):
        selector = etree.HTML(response.body)
        item = response.meta['item']
        regex = r'//div[@id="content"]'
        content = response.xpath(regex).xpath('string(.)').extract_first(default='').strip()
        # if content == '':
        #     logging.warning(f'{item["url"]}: content is none')
        #     return
        item['summary'] = content[:100] if (content != '') else item['title']
        try:
            content = etree.tostring(selector.xpath(regex)[0], encoding='utf-8')
        except Exception as err:
            logging.error(f'{item["url"]}: get content failed')
            return
        item['content'] = content.decode('utf-8').replace('\xa0', '')
        yield item
class CdhrsipSpider(scrapy.Spider):
    name = 'cdhrsip'
    download_delay = 5
    max_page = 5
    mongo_col = get_col(MONGODB_COLLECTION)
    mongo_col.create_index([("unique_id", pymongo.DESCENDING),
                            ('origin', pymongo.DESCENDING)], unique=True)
    type_lst = ['001', '101']
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,und;q=0.7',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Content-Length': '0',
        'Host': 'www.cdhrsip.com',
        'Origin': 'http://www.cdhrsip.com',
        'Pragma': 'no-cache',
        'Referer': 'http://www.cdhrsip.com/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
    }
    base_params = {
        'notifi': '1',
        'pageNo': '5',
        'pageSize': '10',
        'orderPublish': '1'
    }
    custom_settings = {
        'LOG_FILE': f'logs/{name}.log',
        'ITEM_PIPELINES': {
            'gov_info.pipelines.GovInfoPipeline': 100,
        },
    }

    def start_requests(self):
        for type_ in self.type_lst:
            for request in self.create_request(type_):
                yield request

    def parse(self, response):
        base_url = 'http://www.cdhrsip.com/article/newsInfo?id={}'
        json_data = json.loads(response.body)
        records = json_data['records']
        for record in records:
            url = base_url.format(record['id'])
            unique_id = get_md5(str(record['id']))
            if self.mongo_col.find_one({'$and': [{'unique_id': unique_id},
                                                 {'origin': f'{self.name}'}]}):
                logging.warning(f'{url} is downloaded already, unique_id: {unique_id}')
                continue
            selector = etree.HTML(record['content'])
            summary = selector.xpath('string(.)').strip()[:100]
            summary = summary if (summary not in [b'', '']) else record['title']
            date = record['publishTime']
            if len(date) == 10:
                now = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
                date += ' ' + now.split(' ')[-1]
            item = GovInfoItem()
            item['url'] = url
            item['unique_id'] = unique_id
            item['title'] = record['title']
            item['summary'] = summary
            item['source'] = record['author']
            item['date'] = date
            item['origin'] = self.name
            item['type'] = 'web'
            item['location'] = '成都市'
            item['crawled'] = 1
            item['content'] = record['content']
            yield item

    def create_request(self, type_):
        params = copy.deepcopy(self.base_params)
        params.update({'type': type_})
        url = 'http://www.cdhrsip.com/article/list'
        for page in range(1, self.max_page + 1):
            params.update({'pageNo': str(page)})
            yield scrapy.FormRequest(url, method='GET', formdata=params, headers=self.headers)
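
# CdhrsipSpider consumes a JSON listing API directly, so items are built from
# the listing response without a per-article detail request. Based on the
# fields read above, each record is expected to look roughly like:
#
#   {
#       "id": 123,
#       "title": "...",
#       "content": "<p>...</p>",      # HTML; also used to derive the summary
#       "author": "...",
#       "publishTime": "2018-07-20"   # date-only values get the current time appended
#   }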
class WxgzhTaskSpider(scrapy.Spider):
    name = 'wxgzh_task'
    download_delay = 5
    days = 5
    task_col = get_col('wxgzh_task')
    task_col.create_index([("unique_id", pymongo.DESCENDING),
                           ('origin', pymongo.DESCENDING)], unique=True)
    mongo_col = get_col(MONGODB_COLLECTION)
    mongo_col.create_index([("unique_id", pymongo.DESCENDING),
                            ('origin', pymongo.DESCENDING)], unique=True)
    redis_con = get_redis_client()
    redis_key = 'wxgzh'
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Host': 'weixin.sogou.com',
        'Pragma': 'no-cache',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36',
    }
    custom_settings = {
        'LOG_FILE': f'logs/{name}.log',
        'ITEM_PIPELINES': {
            'gov_info.pipelines.WxgzhTaskPipeline': 100,
        },
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 110,
            'gov_info.middlewares.WxgzhTaskRotateProxiesSpiderMiddleware': 100,
        },
        # # redis
        # 'SCHEDULER': "scrapy_redis.scheduler.Scheduler",
        # 'DUPEFILTER_CLASS': "gov_info.common.utils.MyRFPDupeFilter",
        # 'SCHEDULER_PERSIST': True,
        # 'REDIS_START_URLS_AS_SET': True
    }

    def start_requests(self):
        url = 'http://weixin.sogou.com/weixin'
        self.task_col.update({'crawled': {'$ne': 1}}, {'$set': {'crawled': 0}}, multi=True)
        for wx_info in WXINFOS:
            self.create_task(wx_info)
        while True:
            task = self.task_col.find_one_and_update({'crawled': 0}, {'$set': {'crawled': 2}})
            if task is None:
                break
            params = task['params']
            self.headers.update({'Referer': task['referer']})
            yield scrapy.FormRequest(url, method='GET', formdata=params,
                                     headers=self.headers, meta={'task': task})

    def parse(self, response):
        result = 1
        url = response.url
        task = response.meta['task']
        params = task['params']
        origin = task['origin']
        total = re.findall(r'找到约(\d+)条结果|$'.encode('utf-8'), response.body)[0]
        if total != b'' and int(total) > 10:
            result = -1
            logging.error(f'{url}: {params} too many results')
        self.task_col.update({'_id': task['_id']}, {"$set": {'crawled': result}})
        redis_values = []
        for sel in response.xpath(r'//li[contains(@id, "sogou_vr_")]'):
            item = GovInfoItem()
            unique_id = sel.xpath(r'./@d').extract_first(default='').strip()
            date = sel.xpath(r'div/div/@t').extract_first(default='').strip()
            source = sel.xpath(r'div/div/a/text()').extract_first(default='').strip()
            link = sel.xpath(r'div/h3/a/@href').extract_first(default='').strip()
            name = sel.xpath(r'div/div/a/text()').extract_first(default='').strip()
            lst = [unique_id, date, source, link]
            if not all(lst):
                result = -1
                logging.warning(f'{url}: {params}.{link}: get data failed')
                self.task_col.update({'_id': task['_id']}, {"$set": {'crawled': result}})
                continue
            unique_id = get_md5(unique_id)
            if self.mongo_col.find_one({'$and': [{'unique_id': unique_id},
                                                 {'origin': f'{origin}'}]}):
                logging.warning(f'{url} is downloaded already, unique_id: {unique_id}')
                continue
            if task['name'] != name:
                logging.warning(f'{url}: {params}.{link}: is not published by {task["name"]}')
                continue
            url = link.replace('http', 'https')
            item['url'] = url
            item['task_unique_id'] = task['unique_id']
            item['unique_id'] = unique_id
            try:
                item['summary'] = sel.xpath(r'./div/p[@class="txt-info"]')[0].xpath(
                    'string(.)').extract_first('').strip()
            except Exception as err:
                logging.warning(f'{url}: {params}.{link}: get summary failed')
                item['summary'] = ''
            item['date'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(int(date)))
            item['source'] = source
            item['origin'] = origin
            item['type'] = 'wxgzh'
            item['crawled'] = 0
            item['location'] = task['location']
            redis_values.append(json.dumps({'item': dict(item)}))
        if redis_values:
            self.redis_con.sadd("{}".format(self.redis_key), *redis_values)

    def create_task(self, wx_info):
        url = 'http://weixin.sogou.com/weixin'
        params = {
            'type': '2',
            'ie': 'utf8',
            'query': wx_info.name,
            'tsn': '5',
            'interation': '',
            'wxid': wx_info.wx_id,
            'usip': wx_info.name,
        }
        date = datetime.datetime.now()
        days = self.days
        while days >= 0:
            t = date.strftime("%Y-%m-%d")
            unique_id = f'{wx_info.origin}-{t}'
            data = {
                'url': url,
                'name': wx_info.name,
                'unique_id': unique_id,
                'origin': wx_info.origin,
                'params': params,
                'referer': wx_info.referer,
                'crawled': 0,
                'location': wx_info.location,
            }
            date -= datetime.timedelta(days=1)
            days -= 1
            params.update({'ft': t, 'et': t})
            try:
                self.task_col.insert(data)
            except DuplicateKeyError:
                # logging.warning(f'task unique_id {wx_info.origin}-{t} already exists')
                if datetime.datetime.now().strftime("%Y-%m-%d") == t:
                    self.task_col.update({'unique_id': unique_id}, {'$set': {'crawled': 0}})
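
# Each task document pushed into wxgzh_task by create_task has this shape
# (taken from the `data` dict above):
#
#   {
#       'url': 'http://weixin.sogou.com/weixin',
#       'name': wx_info.name,                    # public-account display name
#       'unique_id': f'{wx_info.origin}-{t}',    # one task per origin per day
#       'origin': wx_info.origin,
#       'params': params,                        # sogou search params incl. the ft/et date window
#       'referer': wx_info.referer,
#       'crawled': 0,                            # 0 pending, 2 claimed, 1 done, -1 failed
#       'location': wx_info.location,
#   }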
class WxgzhSpider(RedisSpider):
    name = 'wxgzh'
    download_delay = 5
    handle_httpstatus_list = [403, 408, 564, 503]
    task_col = get_col('wxgzh_task')
    mongo_col = get_col(MONGODB_COLLECTION)
    redis_con = get_redis_client()
    redis_key = 'wxgzh'
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Host': 'mp.weixin.qq.com',
        'Pragma': 'no-cache',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36',
    }
    custom_settings = {
        'DOWNLOAD_TIMEOUT': 60,
        'LOG_FILE': f'logs/{name}.log',
        'ITEM_PIPELINES': {
            'gov_info.pipelines.GovInfoPipeline': 100,
            # 'scrapy_redis.pipelines.RedisPipeline': 200,
        },
        'DOWNLOADER_MIDDLEWARES': {
            'gov_info.middlewares.RotateUserAgentMiddleware': 400,
            'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 110,
            'gov_info.middlewares.WxgzhSpiderMiddleware': 100,
        },
        # redis
        'SCHEDULER': "scrapy_redis.scheduler.Scheduler",
        'DUPEFILTER_CLASS': "gov_info.common.utils.MyRFPDupeFilter",
        'SCHEDULER_PERSIST': True,
        'REDIS_START_URLS_AS_SET': True
    }

    # def start_requests(self):
    #     self.mongo_col.update({'$and': [{'type': 'wxgzh'}, {'crawled': {'$ne': 1}}]},
    #                           {'$set': {'crawled': 0}}, multi=True)
    #     while True:
    #         task = self.mongo_col.find_one_and_update({'crawled': 0}, {'$set': {'crawled': 2}})
    #         if task is None:
    #             time.sleep(5)
    #             continue
    #         yield scrapy.FormRequest(task['url'], method='GET', headers=self.headers,
    #                                  meta={'task': task})

    def parse(self, response):
        json_data = response.meta['json_data']
        item = GovInfoItem(json_data['item'])
        selector = etree.HTML(response.body)
        regex = r'//*[@id="js_content"]'
        title = response.xpath(r'//*[@id="activity-name"]/text()').extract_first(default='').strip()
        content = response.xpath(regex).xpath('string(.)').extract_first('').strip()
        if (content == '') and (title == ''):
            logging.warning(f'{item["url"]}: title and content are empty')
            self.task_col.update({'unique_id': item['task_unique_id']},
                                 {"$set": {'crawled': -1}})
            return
        if item['summary'] == '':
            # derive the summary from the plain text before `content` is
            # replaced by the serialized bytes below
            item['summary'] = content[:100] if (content != '') else title
        try:
            content = etree.tostring(selector.xpath(regex)[0], encoding='utf-8')
        except Exception as err:
            logging.error(f'{item["url"]}: get content failed')
            return
        item['content'] = content.decode('utf-8').replace('\xa0', '')
        item['title'] = title.strip()
        item['crawled'] = 1
        yield item
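
# WxgzhSpider runs as a scrapy_redis RedisSpider: WxgzhTaskSpider sadd's JSON
# blobs of the form {'item': {...}} into the shared 'wxgzh' redis key, and the
# configured middleware/dupefilter (defined outside this excerpt) is assumed to
# turn each blob into a request carrying meta['json_data'], which parse() reads
# back to finish the half-built item with the article title and content.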