Example #1
 def __init__(self):
     self.mongo_col = get_col(MONGODB_COLLECTION)
     self.mysql_Client = MysqlClient(
         host=MYSQL_SERVER,
         port=MYSQL_PORT,
         database=MYSQL_DB,
         user=MYSQL_USER,
         password=MYSQL_PASSWORD
     )
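The get_col helper (and the MONGODB_* / MYSQL_* constants) come from elsewhere in the project and are not shown. A minimal sketch of what get_col likely looks like, assuming pymongo and hypothetical MONGODB_URI and MONGODB_DB config constants:

import pymongo

def get_col(collection_name):
    # Return a handle to collection_name in the configured MongoDB database.
    # MONGODB_URI and MONGODB_DB are assumed config constants, not from the source.
    client = pymongo.MongoClient(MONGODB_URI)
    return client[MONGODB_DB][collection_name]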
Example #2
 def update_handle_task(self, task, status, retry=5):
     while retry > 0:
         try:
             self.mongo_col.update({'_id': task['_id']}, {'$set': {'handled': status}})
         except Exception as err:
             logging.error(f'{err}, {retry} retries left')
             retry -= 1
             self.mongo_col = get_col(MONGODB_COLLECTION)
         else:
             break
         time.sleep(1)
     else:
         logging.error(f'update handle task failed, {task["_id"]}--{status}, retry limit reached')
Example #3
 def get_handle_task(self, retry=5):
     task = None
     while retry > 0:
         try:
             task = self.mongo_col.find_one_and_update(
                 {'$and': [{'crawled': 1}, {'handled': 0}]}, {'$set': {'handled': 2}})
         except PyMongoError as err:
             logging.error(f'{err}, {retry} retries left')
             retry -= 1
             self.mongo_col = get_col(MONGODB_COLLECTION)
         else:
             break
         time.sleep(1)
     else:
         logging.error('get handle task failed, retry limit reached')
     return task
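A hedged sketch of how the two retry-wrapped helpers above might be driven together; the run method and the handle function are assumptions for illustration, not part of the source:

 def run(self):
     # Drain tasks that were crawled (crawled=1) but not yet handled (handled=0).
     while True:
         task = self.get_handle_task()
         if task is None:
             break
         ok = handle(task)  # hypothetical per-task processing function
         self.update_handle_task(task, 1 if ok else 0)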
Example #4
class CdmbcSpider(scrapy.Spider):
    name = 'cdmbc'
    download_delay = 5
    max_page = 5
    mongo_col = get_col(MONGODB_COLLECTION)
    mongo_col.create_index([("unique_id", pymongo.DESCENDING), ('origin', pymongo.DESCENDING)], unique=True)
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Pragma': 'no-cache',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
    }

    custom_settings = {
        'LOG_FILE': f'logs/{name}.log',
        'ITEM_PIPELINES': {
            'gov_info.pipelines.GovInfoPipeline': 100,
        },
    }

    def start_requests(self):
        headers = copy.deepcopy(self.headers)
        headers.update({
            'X-Requested-With': 'XMLHttpRequest',
            'Host': 'swgl.cdmbc.gov.cn'
        })
        url = 'http://swgl.cdmbc.gov.cn/egrantweb/notice/noticeList?flag=grid&noticeType=3'
        yield scrapy.FormRequest(url, method='GET', headers=headers)

    def parse(self, response):
        json_data = json.loads(json.dumps(xmltodict.parse(response.body)))
        page_count = json_data['rows'].get('total', None)
        if page_count is None:
            logging.error('get page_count failed')
            return
        form_data = {
            '_search': 'false',
            'nd': str(int(time.time()*1000)),
            'rows': '10',
            'sidx': '',
            'sord': 'desc',
            'searchString': '',
        }
        headers = copy.deepcopy(self.headers)
        headers.update({
            'X-Requested-With': 'XMLHttpRequest',
            'Host': 'swgl.cdmbc.gov.cn'
        })
        page_count = min(int(page_count), self.max_page)
        url = 'http://swgl.cdmbc.gov.cn/egrantweb/notice/noticeList?flag=grid&noticeType=3'
        for page in range(1, page_count+1):
            form_data.update({'page': str(page)})
            yield scrapy.FormRequest(url, method='POST', headers=headers,
                                     formdata=form_data, callback=self.parse_page)

    def parse_page(self, response):
        headers = copy.deepcopy(self.headers)
        headers.update({'Host': 'www.cdmbc.gov.cn'})
        json_data = json.loads(json.dumps(xmltodict.parse(response.body)))
        for row in json_data['rows']['row']:
            url = re.findall(r'href="(.*?)"|$', row['cell'][0])[0]
            if url == '' or 'cdmbc' not in url:
                logging.warning(f'{response.url}--{url}: get data failed')
                continue
            date = row['cell'][1]
            unique_id = get_md5(url)
            if self.mongo_col.find_one({'$and': [{'unique_id': unique_id}, {'origin': f'{self.name}'}]}):
                logging.warning(f'{url} is download already, unique_id: {unique_id}')
                continue
            date = date.strip('[').strip(']')
            if len(date) == 10:
                now = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
                date += ' ' + now.split(' ')[-1]
            item = GovInfoItem()
            item['url'] = url
            item['unique_id'] = unique_id
            item['source'] = '成都市商务委'
            item['date'] = date
            item['origin'] = self.name
            item['type'] = 'web'
            item['location'] = '成都市'
            item['crawled'] = 1
            yield scrapy.FormRequest(url, method='GET', headers=headers,
                                     meta={'item': item}, callback=self.parse_item)

    def parse_item(self, response):
        item = response.meta['item']
        selector = etree.HTML(response.body)
        regex = r'//div[@id="detail"]'
        title = response.xpath(r'//div[@class="detailBox"]/h2/text()').extract_first(default='').strip()
        content = response.xpath(regex).xpath('string(.)').extract_first(default='').strip()
        if (title == '') and (content == ''):
            logging.warning(f'{item["url"]}: title and content is none')
            return
        item['summary'] = content[:100] if (content != '') else title
        try:
            content = etree.tostring(selector.xpath(regex)[0], encoding='utf-8')
        except Exception as err:
            logging.error(f'{item["url"]}: get content failed')
            return
        item['content'] = content.decode('utf-8').replace('\n', '')
        item['title'] = title
        yield item
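Every spider in these examples builds unique_id from a get_md5 helper that is not shown. A minimal sketch, assuming it is simply the hex MD5 digest of the UTF-8 encoded string:

import hashlib

def get_md5(text):
    # Hex MD5 digest of a unicode string, used as the dedup key (unique_id).
    return hashlib.md5(text.encode('utf-8')).hexdigest()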
Example #5
class CsidcSpider(scrapy.Spider):
    name = 'csidc'
    download_delay = 5
    max_page = 5
    mongo_col = get_col(MONGODB_COLLECTION)
    mongo_col.create_index([("unique_id", pymongo.DESCENDING),
                            ('origin', pymongo.DESCENDING)],
                           unique=True)
    base_url = 'http://125.70.9.164:1250/csidc/web/list.jsp?lm_id=notice'
    start_urls = [
        base_url,
    ]
    headers = {
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding':
        'gzip, deflate',
        'Accept-Language':
        'zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7',
        'Cache-Control':
        'no-cache',
        'Connection':
        'keep-alive',
        # 'Host': '125.70.9.164:1250',
        'Pragma':
        'no-cache',
        'Upgrade-Insecure-Requests':
        '1',
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
    }

    custom_settings = {
        'LOG_FILE': f'logs/{name}.log',
        'ITEM_PIPELINES': {
            'gov_info.pipelines.GovInfoPipeline': 100,
        },
    }

    def parse(self, response):
        page_count = re.findall(r'\d+/(\d+)\s*页|$', response.text)[0]
        if page_count in [b'', '']:
            raise Exception('get page count failed')

        page_count = min(int(page_count), self.max_page)
        for page in range(2, int(page_count) + 1):
            url = self.base_url
            form_data = {'pageid': str(page)}
            yield scrapy.FormRequest(url,
                                     method='POST',
                                     formdata=form_data,
                                     headers=self.headers,
                                     callback=self.parse_page)

        for request in self.parse_page(response):
            yield request

    def parse_page(self, response):
        base_url = 'http://125.70.9.164:1250/csidc/csidc2/site/noticeView.jsp?id='
        regex = r'//tr[@class="btd"]'
        for sel in response.xpath(regex):
            link = sel.xpath(r'td[1]/a/@href').re(
                r'javascript:display\(\'(.*?)\'\)')
            title = sel.xpath(r'td[1]/a/text()').extract_first(
                default='').strip()
            date = sel.xpath(r'td[3]/text()').extract_first(default='').strip()
            lst = [link, date]
            if not all(lst):
                logging.warning(f'{response.url}.{link}: get data failed')
                continue
            url = base_url + link[0]
            unique_id = get_md5(url)
            if self.mongo_col.find_one({
                    '$and': [{
                        'unique_id': unique_id
                    }, {
                        'origin': f'{self.name}'
                    }]
            }):
                logging.warning(
                    f'{url} is download already, unique_id: {unique_id}')
                continue
            date = time.strftime("%Y-%m-%d %H:%M:%S",
                                 time.strptime(date, "%Y-%m-%d %H:%M"))
            item = GovInfoItem()
            item['url'] = url
            item['unique_id'] = unique_id
            item['date'] = date
            item['title'] = title
            item['origin'] = self.name
            item['source'] = '成都市经济和信息化委员会'
            item['type'] = 'web'
            item['location'] = '成都市'
            item['crawled'] = 1
            yield scrapy.FormRequest(url,
                                     method='GET',
                                     headers=self.headers,
                                     meta={'item': item},
                                     callback=self.parse_item)

    def parse_item(self, response):
        selector = etree.HTML(response.body)
        item = response.meta['item']
        regex = r'/html/body/table[2]/tr[3]/td/table/tr[3]/td'
        content = response.xpath(regex).xpath('string(.)').extract_first(
            default='').strip()
        # if content == '':
        #     logging.warning(f'{item["url"]}: content is none')
        #     return
        item['summary'] = content[:100] if (content != '') else item['title']
        try:
            content = etree.tostring(selector.xpath(regex)[0],
                                     encoding='utf-8')
        except Exception as err:
            logging.error(f'{item["url"]}: get content failed')
            return
        item['content'] = content.decode('utf-8').replace('\n', '')
        yield item
Example #6
class CdsmeSpider(scrapy.Spider):
    name = 'cdsme'
    download_delay = 5
    max_page = 5
    mongo_col = get_col(MONGODB_COLLECTION)
    mongo_col.create_index([("unique_id", pymongo.DESCENDING), ('origin', pymongo.DESCENDING)], unique=True)
    start_urls = [
        'http://www.cdsme.com/list.aspx?id=79&Model_id=9&page={}',
        'http://www.cdsme.com/search.aspx?m=zhengcexinxi&page={}',
    ]
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,und;q=0.7',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Host': 'www.cdsme.com',
        'Pragma': 'no-cache',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
    }

    custom_settings = {
        'LOG_FILE': f'logs/{name}.log',
        'ITEM_PIPELINES': {
            'gov_info.pipelines.GovInfoPipeline': 100,
        },
    }

    def start_requests(self):
        for url in self.start_urls:
            for page in range(1, self.max_page + 1):
                yield scrapy.FormRequest(url.format(page), method='GET', headers=self.headers)

    def parse(self, response):
        base_url = 'http://www.cdsme.com'
        regex = '//div[@class="clearFix MN_A1_box"]'
        for sel in response.xpath(regex):
            link = sel.xpath(r'div[1]/a/@href').extract_first(default='').strip()
            summary = sel.xpath(r'div[2]/div/p/text()').extract_first(default='').strip()
            date = sel.xpath(r'div[1]/p/text()').extract_first(default='').strip()
            lst = [link, date]
            if not all(lst):
                logging.warning(f'{response.url}--{link}: get data failed')
                continue
            url = base_url + link
            unique_id = get_md5(url)
            if self.mongo_col.find_one({'$and': [{'unique_id': unique_id}, {'origin': f'{self.name}'}]}):
                logging.warning(f'{url} is download already, unique_id: {unique_id}')
                continue
            date = date.strip('[').strip(']')
            if len(date) == 10:
                now = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
                date += ' ' + now.split(' ')[-1]
            item = GovInfoItem()
            item['url'] = url
            item['unique_id'] = unique_id
            item['summary'] = summary
            item['source'] = '成都市中小企业网'
            item['date'] = date
            item['origin'] = self.name
            item['type'] = 'web'
            item['location'] = '成都市'
            item['crawled'] = 1
            yield scrapy.FormRequest(url, method='GET', headers=self.headers,
                                     meta={'item': item}, callback=self.parse_item)

    def parse_item(self, response):
        selector = etree.HTML(response.body)
        item = response.meta['item']
        regex = r'//div[@id="neirong"]'
        title = response.xpath(r'//div[@class="JR_A1_box"]/p/text()').extract_first(default='').strip()
        content = response.xpath(regex).xpath('string(.)').extract_first(default='').strip()
        if title == '' and content == '':
            logging.warning(f'{item["url"]}: title and content is none')
            return
        if item['summary'] == '':
            item['summary'] = content[:100] if (content != '') else item['title']
        try:
            content = etree.tostring(selector.xpath(regex)[0], encoding='utf-8')
        except Exception as err:
            logging.error(f'{item["url"]}: get content failed')
            return
        item['content'] = content.decode('utf-8').replace('\n', '')
        item['title'] = title
        yield item
Example #7
class CdstSpider(scrapy.Spider):
    name = 'cdst'
    download_delay = 5
    max_page = 5
    mongo_col = get_col(MONGODB_COLLECTION)
    mongo_col.create_index([("unique_id", pymongo.DESCENDING),
                            ('origin', pymongo.DESCENDING)],
                           unique=True)
    type_lst = [
        ('22', '41'),
        ('22', '190'),
    ]
    headers = {
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding':
        'gzip, deflate',
        'Accept-Language':
        'zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7',
        'Cache-Control':
        'no-cache',
        'Connection':
        'keep-alive',
        'Host':
        'www.cdst.gov.cn',
        'Pragma':
        'no-cache',
        'Upgrade-Insecure-Requests':
        '1',
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
    }

    custom_settings = {
        'LOG_FILE': f'logs/{name}.log',
        'ITEM_PIPELINES': {
            'gov_info.pipelines.GovInfoPipeline': 100,
        },
    }

    def start_requests(self):
        for type_ in self.type_lst:
            for request in self.create_request(type_):
                yield request

    def parse(self, response):
        base_url = 'http://www.cdst.gov.cn/Readnews.asp?NewsID={}'
        for sel in response.xpath(r'//div[@class="listline"]/li'):
            title = sel.xpath(r'a/@title').extract_first(default='').strip()
            unique_id = sel.xpath('a/@href').extract_first(default='').strip()
            date = sel.xpath(r'span/text()').extract_first(default='').strip()
            lst = [title, unique_id, date]
            if not all(lst):
                logging.warning(f'{response.url}: get data failed')
                continue
            url = base_url.format(unique_id.split('=')[-1])
            unique_id = get_md5(url)
            if self.mongo_col.find_one({
                    '$and': [{
                        'unique_id': unique_id
                    }, {
                        'origin': f'{self.name}'
                    }]
            }):
                logging.warning(
                    f'{url} is download already, unique_id: {unique_id}')
                continue
            date = date.strip('[').strip(']')
            if len(date) == 10:
                now = time.strftime('%Y-%m-%d %H:%M:%S',
                                    time.localtime(time.time()))
                date += ' ' + now.split(' ')[-1]
            item = GovInfoItem()
            item['url'] = url
            item['unique_id'] = unique_id
            item['title'] = title
            item['source'] = '成都市科学技术局'
            item['date'] = date
            item['origin'] = self.name
            item['type'] = 'web'
            item['location'] = '成都市'
            item['crawled'] = 1
            yield scrapy.FormRequest(url,
                                     method='GET',
                                     headers=self.headers,
                                     meta={'item': item},
                                     callback=self.parse_item)

    def parse_item(self, response):
        item = response.meta['item']
        selector = etree.HTML(response.body)
        regex = r'//div[@class="news_content"]'
        content = response.xpath(regex).xpath('string(.)').extract_first(
            default='').strip()
        # if content == '':
        #     logging.warning(f'{item["url"]}: content is none')
        #     return
        item['summary'] = content[:100] if (content != '') else item['title']
        try:
            content = etree.tostring(selector.xpath(regex)[0],
                                     encoding='utf-8')
        except Exception as err:
            logging.error(f'{item["url"]}: get content failed')
            return
        item['content'] = content.decode('utf-8').replace('\n', '')
        yield item

    def create_request(self, type_):
        params = {
            'TypeID': type_[0],
            'BigClassID': type_[1],
        }
        url = 'http://www.cdst.gov.cn/Type.asp'
        for page in range(1, self.max_page + 1):
            params.update({'page': str(page)})
            yield scrapy.FormRequest(url,
                                     method='GET',
                                     formdata=params,
                                     headers=self.headers)
Example #8
class CdhtSpider(scrapy.Spider):
    name = 'cdht'
    download_delay = 5
    max_page = 5
    mongo_col = get_col(MONGODB_COLLECTION)
    mongo_col.create_index([("unique_id", pymongo.DESCENDING),
                            ('origin', pymongo.DESCENDING)],
                           unique=True)
    headers = {
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding':
        'gzip, deflate',
        'Accept-Language':
        'zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7',
        'Cache-Control':
        'no-cache',
        'Connection':
        'keep-alive',
        'Host':
        'www.cdht.gov.cn',
        'Pragma':
        'no-cache',
        'Upgrade-Insecure-Requests':
        '1',
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36',
    }

    custom_settings = {
        'LOG_FILE': f'logs/{name}.log',
        'ITEM_PIPELINES': {
            'gov_info.pipelines.GovInfoPipeline': 100,
        },
    }

    def start_requests(self):
        url = 'http://www.cdht.gov.cn/zwgktzgg/index.jhtml'
        yield scrapy.FormRequest(url, method='GET', headers=self.headers)

    def parse(self, response):
        page_count = re.findall(r'共\d+条记录\s*\d+/(\d+)\s*页|$'.encode('utf-8'),
                                response.body)[0]
        if page_count == b'':
            raise Exception('get page count failed')

        base_url = 'http://www.cdht.gov.cn/zwgktzgg/index_{}.jhtml'
        page_count = min(int(page_count), self.max_page)
        for i in range(1, int(page_count) + 1):
            url = base_url.format(i)
            yield scrapy.FormRequest(url,
                                     method='GET',
                                     headers=self.headers,
                                     callback=self.parse_page)

    def parse_page(self, response):
        regex = '//div[@class="news-list-list"]/table[@class="table"]/tbody/tr'
        for sel in response.xpath(regex):
            link = sel.xpath(r'td[1]/a/@href').extract_first(
                default='').strip()
            title = sel.xpath(r'td[1]/a/text()').extract_first(
                default='').strip()
            source = sel.xpath(r'td[2]/text()').extract_first(
                default='').strip()
            date = sel.xpath(r'td[3]/span/text()').extract_first(
                default='').strip()
            lst = [link, title, source, date]
            if not all(lst):
                logging.warning(f'{response.url}: get data failed')
                continue
            url = link
            unique_id = get_md5(url)
            if self.mongo_col.find_one({
                    '$and': [{
                        'unique_id': unique_id
                    }, {
                        'origin': f'{self.name}'
                    }]
            }):
                logging.warning(
                    f'{url} is download already, unique_id: {unique_id}')
                continue
            date = date.strip('[').strip(']')
            if len(date) == 10:
                now = time.strftime('%Y-%m-%d %H:%M:%S',
                                    time.localtime(time.time()))
                date += ' ' + now.split(' ')[-1]
            item = GovInfoItem()
            item['url'] = url
            item['unique_id'] = unique_id
            item['source'] = source
            item['date'] = date
            item['origin'] = self.name
            item['type'] = 'web'
            item['location'] = '高新区'
            item['crawled'] = 1
            yield scrapy.FormRequest(url,
                                     method='GET',
                                     headers=self.headers,
                                     meta={'item': item},
                                     callback=self.parse_item)

    def parse_item(self, response):
        selector = etree.HTML(response.body)
        item = response.meta['item']
        regex = r'//div[@id="d_content"]'
        title = response.xpath(r'//div[@class="page"]/h1/text()'
                               ).extract_first(default='').strip()
        content = response.xpath(regex).xpath('string(.)').extract_first(
            default='').strip()
        if (title == '') and (content == ''):
            logging.warning(f'{item["url"]}: title and content is none')
            return
        item['summary'] = content[:100] if (content != '') else title
        try:
            content = etree.tostring(selector.xpath(regex)[0],
                                     encoding='utf-8')
        except Exception as err:
            logging.error(f'{item["url"]}: get content failed')
            return
        item['content'] = content.decode('utf-8').replace('\n', '')
        item['title'] = title
        yield item
Example #9
class ZgzzscxdSpider(scrapy.Spider):
    name = 'zgzzscxd'
    download_delay = 5
    max_page = 5
    mongo_col = get_col(MONGODB_COLLECTION)
    mongo_col.create_index([("unique_id", pymongo.DESCENDING),
                            ('origin', pymongo.DESCENDING)],
                           unique=True)
    ntys = ['1', '3']
    headers = {
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding':
        'gzip, deflate',
        'Accept-Language':
        'zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7',
        'Cache-Control':
        'no-cache',
        'Connection':
        'keep-alive',
        'Content-Type':
        'application/x-www-form-urlencoded',
        'Host':
        'www.zgzzscxd.com',
        'Origin':
        'http://www.zgzzscxd.com',
        'Pragma':
        'no-cache',
        'Referer':
        'http://www.zgzzscxd.com/NewsList.aspx?NTY=1',
        'Upgrade-Insecure-Requests':
        '1',
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
    }

    custom_settings = {
        'LOG_FILE': f'logs/{name}.log',
        'ITEM_PIPELINES': {
            'gov_info.pipelines.GovInfoPipeline': 100,
        },
    }

    def start_requests(self):
        url = 'http://www.zgzzscxd.com/NewsList.aspx?NTY={}'
        for nty in self.ntys:
            yield scrapy.FormRequest(url.format(nty),
                                     method='GET',
                                     headers=self.headers)

    def parse(self, response):
        page_count = max([
            int(sel.xpath('text()').extract_first(default=''))
            for sel in response.xpath(r'//div[@id="AspNetPager1"]/a')
            if sel.xpath('text()').extract_first(default='').isdigit()
        ])
        page_count = min([page_count, self.max_page])
        for page in range(2, page_count + 1):
            url = 'http://www.zgzzscxd.com/NewsList.aspx?NTY=1'
            form_data = {
                '__VIEWSTATE':
                response.xpath(r'//*[@id="__VIEWSTATE"]/@value').extract()[0],
                '__EVENTTARGET':
                'AspNetPager1',
                '__EVENTARGUMENT':
                str(page),
                'AspNetPager1_input':
                '2',
            }
            yield scrapy.FormRequest(url,
                                     method='POST',
                                     headers=self.headers,
                                     formdata=form_data,
                                     callback=self.parse_page)
        for request in self.parse_page(response):
            yield request

    def parse_page(self, response):
        base_url = 'http://www.zgzzscxd.com/'
        for sel in response.xpath('//li[@class="clearfix"]'):
            item = GovInfoItem()
            link = sel.xpath('a/@href').extract_first(default='').strip()
            title = sel.xpath('a/@title').extract_first(default='').strip()
            lst = [link, title]
            if not all(lst):
                logging.warning(f'{response.url}: get data failed')
                continue
            url = base_url + link
            unique_id = get_md5(url)
            if self.mongo_col.find_one({
                    '$and': [{
                        'unique_id': unique_id
                    }, {
                        'origin': f'{self.name}'
                    }]
            }):
                logging.warning(
                    f'{url} is download already, unique_id: {unique_id}')
                continue

            item['url'] = url
            item['unique_id'] = unique_id
            item['title'] = title
            item['source'] = '成都市经济和信息化委员会'
            item['origin'] = self.name
            item['type'] = 'web'
            item['location'] = '成都市'
            item['crawled'] = 1
            yield scrapy.FormRequest(url,
                                     method='GET',
                                     headers=self.headers,
                                     meta={'item': item},
                                     callback=self.parse_item)

    def parse_item(self, response):
        item = response.meta['item']
        selector = etree.HTML(response.body)
        regex = r'//div[@class="aa5"]'
        date = response.xpath(r'//div[@class="aa2"]/span[2]/text()'
                              ).extract_first(default='').strip()
        content = response.xpath(regex).xpath('string(.)').extract_first(
            default='').strip()
        if date == '':
            logging.warning(f'{item["url"]}: date is none')
            return
        date = date.replace(':', ':').split(':', 1)[-1].replace('/', '-')
        date = time.strftime("%Y-%m-%d %H:%M:%S",
                             time.strptime(date, "%Y-%m-%d %H:%M:%S"))
        item['summary'] = content[:100] if (content != '') else item['title']
        try:
            content = etree.tostring(selector.xpath(regex)[0],
                                     encoding='utf-8')
        except Exception as err:
            logging.error(f'{item["url"]}: get content failed')
            return
        item['content'] = content.decode('utf-8').replace('\n', '')
        item['date'] = date
        yield item
Example #10
class SczwfwSpider(scrapy.Spider):
    name = 'sczwfw'
    download_delay = 5
    max_page = 5
    mongo_col = get_col(MONGODB_COLLECTION)
    mongo_col.create_index([("unique_id", pymongo.DESCENDING),
                            ('origin', pymongo.DESCENDING)],
                           unique=True)
    start_urls = [
        'http://www.sczwfw.gov.cn:82/10000/10006/10008/index.shtml',
        'http://www.sczwfw.gov.cn:82/10000/10006/10002/index.shtml',
        'http://www.sczwfw.gov.cn:82/10000/10010/index.shtml',
        'http://www.sczwfw.gov.cn:82/10000/10003/index.shtml',
    ]
    headers = {
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding':
        'gzip, deflate',
        'Accept-Language':
        'zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7',
        'Cache-Control':
        'no-cache',
        'Connection':
        'keep-alive',
        'Host':
        'www.sczwfw.gov.cn:82',
        'Pragma':
        'no-cache',
        'Referer':
        'http://www.sczwfw.gov.cn/policylist.aspx?news=tzgg',
        'Upgrade-Insecure-Requests':
        '1',
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
    }

    custom_settings = {
        'LOG_FILE': f'logs/{name}.log',
        'ITEM_PIPELINES': {
            'gov_info.pipelines.GovInfoPipeline': 100,
        },
    }

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.FormRequest(url,
                                     method='GET',
                                     headers=self.headers,
                                     meta={'base_url': url})

    def parse(self, response):
        page_count = response.xpath(r'//input[@id="hPageCount"]/@value'
                                    ).extract_first(default='').strip()
        if page_count in [b'', '']:
            raise Exception('get page count failed')
        base_url = response.meta['base_url']
        page_count = min([int(page_count), self.max_page])
        for page in range(1, page_count):
            url = base_url.replace('index', 'index_{}').format(page)
            yield scrapy.FormRequest(url,
                                     method='GET',
                                     headers=self.headers,
                                     callback=self.parse_page)
        for request in self.parse_page(response):
            yield request

    def parse_page(self, response):
        base_url = 'http://www.sczwfw.gov.cn:82'
        for sel in response.xpath('//div[@class="news_r"]//li'):
            item = GovInfoItem()
            link = sel.xpath('.//a/@href').extract_first(default='').strip()
            title = sel.xpath('.//a/@title').extract_first(default='').strip()
            date = sel.xpath('.//em/text()').extract_first(default='').strip()
            lst = [link, title, date]
            if not all(lst):
                logging.warning(f'{response.url}: get data failed')
                continue
            url = base_url + link
            unique_id = get_md5(url)
            if self.mongo_col.find_one({
                    '$and': [{
                        'unique_id': unique_id
                    }, {
                        'origin': f'{self.name}'
                    }]
            }):
                logging.warning(
                    f'{url} is download already, unique_id: {unique_id}')
                continue

            if len(date) == 10:
                now = time.strftime('%Y-%m-%d %H:%M:%S',
                                    time.localtime(time.time()))
                date += ' ' + now.split(' ')[-1]

            item['url'] = url
            item['unique_id'] = unique_id
            item['title'] = title
            item['source'] = '四川省人民政府办公厅'
            item['origin'] = self.name
            item['date'] = date
            item['type'] = 'web'
            item['location'] = '四川省'
            item['crawled'] = 1
            yield scrapy.FormRequest(url,
                                     method='GET',
                                     headers=self.headers,
                                     meta={'item': item},
                                     callback=self.parse_item)

    def parse_item(self, response):
        item = response.meta['item']
        selector = etree.HTML(response.body)
        regex = r'//div[@class="deta_ct"]'
        content = response.xpath(regex).xpath('string(.)').extract_first(
            default='').strip()
        # if content == '':
        #     logging.warning(f'{item["url"]}: date or content is none')
        #     return
        summary = content[:100] if (content != '') else item['title']
        try:
            content = etree.tostring(selector.xpath(regex)[0],
                                     encoding='utf-8')
        except Exception as err:
            logging.error(f'{item["url"]}: get content failed')
            return
        item['summary'] = summary.replace('\n', '').replace('em{font-style:normal;}', '')
        item['content'] = content.decode('utf-8').replace('\n', '').replace('em{font-style:normal;}', '')
        yield item
Example #11
class CdsjxwSpider(scrapy.Spider):
    name = 'cdsjxw'
    download_delay = 5
    max_page = 5
    mongo_col = get_col(MONGODB_COLLECTION)
    mongo_col.create_index([("unique_id", pymongo.DESCENDING),
                            ('origin', pymongo.DESCENDING)],
                           unique=True)
    base_url = 'http://www.cdgy.gov.cn'
    headers = {
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding':
        'gzip, deflate',
        'Accept-Language':
        'zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7',
        'Cache-Control':
        'no-cache',
        'Connection':
        'keep-alive',
        'Host':
        'www.cdgy.gov.cn',
        'Pragma':
        'no-cache',
        'Upgrade-Insecure-Requests':
        '1',
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36',
    }

    custom_settings = {
        'COOKIES_ENABLED': True,
        'LOG_FILE': f'logs/{name}.log',
        'ITEM_PIPELINES': {
            'gov_info.pipelines.GovInfoPipeline': 100,
        },
    }

    def start_requests(self):
        url = 'http://www.cdgy.gov.cn/cdsjxw/c132946/zwxx.shtml'
        yield scrapy.FormRequest(url, method='GET', headers=self.headers)

    def parse(self, response):
        page_count = re.findall(
            r'createPageHTML\(\'page_div\',(\d+),.*?\)|$'.encode('utf-8'),
            response.body)[0]
        if page_count == b'':
            raise Exception('get page count failed')

        referer = response.url
        page_count = min(int(page_count), self.max_page)
        base_url = 'http://www.cdgy.gov.cn/cdsjxw/c132946/zwxx_{}.shtml'
        for i in range(2, int(page_count) + 1):
            headers = copy.deepcopy(self.headers)
            headers.update({'referer': referer})
            url = base_url.format(i)
            referer = url
            yield scrapy.FormRequest(url,
                                     method='GET',
                                     headers=headers,
                                     callback=self.parse_page)

        for request in self.parse_page(response):
            yield request

    def parse_page(self, response):
        regex = '//div[@class="newlist_left_cont"]/ul'
        for sel in response.xpath(regex):
            link = sel.xpath(r'li[1]/a/@href').extract_first(
                default='').strip()
            text = sel.xpath(r'li[2]/text()').extract_first(default='').strip()
            title = sel.xpath(r'li[1]/a/@title').extract_first(
                default='').strip()
            lst = [link, title, text]
            if not all(lst):
                logging.warning(f'{response.url}.{link}: get data failed')
                continue
            if 'http' not in link:
                url = self.base_url + link
            else:
                url = link
            if 'cdgy.gov.cn' not in url:
                logging.warning(f'{url} is out the domain')
                continue
            unique_id = get_md5(url)
            if self.mongo_col.find_one({
                    '$and': [{
                        'unique_id': unique_id
                    }, {
                        'origin': f'{self.name}'
                    }]
            }):
                logging.warning(
                    f'{url} is download already, unique_id: {unique_id}')
                continue
            text = ''.join(text.split())
            text = re.sub(r'\s|:|:', '', text)
            promulgator, date = re.findall(r'来源(.*?)发布时间(\d{4}-\d{2}-\d{2})',
                                           text)[0]
            if len(date) == 10:
                now = time.strftime('%Y-%m-%d %H:%M:%S',
                                    time.localtime(time.time()))
                date += ' ' + now.split(' ')[-1]
            item = GovInfoItem()
            item['url'] = url
            item['unique_id'] = unique_id
            item['title'] = title.strip()
            item['source'] = promulgator
            item['date'] = date
            item['origin'] = self.name
            item['type'] = 'web'
            item['location'] = '成都市'
            item['crawled'] = 1
            yield scrapy.FormRequest(url,
                                     method='GET',
                                     headers=self.headers,
                                     meta={'item': item},
                                     callback=self.parse_item)

    def parse_item(self, response):
        selector = etree.HTML(response.body)
        item = response.meta['item']
        regexs = [r'//div[@id="top"]', r'//div[@class="main-show-txt"]']
        for regex in regexs:
            content = response.xpath(regex).xpath('string(.)').extract_first(
                default='').strip()
            item['summary'] = content[:100] if (
                content != '') else item['title']
            try:
                content = etree.tostring(selector.xpath(regex)[0],
                                         encoding='utf-8')
            except Exception as err:
                continue
            else:
                break
        else:
            logging.error(f'{item["url"]}: get content failed')
            return
        item['content'] = content.decode('utf-8').replace('\n', '')
        yield item
Example #12
class CdkjfwSpider(scrapy.Spider):
    name = 'cdkjfw'
    download_delay = 5
    max_page = 5
    mongo_col = get_col(MONGODB_COLLECTION)
    mongo_col.create_index([("unique_id", pymongo.DESCENDING),
                            ('origin', pymongo.DESCENDING)],
                           unique=True)
    start_urls = [
        'http://www.cdkjfw.com/list/dynamic.html?Id=@bGMZdxP2kmJSJ/qulgXZw==',
        'http://www.cdkjfw.com/list/dynamic.html?Id=0eBSLw868g7MG9PJmfLY0w=='
    ]
    headers = {
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding':
        'gzip, deflate',
        'Accept-Language':
        'zh-CN,zh;q=0.9,en;q=0.8,und;q=0.7',
        'Cache-Control':
        'no-cache',
        'Host':
        'www.cdkjfw.com',
        'Pragma':
        'no-cache',
        'Proxy-Connection':
        'keep-alive',
        'Referer':
        'http://www.cdkjfw.com/list/dynamic.html?Id=@bGMZdxP2kmJSJ/qulgXZw==&page=2',
        'Upgrade-Insecure-Requests':
        '1',
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
    }

    custom_settings = {
        'LOG_FILE': f'logs/{name}.log',
        'ITEM_PIPELINES': {
            'gov_info.pipelines.GovInfoPipeline': 100,
        },
    }

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url,
                                 method='GET',
                                 headers=self.headers,
                                 meta={'url': url})

    def parse(self, response):
        page_count = max([
            int(sel.xpath('text()').extract_first(default=''))
            for sel in response.xpath(r'//div[@id="Pages"]/a')
            if sel.xpath('text()').extract_first(default='').isdigit()
        ])
        page_count = min([page_count, self.max_page])
        base_url = response.meta['url']
        for page in range(2, page_count + 1):
            url = base_url + f'&page={page}'
            yield scrapy.FormRequest(url,
                                     method='GET',
                                     headers=self.headers,
                                     callback=self.parse_page)

        for request in self.parse_page(response):
            yield request

    def parse_page(self, response):
        base_url = 'http://www.cdkjfw.com'
        regex = '//ul[@class="point-list"]/li'
        for sel in response.xpath(regex):
            link = sel.xpath(r'div/a/@href').extract_first(default='').strip()
            title = sel.xpath(r'div/a/text()').extract_first(
                default='').strip()
            date = sel.xpath(r'div/span/text()').extract_first(
                default='').strip()
            lst = [link, title, date]
            if not all(lst):
                logging.warning(f'{response.url}--{link}: get data failed')
                continue
            url = base_url + link
            unique_id = get_md5(url)
            if self.mongo_col.find_one({
                    '$and': [{
                        'unique_id': unique_id
                    }, {
                        'origin': f'{self.name}'
                    }]
            }):
                logging.warning(
                    f'{url} is download already, unique_id: {unique_id}')
                continue
            date = date.strip('[').strip(']')
            if len(date) == 10:
                now = time.strftime('%Y-%m-%d %H:%M:%S',
                                    time.localtime(time.time()))
                date += ' ' + now.split(' ')[-1]
            item = GovInfoItem()
            item['url'] = url
            item['unique_id'] = unique_id
            item['title'] = title
            item['source'] = '成都生产力促进中心'
            item['date'] = date
            item['origin'] = self.name
            item['type'] = 'web'
            item['location'] = '成都市'
            item['crawled'] = 1
            yield scrapy.FormRequest(url,
                                     method='GET',
                                     headers=self.headers,
                                     meta={'item': item},
                                     callback=self.parse_item)

    def parse_item(self, response):
        selector = etree.HTML(response.body)
        item = response.meta['item']
        regex = r'//div[@class="artical_content"]'
        content = response.xpath(regex).xpath('string(.)').extract_first(
            default='').strip()
        # if content == '':
        #     logging.warning(f'{item["url"]}: content is none')
        #     return
        item['summary'] = content[:100] if (content != '') else item['title']
        try:
            content = etree.tostring(selector.xpath(regex)[0],
                                     encoding='utf-8')
        except Exception as err:
            logging.error(f'{item["url"]}: get content failed')
            return
        item['content'] = content.decode('utf-8').replace('\n', '')
        yield item
Example #13
class InnocomSpider(scrapy.Spider):
    name = 'innocom'
    download_delay = 5
    max_page = 5
    mongo_col = get_col(MONGODB_COLLECTION)
    mongo_col.create_index([("unique_id", pymongo.DESCENDING), ('origin', pymongo.DESCENDING)], unique=True)
    start_urls = [
        'http://www.innocom.gov.cn/gxjsqyrdw/xxtg/list.shtml',
    ]
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,und;q=0.7',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Host': 'www.innocom.gov.cn',
        'Pragma': 'no-cache',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
    }

    custom_settings = {
        'LOG_FILE': f'logs/{name}.log',
        'ITEM_PIPELINES': {
            'gov_info.pipelines.GovInfoPipeline': 100,
        },
    }

    def parse(self, response):
        base_url = 'http://www.innocom.gov.cn/gxjsqyrdw/xxtg/list_{}.shtml'
        page_count = re.findall(r'createPageHTML\(\'page_div\',(\d+),.*?\)|$'.encode('utf-8'), response.body)[0]
        if page_count in [b'', '']:
            raise Exception('get page count failed')
        page_count = min(int(page_count), self.max_page)
        for page in range(2, page_count+1):
            url = base_url.format(page)
            yield scrapy.FormRequest(url, method='GET', headers=self.headers, callback=self.parse_page)

        for request in self.parse_page(response):
            yield request

    def parse_page(self, response):
        base_url = 'http://www.innocom.gov.cn'
        regex = '//div[@class="list7"]/ul/li'
        for sel in response.xpath(regex):
            link = sel.xpath(r'a/@href').extract_first(default='').strip()
            title = sel.xpath(r'a/text()').extract_first(default='').strip()
            date = sel.xpath(r'span/text()').extract_first(default='').strip()
            lst = [link, title, date]
            if not all(lst):
                logging.warning(f'{response.url}--{link}: get data failed')
                continue
            url = base_url + link
            unique_id = get_md5(url)
            if self.mongo_col.find_one({'$and': [{'unique_id': unique_id}, {'origin': f'{self.name}'}]}):
                logging.warning(f'{url} is download already, unique_id: {unique_id}')
                continue
            date = date.strip('[').strip(']')
            if len(date) == 10:
                now = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
                date += ' ' + now.split(' ')[-1]
            item = GovInfoItem()
            item['url'] = url
            item['unique_id'] = unique_id
            item['title'] = title
            item['source'] = '科技部火炬高技术产业开发中心'
            item['date'] = date
            item['origin'] = self.name
            item['type'] = 'web'
            item['location'] = '高新区'
            item['crawled'] = 1
            yield scrapy.FormRequest(url, method='GET', headers=self.headers,
                                     meta={'item': item}, callback=self.parse_item)

    def parse_item(self, response):
        selector = etree.HTML(response.body)
        item = response.meta['item']
        regex = r'//div[@id="content"]'
        content = response.xpath(regex).xpath('string(.)').extract_first(default='').strip()
        # if content == '':
        #     logging.warning(f'{item["url"]}: content is none')
        #     return
        item['summary'] = content[:100] if (content != '') else item['title']
        try:
            content = etree.tostring(selector.xpath(regex)[0], encoding='utf-8')
        except Exception as err:
            logging.error(f'{item["url"]}: get content failed')
            return
        item['content'] = content.decode('utf-8').replace('\n', '')
        yield item
Example #14
class CdhrsipSpider(scrapy.Spider):
    name = 'cdhrsip'
    download_delay = 5
    max_page = 5
    mongo_col = get_col(MONGODB_COLLECTION)
    mongo_col.create_index([("unique_id", pymongo.DESCENDING),
                            ('origin', pymongo.DESCENDING)],
                           unique=True)
    type_lst = ['001', '101']
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,und;q=0.7',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Content-Length': '0',
        'Host': 'www.cdhrsip.com',
        'Origin': 'http://www.cdhrsip.com',
        'Pragma': 'no-cache',
        'Referer': 'http://www.cdhrsip.com/',
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
    }
    base_params = {
        'notifi': '1',
        'pageNo': '5',
        'pageSize': '10',
        'orderPublish': '1'
    }

    custom_settings = {
        'LOG_FILE': f'logs/{name}.log',
        'ITEM_PIPELINES': {
            'gov_info.pipelines.GovInfoPipeline': 100,
        },
    }

    def start_requests(self):
        for type_ in self.type_lst:
            for request in self.create_request(type_):
                yield request

    def parse(self, response):
        base_url = 'http://www.cdhrsip.com/article/newsInfo?id={}'
        json_data = json.loads(response.body)
        records = json_data['records']
        for record in records:
            url = base_url.format(record['id'])
            unique_id = get_md5(str(record['id']))
            if self.mongo_col.find_one({
                    '$and': [{
                        'unique_id': unique_id
                    }, {
                        'origin': f'{self.name}'
                    }]
            }):
                logging.warning(
                    f'{url} is download already, unique_id: {unique_id}')
                continue
            selector = etree.HTML(record['content'])
            summary = selector.xpath('string(.)').strip()[:100]
            summary = summary if summary not in (b'', '') else record['title']

            date = record['publishTime']
            if len(date) == 10:
                now = time.strftime('%Y-%m-%d %H:%M:%S',
                                    time.localtime(time.time()))
                date += ' ' + now.split(' ')[-1]
            item = GovInfoItem()
            item['url'] = url
            item['unique_id'] = unique_id
            item['title'] = record['title']
            item['summary'] = summary
            item['source'] = record['author']
            item['date'] = date
            item['origin'] = self.name
            item['type'] = 'web'
            item['location'] = '成都市'
            item['crawled'] = 1
            item['content'] = record['content']
            yield item

    def create_request(self, type_):
        params = copy.deepcopy(self.base_params)
        params.update({'type': type_})
        url = 'http://www.cdhrsip.com/article/list'
        for page in range(1, self.max_page + 1):
            params.update({'pageNo': str(page)})
            yield scrapy.FormRequest(url,
                                     method='GET',
                                     formdata=params,
                                     headers=self.headers)
Example #15
class WxgzhTaskSpider(scrapy.Spider):
    name = 'wxgzh_task'
    download_delay = 5
    days = 5
    task_col = get_col('wxgzh_task')
    task_col.create_index([("unique_id", pymongo.DESCENDING), ('origin', pymongo.DESCENDING)], unique=True)
    mongo_col = get_col(MONGODB_COLLECTION)
    mongo_col.create_index([("unique_id", pymongo.DESCENDING), ('origin', pymongo.DESCENDING)], unique=True)
    redis_con = get_redis_client()
    redis_key = 'wxgzh'
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Host': 'weixin.sogou.com',
        'Pragma': 'no-cache',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36',
    }

    custom_settings = {
        'LOG_FILE': f'logs/{name}.log',
        'ITEM_PIPELINES': {
            'gov_info.pipelines.WxgzhTaskPipeline': 100,
        },
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 110,
            'gov_info.middlewares.WxgzhTaskRotateProxiesSpiderMiddleware': 100,
        },
        # # redis
        # 'SCHEDULER': "scrapy_redis.scheduler.Scheduler",
        # 'DUPEFILTER_CLASS': "gov_info.common.utils.MyRFPDupeFilter",
        # 'SCHEDULER_PERSIST': True,
        # 'REDIS_START_URLS_AS_SET': True
    }

    def start_requests(self):
        url = 'http://weixin.sogou.com/weixin'
        self.task_col.update({'crawled': {'$ne': 1}}, {'$set': {'crawled': 0}}, multi=True)
        for wx_info in WXINFOS:
            self.create_task(wx_info)
        while True:
            task = self.task_col.find_one_and_update({'crawled': 0}, {'$set': {'crawled': 2}})
            if task is None:
                break
            params = task['params']
            self.headers.update({'Referer': task['referer']})
            yield scrapy.FormRequest(url, method='GET', formdata=params, headers=self.headers, meta={'task': task})

    def parse(self, response):
        result = 1
        url = response.url
        task = response.meta['task']
        params = task['params']
        origin = task['origin']
        total = re.findall(r'找到约(\d+)条结果|$'.encode('utf-8'), response.body)[0]
        if total != b'' and int(total) > 10:
            result = -1
            logging.error(f'{url}: {params} returned more than one page of results')
        self.task_col.update({'_id': task['_id']}, {"$set": {'crawled': result}})

        redis_values = []
        for sel in response.xpath(r'//li[contains(@id, "sogou_vr_")]'):
            item = GovInfoItem()
            unique_id = sel.xpath(r'./@d').extract_first(default='').strip()
            date = sel.xpath(r'div/div/@t').extract_first(default='').strip()
            source = sel.xpath(r'div/div/a/text()').extract_first(default='').strip()
            link = sel.xpath(r'div/h3/a/@href').extract_first(default='').strip()
            name = sel.xpath(r'div/div/a/text()').extract_first(default='').strip()
            lst = [unique_id, date, source, link]
            if not all(lst):
                result = -1
                logging.warning(f'{url}: {params}.{link}: get data failed')
                self.task_col.update({'_id': task['_id']}, {"$set": {'crawled': result}})
                continue
            unique_id = get_md5(unique_id)
            if self.mongo_col.find_one({'$and': [{'unique_id': unique_id}, {'origin': f'{origin}'}]}):
                logging.warning(f'{url} is download already, unique_id: {unique_id}')
                continue
            if task['name'] != name:
                logging.warning(f'{url}: {params}.{link}: was not published by {task["name"]}')
                continue
            url = link.replace('http', 'https')
            item['url'] = url
            item['task_unique_id'] = task['unique_id']
            item['unique_id'] = unique_id
            try:
                item['summary'] = sel.xpath(r'./div/p[@class="txt-info"]')[0].xpath(
                    'string(.)').extract_first('').strip()
            except Exception as err:
                logging.warning(f'{url}: {params}.{link}: get summary failed')
                item['summary'] = ''
            item['date'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(int(date)))
            item['source'] = source
            item['origin'] = origin
            item['type'] = 'wxgzh'
            item['crawled'] = 0
            item['location'] = task['location']
            redis_values.append(json.dumps({'item': dict(item)}))
        if redis_values:
            self.redis_con.sadd("{}".format(self.redis_key), *redis_values)

    def create_task(self, wx_info):
        url = 'http://weixin.sogou.com/weixin'
        params = {
            'type': '2',
            'ie': 'utf8',
            'query': wx_info.name,
            'tsn': '5',
            'interation': '',
            'wxid': wx_info.wx_id,
            'usip': wx_info.name,
        }
        date = datetime.datetime.now()
        days = self.days
        while days >= 0:
            t = date.strftime("%Y-%m-%d")
            unique_id = f'{wx_info.origin}-{t}'
            data = {
                'url': url,
                'name': wx_info.name,
                'unique_id': unique_id,
                'origin': wx_info.origin,
                'params': params,
                'referer': wx_info.referer,
                'crawled': 0,
                'location': wx_info.location,
            }
            date -= datetime.timedelta(days=1)
            days -= 1
            params.update({'ft': t, 'et': t})
            try:
                self.task_col.insert(data)
            except DuplicateKeyError:
                # logging.warning(f'task unique_id {wx_info.origin}-{t} already exists')
                if datetime.datetime.now().strftime("%Y-%m-%d") == t:
                    self.task_col.update({'unique_id': unique_id}, {'$set': {'crawled': 0}})
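The WXINFOS sequence iterated in start_requests is defined elsewhere; a hedged sketch of its shape, inferred from the fields accessed above (name, wx_id, origin, referer, location), with placeholder values:

import collections

WxInfo = collections.namedtuple('WxInfo', 'name wx_id origin referer location')
WXINFOS = [
    WxInfo(name='example official account', wx_id='example_wx_id',
           origin='example_origin', referer='http://weixin.sogou.com/',
           location='成都市'),
]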
Example #16
class WxgzhSpider(RedisSpider):
    name = 'wxgzh'
    download_delay = 5
    handle_httpstatus_list = [403, 408, 564, 503]
    task_col = get_col('wxgzh_task')
    mongo_col = get_col(MONGODB_COLLECTION)
    redis_con = get_redis_client()
    redis_key = 'wxgzh'
    headers = {
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding':
        'gzip, deflate',
        'Accept-Language':
        'zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7',
        'Cache-Control':
        'no-cache',
        'Connection':
        'keep-alive',
        'Host':
        'mp.weixin.qq.com',
        'Pragma':
        'no-cache',
        'Upgrade-Insecure-Requests':
        '1',
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36',
    }

    custom_settings = {
        'DOWNLOAD_TIMEOUT': 60,
        'LOG_FILE': f'logs/{name}.log',
        'ITEM_PIPELINES': {
            'gov_info.pipelines.GovInfoPipeline': 100,
            # 'scrapy_redis.pipelines.RedisPipeline': 200,
        },
        'DOWNLOADER_MIDDLEWARES': {
            'gov_info.middlewares.RotateUserAgentMiddleware': 400,
            'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 110,
            'gov_info.middlewares.WxgzhSpiderMiddleware': 100,
        },
        # redis
        'SCHEDULER': "scrapy_redis.scheduler.Scheduler",
        'DUPEFILTER_CLASS': "gov_info.common.utils.MyRFPDupeFilter",
        'SCHEDULER_PERSIST': True,
        'REDIS_START_URLS_AS_SET': True
    }

    # def start_requests(self):
    #     self.mongo_col.update({'$and': [{'type': 'wxgzh'}, {'crawled': {'$ne': 1}}]},
    #                           {'$set': {'crawled': 0}}, multi=True)
    #     while True:
    #         task = self.mongo_col.find_one_and_update({'crawled': 0}, {'$set': {'crawled': 2}})
    #         if task is None:
    #             time.sleep(5)
    #             continue
    #         yield scrapy.FormRequest(task['url'], method='GET', headers=self.headers, meta={'task': task})

    def parse(self, response):
        json_data = response.meta['json_data']
        item = GovInfoItem(json_data['item'])
        selector = etree.HTML(response.body)
        regex = r'//*[@id="js_content"]'
        title = response.xpath(r'//*[@id="activity-name"]/text()'
                               ).extract_first(default='').strip()
        content = response.xpath(regex).xpath('string(.)').extract_first(
            '').strip()
        if (content == '') and (title == ''):
            logging.warning(f'{item["url"]}: title and content is none')
            self.task_col.update({'unique_id': item['task_unique_id']},
                                 {"$set": {
                                     'crawled': -1
                                 }})
            return
        if item['summary'] == '':
            item['summary'] = content[:100] if (content != '') else title
        try:
            content = etree.tostring(selector.xpath(regex)[0],
                                     encoding='utf-8')
        except Exception as err:
            logging.error(f'{item["url"]}: get content failed')
            return
        item['content'] = content.decode('utf-8').replace('\n', '')
        item['title'] = title.strip()
        item['crawled'] = 1
        yield item
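WxgzhSpider consumes the 'wxgzh' Redis set that WxgzhTaskSpider (Example #15) fills, and its parse expects response.meta['json_data']. The override that bridges the two is not shown in the source; a hedged sketch of what a make_request_from_data method on this spider might look like:

    def make_request_from_data(self, data):
        # data is one JSON blob pushed by WxgzhTaskSpider via redis_con.sadd.
        json_data = json.loads(data)
        return scrapy.FormRequest(json_data['item']['url'],
                                  method='GET',
                                  headers=self.headers,
                                  meta={'json_data': json_data})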