Code Example #1
import time

import requests
from parsel import Selector  # scrapy.selector.Selector offers the same API

# FILTER, Seed, push_seed and writeLog are project-level helpers of MSpiders
# (a sketch of plausible stand-ins follows Code Example #1).


def job_gdeegd():

    # Article list pages
    urls = []
    for i in range(1, 64):
        if i == 1:
            url = "http://gdee.gd.gov.cn/ggtz3126/index.html"
        else:
            url = "http://gdee.gd.gov.cn/ggtz3126/index_%s.html" % i
        urls.append(url)

    # Article detail pages
    for url in urls:
        t1 = time.time()
        headers = {
            'Connection': 'keep-alive',
            'Pragma': 'no-cache',
            'Cache-Control': 'no-cache',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Referer': 'http://gdee.gd.gov.cn/ggtz3126/index_3.html',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cookie': 'm_bt=yes; openstack_cookie_insert=62355311; _gscu_1815356153=89127015q6kzl720; _gscbrs_1815356153=1; UM_distinctid=171ff59ebaa814-0ab1f7a365db41-d373666-1fa400-171ff59ebab197; CNZZDATA3588456=cnzz_eid%3D214537553-1589123201-http%253A%252F%252Ftest.gzjirui.com%252F%26ntime%3D1589123201; _gscs_1815356153=89127015ev2u6d20|pv:2'
        }
        response = requests.get(url, headers=headers)
        selector = Selector(text=response.text)
        detail_urls = selector.xpath(
            "/html/body/div/div[3]/div[2]/div/div[2]/ul/li[3]/div/a/@href"
        ).extract()
        for u in detail_urls:
            if not FILTER.isContains(u):  # not crawled yet
                FILTER.insert(u)  # mark as crawled
                seed = Seed(url=u, downloader='gdeegd.crawl0')
                push_seed(seed)
        writeLog('Finished adding the seeds of gdeegd (Used: %s)' %
                 (time.time() - t1))
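
All four job functions depend on MSpiders-level helpers (`FILTER`, `Seed`, `push_seed`, `writeLog`) that these snippets do not show. Below is a minimal sketch of plausible stand-ins, assuming an in-memory dedup set and queue; the real project presumably backs these with Redis or a similar store, so treat every name here as hypothetical.

```python
# Hypothetical stand-ins for the MSpiders helpers used above; the real
# project wires these to its own storage and scheduler.
import json
import time


class UrlFilter:
    """Dedup filter: remembers which URLs have already been queued."""

    def __init__(self):
        self._seen = set()  # the real project likely uses Redis or a Bloom filter

    def isContains(self, url):
        return url in self._seen

    def insert(self, url):
        self._seen.add(url)


class Seed:
    """A crawl task: the URL plus the downloader rule that should fetch it."""

    def __init__(self, url, downloader):
        self.url = url
        self.downloader = downloader


SEED_QUEUE = []  # stand-in for the shared seed queue


def push_seed(seed):
    SEED_QUEUE.append(json.dumps({'url': seed.url,
                                  'downloader': seed.downloader}))


def writeLog(msg):
    print('[%s] %s' % (time.strftime('%Y-%m-%d %H:%M:%S'), msg))


FILTER = UrlFilter()
```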
Code Example #2
def job_scjgjjs():

    # Article list pages
    urls = []
    for i in range(1, 21):
        if i == 1:
            url = "http://scjgj.jiangsu.gov.cn/col/col70311/index.html"
        else:
            url = ("http://scjgj.jiangsu.gov.cn/col/col70311/index.html"
                   "?uid=277431&pageNum=%s" % i)
        urls.append(url)

    # Article detail pages
    for url in urls:
        t1 = time.time()
        headers = {
            'Connection': 'keep-alive',
            'Pragma': 'no-cache',
            'Cache-Control': 'no-cache',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Referer': 'http://test.gzjirui.com/magicflu/html/form/records2.jsp?spaceId=02393294-327d-43ed-835e-d8fe778772a8&formId=-1',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cookie': '__jsluid_h=8011b3a4cb561d1de121a1fa390ab4df; _gscu_1226760719=8861650310idmp17; _gscbrs_1226760719=1; yunsuo_session_verify=75a060942bec9e14902b3b5453719ad1; _gscs_1226760719=t89123468mg3q2f70|pv:3'
        }
        response = requests.get(url, headers=headers)
        selector = Selector(text=response.text)
        detail_urls = selector.xpath(
            '//*[@id="277431"]/div/li[1]/a/@href').extract()
        for u in detail_urls:
            if not FILTER.isContains(u):  # not crawled yet
                FILTER.insert(u)  # mark as crawled
                seed = Seed(url=u, downloader='gdstc.crawl0')
                push_seed(seed)
        writeLog('Finished adding the seeds of scjgjjs (Used: %s)' %
                 (time.time() - t1))
Code Example #3
def job_comgdgov():

    # Article list pages
    urls = []
    for i in range(1, 16):
        if i == 1:
            url = "http://com.gd.gov.cn/zwgk/gggs/index.html"
        else:
            url = "http://com.gd.gov.cn/zwgk/gggs/index_%s.html" % i
        urls.append(url)

    # Article detail pages
    for url in urls:
        t1 = time.time()
        headers = {
            'Connection': 'keep-alive',
            'Pragma': 'no-cache',
            'Cache-Control': 'no-cache',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Referer': 'http://com.gd.gov.cn/zwgk/gggs/index_16.html',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cookie': 'UM_distinctid=171ff59ebaa814-0ab1f7a365db41-d373666-1fa400-171ff59ebab197; openstack_cookie_insert=81202878'
        }
        response = requests.get(url, headers=headers)
        selector = Selector(text=response.text)
        detail_urls = selector.xpath(
            "/html/body/div[2]/div/div[2]/ul/li[4]/a/@href").extract()
        for u in detail_urls:
            if not FILTER.isContains(u):  # not crawled yet
                FILTER.insert(u)  # mark as crawled
                seed = Seed(url=u, downloader='comgdgov.crawl0')
                push_seed(seed)
        writeLog('Finished adding the seeds of comgdgov (Used: %s)' %
                 (time.time() - t1))
Code Example #4
def job_gdstc():

    # Article list pages
    urls = []
    for i in range(1, 21):
        if i == 1:
            url = "http://gdstc.gd.gov.cn/zwgk_n/tzgg/index.html"
        else:
            url = "http://gdstc.gd.gov.cn/zwgk_n/tzgg/index_%s.html" % i
        urls.append(url)

    # Article detail pages
    for url in urls:
        t1 = time.time()
        headers = {
            'Accept-Encoding': "gzip, deflate",
            'Accept-Language': "zh-CN,zh;q=0.9",
            'Upgrade-Insecure-Requests': "1",
            'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36",
            'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            'Referer': "http://gdstc.gd.gov.cn/zwgk_n/",
            'Cookie': "zh_choose=s; zh_choose=s; openstack_cookie_insert=76667651",
            'Connection': "keep-alive"
        }
        response = requests.get(url, headers=headers)
        selector = Selector(text=response.text)
        detail_urls = selector.xpath(
            "/html/body/div[2]/div[2]/div[2]/ul/li/a/@href").extract()
        for u in detail_urls:
            if not FILTER.isContains(u):  # not crawled yet
                FILTER.insert(u)  # mark as crawled
                seed = Seed(url=u, downloader='gdstc.crawl0')
                push_seed(seed)
        writeLog('Finished adding the seeds of gdstc (Used: %s)' %
                 (time.time() - t1))
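
The four jobs above repeat the same pattern: build a paginated list of index URLs, fetch each page, extract detail links, and enqueue only the unseen ones. A hedged refactor sketch of that shared pattern follows; the helper names are hypothetical and not part of MSpiders.

```python
# Hypothetical helpers factoring out the pattern shared by the four jobs.


def build_list_urls(first_url, page_pattern, last_page):
    """Page 1 has its own URL; pages 2..last_page follow `page_pattern % i`."""
    return [first_url if i == 1 else page_pattern % i
            for i in range(1, last_page + 1)]


def enqueue_unseen(detail_urls, downloader):
    """Push only the detail URLs the dedup filter has not seen yet."""
    for u in detail_urls:
        if not FILTER.isContains(u):
            FILTER.insert(u)
            push_seed(Seed(url=u, downloader=downloader))


# Example wiring for job_gdstc:
# urls = build_list_urls("http://gdstc.gd.gov.cn/zwgk_n/tzgg/index.html",
#                        "http://gdstc.gd.gov.cn/zwgk_n/tzgg/index_%s.html", 20)
```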
Code Example #5
File: parsers.py Project: officeset/MSpiders
 def parse0(self, html_packed):
     """
     seed键:
         'url'(必选): 待爬URL。
         'spider'(必选): 调用哪个下载规则, 格式为'Spider_amrgd.crawl1' 或 'Spider_amrgd'(默认使用crawl0方法)。
     field键:
         'pipeline_dbType'(必选): 数据库类型, 值为: mongo/es/redis/hbase/ssdb/mysql。
         'pipeline_keyName': 如果将数据存于Redis, 可带上此字段。如果没有带上则使用各爬虫settings.py中的keyName, 如果没有设置则默认使用'default'。
         'pipeline_collection': 如果将数据存于MongoDB, 可带上此字段。如果没有带上则使用各爬虫settings.py中的collection, 如果没有设置则默认使用'default'。
         'pipeline_index': 如果将数据存于ES, 可带上此字段, 如果没有带上则使用各爬虫settings.py中的index, 如果没有设置则默认使用'default'。
         'pipeline_doc_type': 如果将数据存于ES, 可带上此字段, 如果没有带上则使用各爬虫settings.py中的doc_type, 如果没有设置则默认使用'default'。
     """
     seeds = []
     fields = []
     try:
         selector = Selector(
             text=html_packed['html'].decode('utf-8', 'ignore'))
         data = {}
         url = html_packed['url']
         # spider display name: Guangdong Administration for Market
         # Regulation (Intellectual Property Office)
         data['spider'] = '广东省市场监督管理局(知识产权局)'
         data['url'] = url
         data['type'] = '1'
         data['title'] = selector.xpath(
             '//h1[@class="article_t"]/text()').extract()[0]
         data['source'] = selector.xpath(
             '/html/body/div[4]/div[2]/div/div[1]/span[1]/text()'
         ).extract()[0]
         # strip the "时间  :  " (time) label and keep the YYYY-MM-DD part
         data['pushTime'] = selector.xpath(
             '/html/body/div[4]/div[2]/div/div[1]/span[2]/text()'
         ).extract()[0].replace("时间  :  ", '')[0:10]
         page = selector.xpath('//div[@class="article_con"]').extract()[0]
         files = []
         for at in selector.xpath('//div[@class="article_con"]//a'):
             file_url = at.xpath('@href').extract()[0]
             file_name = at.xpath('text()').extract()[0]
             file_type = file_url[-3:]  # naive guess: last three characters
             file = {
                 'file_url': file_url,
                 'file_name': file_name,
                 'file_type': file_type
             }
             files.append(file)
         filters = FilterTag()
         content = filters.stripTagSimple(page)  # strip HTML tags, keep text
         data['content'] = content
         data['len'] = len(content)
         data['page'] = page
         data['fid'] = md5(url)  # document id derived from the URL
         data['files'] = files
         item = Item(data=data, dbType='ffwb')
         fields.append(item)
     except Exception as e:
         self.logger.error(str(e))
     return fields, seeds
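
Per the docstring, `parse0` consumes an `html_packed` dict carrying the raw page bytes under 'html' and the page URL under 'url', and returns a `(fields, seeds)` pair. A minimal, hypothetical driver is sketched below; the URL is a placeholder, `parser` stands for an instance of the class defining `parse0`, and the `.data` attribute on `Item` is assumed rather than confirmed by the source.

```python
import requests

# Hypothetical detail-page URL; any article page matching the XPaths works.
url = 'http://amr.gd.gov.cn/xxgk/tzgg/content/post_0000000.html'
response = requests.get(url, timeout=10)
html_packed = {'html': response.content, 'url': url}

# `parser` is an instance of the parser class shown above.
fields, seeds = parser.parse0(html_packed)
for item in fields:
    # assuming Item keeps the parsed dict on a .data attribute
    print(item.data['title'], item.data['pushTime'])
```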