Beispiel #1
0
    def is_crawled(self, url_content):
        """Add the fingerprint of ``url_content`` to the ``'keys'`` set.

        The fingerprint is whatever ``md5_encode`` returns for the given
        content.

        Returns:
            ``-1`` when ``self.conn`` is falsy (no usable connection);
            otherwise the value returned by ``self.conn.sadd``.
            NOTE(review): if ``conn`` is a Redis client, that value is
            presumably the number of members newly added (so ``0`` would
            mean "already seen") -- confirm against the caller.
        """
        if self.conn:
            fingerprint = md5_encode(url_content)
            return self.conn.sadd('keys', fingerprint)
        return -1
Beispiel #2
0
    def _real_parse_item(self, response):
        """Save the page's HTML to disk and schedule its linked resources.

        The response body is written to
        ``DATA_DIR + <domain key> + '/' + <html file name>``, where the
        domain key is ``md5_encode`` applied to the third ``/``-separated
        piece of the URL and the file name comes from ``make_file_name``.
        Every ``<img src>`` plus every ``<a href>`` containing ``.pdf`` or
        ``.xls`` is then followed with ``save_files`` as the callback and
        the item handed over through ``cb_kwargs``.

        Args:
            response: The Scrapy response for the crawled page.

        Yields:
            One follow-up request per discovered resource URL, or the item
            itself when the page references no such resources.
        """
        item = GovItem(domain_collection=None,
                       html=None,
                       pdf=[],
                       xls=[],
                       images=[],
                       others=[])

        # 1. Save the HTML.
        filename = make_file_name(response.url, 'html')
        item['html'] = filename

        # For an absolute URL ("scheme://host/...") index 2 is the host part.
        domain = response.url.split('/')[2]
        item['domain_collection'] = md5_encode(domain)
        abpath = DATA_DIR + item['domain_collection']

        # exist_ok=True replaces the exists()/makedirs() pair, which could
        # raise if the directory appeared between the check and the create.
        os.makedirs(abpath, exist_ok=True)

        with open(abpath + '/' + filename, 'wb') as f:
            f.write(response.body)

        # 2. Save the other resources referenced by the page.
        images = response.selector.xpath('//img/@src').extract()
        pdf = response.selector.xpath(
            '//a/@href[contains(.,".pdf")]').extract()
        xls = response.selector.xpath(
            '//a/@href[contains(.,".xls")]').extract()
        urls = images + pdf + xls

        if not urls:
            # The item is otherwise only emitted by save_files; without this
            # a page that links no resources would never produce an item
            # even though its HTML has been saved.
            yield item
            return

        for url in urls:
            yield response.follow(url,
                                  callback=self.save_files,
                                  cb_kwargs=dict(item=item))
Beispiel #3
0
    def save_files(self, response, item):
        """Write a downloaded resource to disk and record it on ``item``.

        The file is stored under ``DATA_DIR + item['domain_collection']``
        and named ``md5_encode(response.url)`` followed by the extension of
        the URL's path component (empty when the path has none).

        Args:
            response: The Scrapy response carrying the resource body.
            item: The item whose ``pdf`` / ``xls`` / ``images`` / ``others``
                list receives the stored file name.

        Returns:
            The same ``item``, after the file name has been appended.
            NOTE(review): the parse callback passes one shared item to every
            resource request, so this presumably returns that item once per
            resource -- confirm downstream pipelines tolerate repeats.
        """
        # Function-scope imports keep the method self-contained.
        import posixpath
        from urllib.parse import urlparse

        abpath = DATA_DIR + item['domain_collection']

        # Take the extension from the URL *path* only. Splitting the whole
        # URL on '.' picked up query strings ("png?v=1") and, for paths
        # without an extension, host/path pieces containing '/', which made
        # the open() below fail.
        extension = posixpath.splitext(urlparse(response.url).path)[1]
        filename = md5_encode(response.url) + extension

        with open(abpath + '/' + filename, 'wb') as f:
            f.write(response.body)
        self.logger.info('Files downloading...%s', filename)

        if filename.endswith('.pdf'):
            bucket = 'pdf'
        elif filename.endswith('.xls'):
            bucket = 'xls'
        elif filename.endswith(('png', 'jpg')):
            bucket = 'images'
        else:
            bucket = 'others'
        item[bucket].append(filename)
        return item