# Module-level imports this code relies on (defined above this section):
# import copy, random, re, time
# import scrapy
# from urllib.parse import urljoin
# sudaMainItem is the project's item class (import path depends on the project layout).

def parse(self, response):
    # self.count = self.count + 1
    # print('This is page', self.count)
    print('Currently crawling page: ' + response.request.url.strip('*/'))
    print('Current pool size:', len(self.url_pool))
    titles = response.xpath('//a/@href').extract()
    basic_url = response.request.url.strip('*/')
    # Normalize each extracted href into an absolute URL
    for url in titles:
        # print(url)
        item = sudaMainItem()
        matchFullUrl = re.match(r'^(http|https)://([\w.]+/?)\S*', url, re.M | re.I)
        matchRelateUrl = re.match(r'^/([\w.]?/?)\S*', url, re.M | re.I)
        matchRelateUrl2 = re.match(r'^[^/]([\w.]?/?)\S*', url, re.M | re.I)
        if url:
            if matchFullUrl:
                true_url = url  # already absolute
                # print('original url', true_url)
            elif matchRelateUrl:
                true_url = basic_url + url  # root-relative: append directly
                # print('joined url 1', true_url)
            elif matchRelateUrl2:
                true_url = basic_url + '/' + url  # relative: insert a separator
                # print('joined url 2', true_url)
            else:
                true_url = url
                # print('unhandled and unmatched', true_url)
            if self.judge_suda(true_url):
                item['father'] = basic_url
                item['url'] = true_url
                self.url_pool.add(true_url)
                yield item
            # if true_url not in self.url_pool:
            #     item['distinct_url'] = true_url
            # else:
            #     item['distinct_url'] = 'duplicate'
            # yield item
    # url_list = self.getDistinctUrls()
    # print(url_list)
    # Iterate over a snapshot so the pool can keep growing while requests are scheduled
    url_pool_copy = copy.deepcopy(self.url_pool)
    # url_pool_copy = list(self.url_pool)
    for next_url in url_pool_copy:
        if 'http://' in next_url or 'https://' in next_url:
            yield scrapy.Request(next_url, self.parse, dont_filter=False)
        else:
            yield scrapy.Request('http://' + next_url, self.parse, dont_filter=False)
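# Note (illustrative): the three regex branches in parse() approximate what
# urllib.parse.urljoin does; parsePage() below switches to urljoin directly.
# The example values here are assumptions for illustration only:
#
#   urljoin('http://www.suda.edu.cn/news', 'http://example.com/a')
#       -> 'http://example.com/a'                 (absolute link kept as-is)
#   urljoin('http://www.suda.edu.cn/news', '/about')
#       -> 'http://www.suda.edu.cn/about'         (root-relative: joined to the host)
#   urljoin('http://www.suda.edu.cn/news', 'about.html')
#       -> 'http://www.suda.edu.cn/about.html'    (relative: replaces the last path segment)
#
# Unlike the manual string concatenation above, urljoin resolves relative paths
# against the base URL's path, so the two approaches can produce different results.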
def parsePage(self, response):
    # self.count = self.count + 1
    # print('This is page', self.count)
    print('Currently crawling page: ' + response.request.url.strip('*/'))
    # Crude politeness throttle; Scrapy's DOWNLOAD_DELAY / RANDOMIZE_DOWNLOAD_DELAY
    # settings are the more idiomatic way to achieve the same effect.
    randomdelay = random.randint(0, 4)
    time.sleep(randomdelay)
    print("### random delay: %s s ###" % randomdelay)
    # print('Current pool size:', len(self.url_pool))
    titles = response.xpath('//a/@href').extract()
    basic_url = response.request.url.strip('*/')
    # Normalize each extracted href into an absolute URL
    for url in titles:
        # print(url)
        item = sudaMainItem()
        matchFullUrl = re.match(r'^(http|https)://([\w.]+/?)\S*', url, re.M | re.I)
        # matchRelateUrl = re.match(r'^/([\w.]?/?)\S*', url, re.M | re.I)
        # matchRelateUrl2 = re.match(r'^[^/]([\w.]?/?)\S*', url, re.M | re.I)
        matchUselessUrl = re.match(r'^#([\w.]?/?)\S*', url, re.M | re.I)
        # matchParams = re.match(r'^\?([\w.]?/?)\S*', url, re.M | re.I)
        if url:
            if matchFullUrl:
                true_url = url  # already absolute
            elif matchUselessUrl:
                true_url = basic_url  # in-page anchor (#...): keep the page itself
            else:
                true_url = urljoin(basic_url, url)  # let urljoin resolve everything else
                # print('unhandled and unmatched', true_url)
            if self.judge_suda(true_url):
                item['father'] = basic_url
                item['url'] = true_url
                self.url_pool.add(true_url)
                yield item
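# judge_suda() is defined elsewhere in the spider and filters which URLs are
# kept. A minimal sketch of what it might look like, assuming it simply keeps
# links under the university's domain (the real implementation may differ):
#
#     from urllib.parse import urlparse
#
#     def judge_suda(self, url):
#         # Hypothetical: accept only hosts under suda.edu.cn.
#         return urlparse(url).netloc.endswith('suda.edu.cn')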