Example #1
    def parse_shop(self, response):
        """
        解析提取model数据
        :param response:
        """
        item = Item()
        item_loader = ItemLoader(item=item, selector=response)
        for field in self.model_xpath:
            item.fields[field] = Field()
            if 'model_url' in field:
                item_loader.add_value(field, response.url)
            else:
                item_loader.add_xpath(field, self.model_xpath[field])

        item = self.format_item(item_loader.load_item())

        yield item

        # Build the user review URLs
        user_url = response.xpath(self.user_url).extract()
        for uid in user_url:
            brand = uid.split('-')[0]
            model = uid.split('-')[1]
            yield Request(
                self.user_review_url.format(brand=brand, model=model),
                self.parse_comment)
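The pattern above registers fields at runtime by mutating item.fields before the loader touches them. A minimal, self-contained sketch of that technique, using a hypothetical inline HTML string and XPath map in place of the spider's self.model_xpath:

from scrapy import Selector
from scrapy.item import Item, Field
from scrapy.loader import ItemLoader

html = '<html><body><h1>Model X</h1><span class="price">9999</span></body></html>'
model_xpath = {'name': '//h1/text()', 'price': '//span[@class="price"]/text()'}

item = Item()
loader = ItemLoader(item=item, selector=Selector(text=html))
for field, xpath in model_xpath.items():
    item.fields[field] = Field()   # register the field before loading it
    loader.add_xpath(field, xpath)

print(loader.load_item())  # e.g. {'name': ['Model X'], 'price': ['9999']}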
Example #2
    def parse_forum(self, response):
        """
        解析评论内容
        """
        item = Item()
        itemloader = ItemLoader(item=item, selector=response)
        for field in self.forum_xpath:
            item.fields[field] = Field()
            if 'main_body' in field:
                # Strip the page body down to plain text
                body_re = re.compile(r'<html><body><.*?>(.*?)</body></html>',
                                     re.S | re.M)
                content = ''.join(body_re.findall(response.text))
                content = re.sub(r'<script.*?>.*?</script>', '', content,
                                 flags=re.S)
                content = re.sub(r'[\r\n]', '', content)
                content = re.sub(r'<div .*?>.*?</div>', '', content)
                # flags must be passed by keyword: the fourth positional
                # argument of re.sub is `count`, not `flags`
                content = re.sub(r'<style .*?>.*?</style>', '', content,
                                 flags=re.S | re.M)
                content = re.sub(r'&.*?;', '', content)   # drop HTML entities
                content = re.sub(r'<.*?>', '', content, flags=re.M | re.I)
                content = re.sub('  ', '', content)
                itemloader.add_value(field, content)
            elif 'content_url' in field:
                itemloader.add_value(field, response.url)
            else:
                itemloader.add_xpath(field, self.forum_xpath[field])

        item = self.format_item(itemloader.load_item())

        yield item
Example #3
def test_add_stats_item_scraped_count_by_item_type(spider):
    for _ in range(15):
        spider.crawler.signals.send_catch_log_deferred(
            signal=signals.item_scraped,
            item={"_type": "regular_dict"},
            response="",
            spider=spider,
        )

    for _ in range(20):
        spider.crawler.signals.send_catch_log_deferred(
            signal=signals.item_scraped,
            item=Item(),
            response="",
            spider=spider,
        )

    for _ in range(25):
        spider.crawler.signals.send_catch_log_deferred(
            signal=signals.item_scraped,
            item=TestItem(),
            response="",
            spider=spider,
        )

    stats = spider.crawler.stats.get_stats()

    assert stats.get("spidermon_item_scraped_count") == 60
    assert stats.get("spidermon_item_scraped_count/dict") == 15
    assert stats.get("spidermon_item_scraped_count/Item") == 20
    assert stats.get("spidermon_item_scraped_count/TestItem") == 25
Example #4
def PlayerRow(**fields):
    item = Item()
    for field in all_fields:
        item.fields[field] = Field()
    for field, value in fields.items():
        item[field] = value
    return item
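A hypothetical usage of the factory above, assuming all_fields includes 'name' and 'team':

row = PlayerRow(name='Alice', team='Reds')
print(row['name'], row['team'])   # -> Alice Reds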
Example #5
    def parse_user(self, response):
        """
        根据返回的 response 进行数据解析
        :param response: scrapy 框架返回的响应
        """
        result = {
            'user_name': response.meta['user_name'],
            'title': response.meta['title'],
            'date': response.meta['date'],
            'main_body': response.meta['main_body'],
            'content_url': response.meta['content_url'],
            'brand': response.meta['brand']
        }

        for content in response.xpath(self.user_list_xpath):
            item = Item()
            item_loader = ItemLoader(item=item, selector=content)
            for field in self.user_xpath:
                item.fields[field] = Field()
                if 'user_url' in field:
                    item_loader.add_value(field, response.url)
                else:
                    item_loader.add_xpath(field, self.user_xpath[field])

            result.update(item_loader.load_item())
            item = self.format_item(result)
            yield item

        # User comments
        user_comment = response.xpath('.//ul/li[@class="Comments"]/a/@href').extract()
        if user_comment:
            yield Request(self.url + user_comment[0], self.parse_comment)
Example #6
 def parse_item(self, response):
     item = Item()
     item.fields['url'] = Field()
     item.fields['url_md5'] = Field()
     item.fields['title'] = Field()
     item.fields['pubtime'] = Field()
     item.fields['content'] = Field()
     item.fields['author'] = Field()
     item.fields['site_name'] = Field()
     l = ItemLoader(item=item, response=response)
     url = response.url
     md5 = hashlib.md5()
     l.add_value('url', url)
     md5.update(url.encode('utf-8'))  # hashlib requires bytes in Python 3
     url_md5 = md5.hexdigest()
     l.add_value('url_md5', url_md5)
     l.add_xpath('title', "//h2/text()", MapCompose(str.strip))
     l.add_xpath('pubtime', "//p[@class='time']/span[1]/text()",
                 MapCompose(str.strip))
     l.add_xpath('content', "//div[@class='content']//text()",
                 MapCompose(str.strip))
     l.add_xpath('author', "//p[@class='time']/span[last()]/text()",
                 MapCompose(str.strip))
     l.add_value("site_name", "网易号")
     yield l.load_item()
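Note that hashlib in Python 3 only accepts bytes, hence the .encode('utf-8') above. For an arbitrary example URL, the whole hash can also be computed in one line:

import hashlib
url_md5 = hashlib.md5('http://example.com/news/1'.encode('utf-8')).hexdigest()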
Example #7
    def parse(self, response):
        """
        根据返回的 response 进行数据解析
        :param response: scrapy 框架返回的响应
        """
        keyword = response.meta['keyword']
        for complaint in response.xpath(self.list_xpath):
            item = Item()
            item_loader = ItemLoader(item=item, selector=complaint)
            for field in self.content_xpath:
                item.fields[field] = Field()
                if 'content_url' in field:
                    item_loader.add_value(field, response.url)
                elif 'brand' in field:
                    item_loader.add_value(field, keyword)
                else:
                    item_loader.add_xpath(field, self.content_xpath[field])

            # User link
            user_id = complaint.xpath('.//span[@class="Author"]/a/@href').extract()
            for uid in user_id:
                yield Request(self.url + uid, self.parse_user, meta=dict(item_loader.load_item()))

        # Next page
        next_page = response.xpath('//div[@id="PagerBefore"]/a[last()]/@href').extract()
        if next_page:
            yield Request(self.url + next_page[0], meta={'keyword': keyword}, callback=self.parse)
Example #8
 def parse(self, response):
     item = Item()
     sel = Selector(response)
     fields = json.loads(self.conf.get("fields"))
     rules = json.loads(self.conf.get("rules"))
     loops = rules.get("rules").get("rules_listxpath")
     if fields.get("fields", "") == "":
         logging.error("No field parsing rules configured!")
         yield item
         return
     item.fields["url"] = Field()
     item.fields["spider_jobid"] = Field()
     item["spider_jobid"] = self.spider_jobid
     item.fields['word'] = Field()
     item['word'] = response.meta.get("word")
     # Build Field objects and XPath rules from the dynamic field config (method 1)
     for loop in sel.xpath("{}".format(loops)):
         item['url'] = loop.xpath(fields.get("fields").get("url").get("xpath")).extract()
         for k in loadMySQL(self.conf.get("spider_name")):
             rule = fields.get("fields").get(k[2])
             if rule is not None:
                 item.fields[k[2]] = Field()
                 # dict.keys() is not indexable in Python 3; take the first key
                 rule_type = next(iter(rule))
                 if rule_type == "xpath":
                     item[k[2]] = loop.xpath(rule.get("xpath")).extract()
                 elif rule_type == "value":
                     item[k[2]] = "{}".format(rule.get("value"))
         yield item
Example #9
class PostItem(Item):
    title = Field()
    content = Field()
    url = Field()
    user = Field()  # fields must be declared with Field(); Item() here was a bug
    fromurl = Field()
    postdate = Field()
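Declared fields behave like a fixed dict schema: keys can be set on instances, and anything not declared raises a KeyError. A quick hypothetical usage:

post = PostItem(title='Hello', url='http://example.com/post/1')
post['content'] = 'body text'
# post['tags'] = [...]  would raise KeyError: 'tags' is not a declared field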
Example #10
def describe_request_result_handling():
    single_request = Request("http://test.com")
    mixed_requests = [ Request("http://test.com"), FormRequest("http://test2.com") ]
    complete_mix = mixed_requests + [ Item() ]

    def it_can_extract_request_objects():
        assert requests_in_parse_result([single_request]) == [single_request]

    def it_tolerates_None():
        assert requests_in_parse_result(None) == []
        assert items_in_parse_result(None) == []
        assert count_requests_in_parse_result(None) == 0
        assert count_items_in_parse_result(None) == 0

    def it_tolerates_single_elements():
        assert requests_in_parse_result(single_request) == [single_request]
        assert items_in_parse_result(single_request) == []

    def it_tolerates_and_sorts_out_items_mixed_in_between():
        assert requests_in_parse_result(complete_mix) == mixed_requests

    def it_tolerates_different_request_types():
        assert requests_in_parse_result(mixed_requests) == mixed_requests

    def it_extracts_urls_from_requests():
        urls = urls_from_requests(complete_mix)
        assert len(urls) == 2
        assert "http://test.com" in urls
        assert "http://test2.com" in urls

    def it_counts_the_requests_and_other_results():
        assert count_requests_in_parse_result(complete_mix) == 2
        assert count_items_in_parse_result(complete_mix) == 1
Example #11
 def parse(self, response):
     item = Item()
     fields = json.loads(self.conf.get("fields"))
     l = ItemLoader(item, response)
     item.fields["url"] = Field()
     item.fields["spider_jobid"] = Field()
     l.add_value("url", response.url)
     l.add_value("spider_jobid", self.spider_jobid)
     # Build Field objects and XPath rules from the dynamic field config (method 1)
     for k in self.keys:
         if fields.get("fields", "") == "":
             logging.error("No field parsing rules configured!")
             return l.load_item()
         rule = fields.get("fields").get(k)
         if rule is not None:
             item.fields[k] = Field()
             # dict.keys() is not indexable in Python 3; take the first key
             rule_type = next(iter(rule))
             if rule_type == "xpath":
                 l.add_xpath(k, rule.get("xpath"), MapCompose(str.strip))
             elif rule_type == "value":
                 if rule.get("value") == "{TODAY}":
                     l.add_value(k, "{}".format(datetime.now()))
                 else:  # do not also add the literal "{TODAY}" marker
                     l.add_value(k, "{}".format(rule.get("value")))
     return l.load_item()
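Examples #8, #11 and #12 all assume the JSON held in self.conf's "fields" entry maps each field name to either an "xpath" rule or a literal "value". A sketch of that assumed shape, with hypothetical field names:

import json

conf_fields = json.dumps({
    "fields": {
        "title":  {"xpath": "//h1/text()"},
        "source": {"value": "example-site"},
    }
})
fields = json.loads(conf_fields)
rule = fields["fields"]["title"]
rule_type = next(iter(rule))          # "xpath" or "value", as tested above
print(rule_type, rule[rule_type])     # -> xpath //h1/text()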
Example #12
 def parse_item(self, response):
     item = Item()
     word = response.meta['word']
     fields = json.loads(self.conf.get("fields"))
     l = ItemLoader(item, response)
     if fields.get("fields", "") == "":
         logging.error("No field parsing rules configured!")
         return l.load_item()
     item.fields["url"] = Field()
     item.fields["spider_jobid"] = Field()
     l.add_value("url", response.url)
     l.add_value("spider_jobid", self.spider_jobid)
     item.fields['word'] = Field()
     l.add_value('word', word)
     # Build Field objects and XPath rules from the dynamic field config (method 1)
     for k in loadMySQL(self.name_spider)['fields'].keys():
         rule = fields.get("fields").get(k)
         if rule is not None:
             item.fields[k] = Field()
             # dict.keys() is not indexable in Python 3; take the first key
             rule_type = next(iter(rule))
             if rule_type == "xpath":
                 l.add_xpath(k, rule.get("xpath"), MapCompose(str.strip))
             elif rule_type == "value":
                 l.add_value(k, "{}".format(rule.get("value")))
     return l.load_item()
Example #13
    def parse(self, response):
        """
        解析网页获取评论
        :param response:响应内容
        """
        search = response.meta['keyword']
        # Comment pages
        for complaint in response.xpath(self.list_xpath):
            item = Item()
            item_loader = ItemLoader(item=item, selector=complaint)
            for field in self.content_xpath:
                item.fields[field] = Field()
                if 'content_url' in field:
                    item_loader.add_value(field, response.url)
                elif 'brand' in field:
                    item_loader.add_value(field, search)
                else:
                    item_loader.add_xpath(field, self.content_xpath[field])
            # Build the user_url link
            uid = complaint.xpath('.//tr/td[@class="small"]/a/@href').extract()
            if uid:
                yield Request(self.urls + uid[0],
                              self.parse_user,
                              meta=dict(item_loader.load_item()),
                              dont_filter=True)

        # Next page of content
        next_page = response.xpath(
            '//div[@class="pagelinks"]/a[last()]/@href').extract()
        for page in next_page:
            yield Request(self.urls + page,
                          self.parse,
                          meta={'keyword': search})
Example #14
 def parse_item(self, response):
     item = Item()
     item.fields['url'] = Field()
     item.fields['url_md5'] = Field()
     item.fields['title'] = Field()
     item.fields['pubtime'] = Field()
     item.fields['content'] = Field()
     item.fields['author'] = Field()
     item.fields['site_name'] = Field()
     l = ItemLoader(item=item, response=response)
     url = response.url
     url = url.replace("http://rym.quwenge.com/baidu_tiaozhuan.php?url=",
                       "")
     md5 = hashlib.md5()
     l.add_value('url', url)
     md5.update(url.encode('utf-8'))  # hashlib requires bytes in Python 3
     url_md5 = md5.hexdigest()
     l.add_value('url_md5', url_md5)
     l.add_xpath('title', "//h1/text()", MapCompose(str.strip))
     l.add_xpath('pubtime', "//span[@class='read']/text()",
                 MapCompose(str.strip))
     l.add_xpath('content', "//div[@id='content']//text()",
                 MapCompose(str.strip))
     l.add_xpath('author', "//div[@class='name']/text()",
                 MapCompose(str.strip))
     l.add_value("site_name", "百家号")
     yield l.load_item()
Example #15
 def parse_item(self, response):
     item = Item()
     item.fields['url'] = Field()
     item.fields['url_md5'] = Field()
     item.fields['title'] = Field()
     item.fields['pubtime'] = Field()
     item.fields['content'] = Field()
     item.fields['author'] = Field()
     item.fields['author_url'] = Field()
     item.fields['site_name'] = Field()
     l = ItemLoader(item=item, response=response)
     url = response.url
     md5 = hashlib.md5()
     l.add_value('url', url)
     md5.update(url.encode('utf-8'))  # hashlib requires bytes in Python 3
     url_md5 = md5.hexdigest()
     l.add_value('url_md5', url_md5)
     l.add_xpath('title', "//h1/text()", MapCompose(str.strip))
     l.add_xpath(
         'pubtime',
         "//p[@class='clearfix']//span/text() | //div[@class='titleLine-gY7DniPB']//span/text()",
         MapCompose(str.strip))
     l.add_xpath('content', "//div[@class='text-3zQ3cZD4']//text()",
                 MapCompose(str.strip))
     l.add_xpath(
         'author',
         "//p[@class='clearfix']/a[2]/text() | //div[@class='titleLine-gY7DniPB']/p/a[2]/text()",
         MapCompose(str.strip))
     l.add_xpath('author_url',
                 "//div[@class='titleLine-gY7DniPB']/p/a[2]/@href",
                 MapCompose(str.strip))
     l.add_value("site_name", "大风号")
     yield l.load_item()
Example #16
    def test_mpm_middleware(self):
        # create fake response
        a = MagicMock()
        a.meta = {'key1': 'value1', 'key2': 'value2'}

        yield_count = 0
        # test all types of results from a spider
        # dicts, items, or requests
        test_list = [{}, Item(), Request('http://istresearch.com')]

        for item in self.mpm.process_spider_output(a, test_list, MagicMock()):
            if isinstance(item, Request):
                self.assertEqual(a.meta, item.meta)
            yield_count += 1

        self.assertEqual(yield_count, 3)

        # 1 debug for the method, 1 debug for the request
        self.assertEqual(self.mpm.logger.debug.call_count, 2)

        # test meta unchanged if already exists
        r = Request('http://aol.com')
        r.meta['key1'] = 'othervalue'

        for item in self.mpm.process_spider_output(a, [r], MagicMock()):
            # key1's value1 did not pass through, since it was already set
            self.assertEqual(item.meta['key1'], 'othervalue')
            # key2 was not set, therefore it passed through
            self.assertEqual(item.meta['key2'], 'value2')
Example #17
    def test_process_not_trackable(self):
        pipeline = ItemTrackerPipeline.from_crawler(self.crawler)
        pipeline.storage = mock.Mock()

        expected = Item()
        found = pipeline.process_item(expected, self.spider)
        self.assertEqual(expected, found)

        pipeline.storage.assert_not_called()
Example #18
 def second_parse(self, response):
     href_list = response.xpath('//ul[@class="wp100"]/li//div/p[@class="fs22"]/a/@href')
     print(len(href_list), response.url)
     for href in href_list:
         item = Item()
         next_url = 'http://www.sinyi.com.cn/' + href.extract().split('/', 1)[1].split('&cookieuid=')[0]
         item.fields['HouseUrl'] = Field()
         item['HouseUrl'] = next_url
         yield scrapy.Request(next_url, callback=self.third_parse, meta={'item': item})
Example #19
    def housing_handle(self, response):
        item = response.meta.get("item")
        item1 = Item()
        PropertyAddress = response.xpath(
            "//div[@class='detailDesc']/text()").extract_first()
        PriceUnit = response.xpath(
            "//span[@class='xiaoquUnitPrice']/text()").extract_first()
        detail_community = response.xpath("//div[@class='xiaoquInfo']")
        BuildedTime = detail_community.xpath(
            "./div[@class='xiaoquInfoItem'][1]/span[@class='xiaoquInfoContent']/text()"
        ).extract_first()
        BuildingType = detail_community.xpath(
            "./div[@class='xiaoquInfoItem'][2]/span[@class='xiaoquInfoContent']/text()"
        ).extract_first()
        PropertyFee = detail_community.xpath(
            "./div[@class='xiaoquInfoItem'][3]/span[@class='xiaoquInfoContent']/text()"
        ).extract_first()
        PropertyCompany = detail_community.xpath(
            "./div[@class='xiaoquInfoItem'][4]/span[@class='xiaoquInfoContent']/text()"
        ).extract_first()
        Developers = detail_community.xpath(
            "./div[@class='xiaoquInfoItem'][5]/span[@class='xiaoquInfoContent']/text()"
        ).extract_first()
        TotalBuilding = detail_community.xpath(
            "./div[@class='xiaoquInfoItem'][6]/span[@class='xiaoquInfoContent']/text()"
        ).extract_first()
        TotalHouseholds = detail_community.xpath(
            "./div[@class='xiaoquInfoItem'][7]/span[@class='xiaoquInfoContent']/text()"
        ).extract_first()
        NearbyStores = detail_community.xpath(
            "./div[@class='xiaoquInfoItem'][8]/span[@class='xiaoquInfoContent']"
        ).xpath("string(.)").extract_first()
        item1.fields["PropertyAddress"] = Field()
        item1["PropertyAddress"] = PropertyAddress
        item1.fields["PriceUnit"] = Field()
        item1["PriceUnit"] = PriceUnit

        item1.fields["BuildedTime"] = Field()
        item1["BuildedTime"] = BuildedTime
        item1.fields["BuildingType"] = Field()
        item1["BuildingType"] = BuildingType
        item1.fields["PropertyFee"] = Field()
        item1["PropertyFee"] = PropertyFee
        item1.fields["PropertyCompany"] = Field()
        item1["PropertyCompany"] = PropertyCompany
        item1.fields["Developers"] = Field()
        item1["Developers"] = Developers
        item1.fields["TotalBuilding"] = Field()
        item1["TotalBuilding"] = TotalBuilding
        item1.fields["TotalHouseholds"] = Field()
        item1["TotalHouseholds"] = TotalHouseholds
        item1.fields["NearbyStores"] = Field()
        item1["NearbyStores"] = NearbyStores
        item1.update(item)
        yield item1
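The item1.update(item) call above succeeds only because Item.fields is a class-level dict: every item.fields[...] = Field() in these examples registers the field on the shared Item class, and update() goes through __setitem__, which rejects unregistered keys. A minimal sketch of that behaviour:

from scrapy.item import Item, Field

a, b = Item(), Item()
a.fields['x'] = Field()   # mutates the class-level fields dict shared by a and b
b['x'] = 1                # now legal on every plain Item instance
a.update(b)
print(a)                  # {'x': 1}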
Example #20
 def setUp(self):
     self.url = 'http://localhost'
     self.kwargs = {'url': self.url, 'dont_filter': True}
     self.crawler = MagicMock()
     self.spider = MetaSpider.from_crawler(self.crawler)
     self.crawler.spider = self.spider
     self.crawl_manager = self.create_crawl_manager()
     self.crawl_manager.crawler = self.crawler
     self.item = Item()
     self.response = Response('http://localhost')
     self.another_spider = MetaSpider.from_crawler(self.crawler)
Example #21
def describe_item_result_handling():
    single_item = Item()
    mixed_items = [ Item(), MyItem() ]
    complete_mix = mixed_items + [ Request("http://test.com") ]

    def it_can_extract_item_objects():
        assert items_in_parse_result([single_item]) == [single_item]

    def it_tolerates_single_elements():
        assert items_in_parse_result(single_item) == [single_item]
        assert requests_in_parse_result(single_item) == []

    def it_tolerates_items_mixed_in_between():
        assert items_in_parse_result(complete_mix) == mixed_items

    def it_tolerates_different_item_types():
        assert items_in_parse_result(mixed_items) == mixed_items

    def it_counts_the_items_and_other_results():
        assert count_requests_in_parse_result(complete_mix) == 1
        assert count_items_in_parse_result(complete_mix) == 2
Example #22
 def handle_1(self, response):
     item1 = response.meta.get("item1")
     req = response.meta.get("req")
     fragments = re.findall(r"url \+= '(.*?)'", response.text, re.S)
     detail_url = ''.join(fragments)  # the page assembles the real URL from JS chunks
     item2 = Item()
     item2.fields["NewUrl"] = Field()
     item2["NewUrl"] = detail_url
     item2.update(item1)
     yield scrapy.Request(url=detail_url, callback=self.parse, meta={"req": req, "item2": item2, "last_page": True})
Example #23
    def handle_1(self, response):
        item = response.meta.get("item")
        item1 = Item()
        PropertyAddress = response.xpath(
            "//div[contains(@class,'rent-top')]/a/text()").extract_first()
        PriceUnit = response.xpath(
            "//div[contains(@class,'junjia')]/span/text()").extract_first()
        ls_detail = response.xpath(
            "//div[@class='xqfangs detail_bor_bottom']/ul[@class='clear']/li/text()"
        ).extract()
        BuildedTime = None
        BuildingType = None
        for detail in ls_detail:
            if "年" in detail:
                BuildedTime = detail
            else:
                BuildingType = detail
        PropertyCompany = response.xpath(
            "//ul/li[@class='wuyes']/em/text()").extract_first()
        Developers = response.xpath(
            "//ul/li[@class='kaifas']/em/text()").extract_first()
        TotalBuilding = response.xpath(
            "//div[@class='xqsaleinfo']/ul/li[1]/span/text()").extract_first()
        TotalHouseholds = response.xpath(
            "//div[@class='xqsaleinfo']/ul/li[2]/span/text()").extract_first()
        NearbyStores = response.xpath(
            "//div[@class='xqsaleinfo']/ul/li[6]/span/text()").extract_first()
        AroundTraffic = response.xpath(
            "//div[@class='xqsaleinfo']/ul/li[5]/span/text()").extract_first()
        item1.fields["PropertyAddress"] = Field()
        item1["PropertyAddress"] = PropertyAddress
        item1.fields["PriceUnit"] = Field()
        item1["PriceUnit"] = PriceUnit

        item1.fields["BuildedTime"] = Field()
        item1["BuildedTime"] = BuildedTime
        item1.fields["BuildingType"] = Field()
        item1["BuildingType"] = BuildingType
        item1.fields["PropertyCompany"] = Field()
        item1["PropertyCompany"] = PropertyCompany
        item1.fields["Developers"] = Field()
        item1["Developers"] = Developers
        item1.fields["TotalBuilding"] = Field()
        item1["TotalBuilding"] = TotalBuilding
        item1.fields["TotalHouseholds"] = Field()
        item1["TotalHouseholds"] = TotalHouseholds
        item1.fields["NearbyStores"] = Field()
        item1["NearbyStores"] = NearbyStores
        item1.fields["AroundTraffic"] = Field()
        item1["AroundTraffic"] = AroundTraffic
        item1.update(item)
        yield item1
Example #24
 def start_requests(self):
     for kw in self.kw_list:
         start_page = 1
         for page in range(start_page, 11):
             item = Item()
             item.fields["SearchWord"] = Field()
             item.fields["Page"] = Field()
             item["SearchWord"] = kw
             item["Page"] = page
             start_url = 'https://weixin.sogou.com/weixin?query={}&type=2&page={}&ie=utf8'.format(
                 parse.quote(kw), str(page))
             yield scrapy.Request(url=start_url, callback=self.parse,
                                  meta={"start_url": start_url, "item": item})
Example #25
 def page_handle(self, response):
     region = response.meta.get("region")
     plate = response.meta.get("plate")
     plate_url = response.meta.get("plate_url")
     try:
         housing_num_flag = response.xpath(
             "//h2[contains(@class,'total')]/span/text()").extract_first(
             ).strip()
     except AttributeError:  # extract_first() returned None
         housing_num_flag = '0'
     if housing_num_flag != "0":
         housing_list = response.xpath(
             "//div[@class='leftContent']/ul[@class='sellListContent']/li[contains(@class, 'clear')]/div[@class='info clear']"
         )
         for housing in housing_list:
             # Extract the list-page fields
             item = Item()
             item.fields["AreaName"] = Field()
             item["AreaName"] = region
             item.fields["PlateName"] = Field()
             item["PlateName"] = plate
             housing_url = housing.xpath(
                 "./div[@class='title']/a/@href").extract_first()
             HouseDesc = housing.xpath(
                 "./div[@class='title']/a/text()").extract_first()
             item.fields["HouseDesc"] = Field()
             item["HouseDesc"] = HouseDesc
             item.fields["HouseUrl"] = Field()
             item["HouseUrl"] = housing_url
             yield scrapy.Request(url=housing_url,
                                  callback=self.housing_handle,
                                  meta={"item": deepcopy(item)},
                                  headers=self.get_headers())
         page_dict_handle = response.xpath(
             "//div[contains(@class, 'page-box')]/@page-data"
         ).extract_first()
         page_dict = json.loads(page_dict_handle)
         total_page = page_dict.get("totalPage")
         current_page = page_dict.get("curPage")
         if current_page < total_page:
             next_page = plate_url + "pg" + str(current_page + 1) + "/"
             yield scrapy.Request(
                 url=next_page,
                 callback=self.page_handle,
                 meta={
                     "plate_url": plate_url,
                     'region': region,
                     "plate": plate
                 },
                 headers=self.get_headers(),
             )
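The deepcopy(item) in the meta dict above is load-bearing: without it, every pending request would share one item object, and later loop iterations would clobber its fields before the callbacks run. A minimal sketch of the hazard, with a hypothetical HouseItem:

from copy import deepcopy
from scrapy.item import Item, Field

class HouseItem(Item):      # hypothetical item with one declared field
    HouseDesc = Field()

shared, copied = [], []
item = HouseItem()
for desc in ('flat A', 'flat B'):
    item['HouseDesc'] = desc
    shared.append(item)            # every entry references the same object
    copied.append(deepcopy(item))  # each entry gets its own snapshot

print([i['HouseDesc'] for i in shared])  # ['flat B', 'flat B'] - clobbered
print([i['HouseDesc'] for i in copied])  # ['flat A', 'flat B'] - preserved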
Example #26
    def parse(self, response):
        req = response.meta.get("req")
        item = response.meta.get("item")
        url_list_handle = Selector(text=response.text)

        url_list = url_list_handle.xpath("//div[@class='txt-box']/h3/a/@href").extract()
        for index, url in enumerate(url_list):
            item1 = Item()
            item1.fields["Located"] = Field()
            item1["Located"] = index
            url = self.base_url + url
            url = self.get_real_url_handle(url)
            item1.update(item)
            yield scrapy.Request(url=url, callback=self.handle_1, meta={"req": req, "item1": item1})
Example #27
def test_hs_middlewares(hs_downloader_middleware, hs_spider_middleware):
    assert hs_spider_middleware._seen_requests == WeakKeyDictionary()
    assert hs_downloader_middleware._seen_requests == WeakKeyDictionary()
    assert hs_spider_middleware._seen_requests is hs_downloader_middleware._seen_requests

    spider = Spider('test')
    url = 'http://resp-url'
    request_0 = Request(url)
    response_0 = Response(url)

    hs_downloader_middleware.process_request(request_0, spider)

    assert HS_REQUEST_ID_KEY not in request_0.meta
    assert HS_PARENT_ID_KEY not in request_0.meta
    assert len(hs_spider_middleware._seen_requests) == 0
    assert len(hs_downloader_middleware._seen_requests) == 0

    hs_downloader_middleware.process_response(request_0, response_0, spider)

    assert request_0.meta[HS_REQUEST_ID_KEY] == 0
    assert request_0.meta[HS_PARENT_ID_KEY] is None
    assert hs_spider_middleware._seen_requests[request_0] == 0

    response_0.request = request_0
    request_1 = Request(url)
    request_2 = Request(url)
    item1 = {}
    item2 = Item()
    output = [request_1, request_2, item1, item2]
    processed_output = list(
        hs_spider_middleware.process_spider_output(response_0, output, spider))

    assert processed_output[0] is request_1
    assert request_1.meta[HS_PARENT_ID_KEY] == 0
    assert processed_output[1] is request_2
    assert request_2.meta[HS_PARENT_ID_KEY] == 0
    assert processed_output[2] is item1
    assert processed_output[3] is item2

    response_1 = Response(url)
    hs_downloader_middleware.process_request(request_1, spider)
    hs_downloader_middleware.process_response(request_1, response_1, spider)
    assert request_1.meta[HS_REQUEST_ID_KEY] == 1
    assert request_1.meta[HS_PARENT_ID_KEY] == 0

    response_2 = Response(url)
    hs_downloader_middleware.process_request(request_2, spider)
    hs_downloader_middleware.process_response(request_2, response_2, spider)
    assert request_2.meta[HS_REQUEST_ID_KEY] == 2
    assert request_2.meta[HS_PARENT_ID_KEY] == 0
Example #28
 def start_requests(self):
     for url in self.start_urls:
         # create a fresh item per request so meta entries are not shared
         item = Item()
         url_split = url.split(":")
         site_name = url_split[0]
         url = ":".join(url_split[1:-1])
         item.fields["site_name"] = Field()
         item['site_name'] = site_name
         item.fields["source_url"] = Field()
         item['source_url'] = url_split[-1]
         yield scrapy.Request(url=url,
                              meta={"item": item},
                              dont_filter=True,
                              callback=self.parse_url)
Example #29
 def handle_2(self, response):
     item2 = response.meta.get("item2")
     item3 = Item()
     item3.fields["Title"] = Field()
     item3.fields["Content"] = Field()
     res_text = response.text
     res3_handle = Selector(text=res_text)
     # title
     title = res3_handle.xpath('//meta[@property="og:title"]/@content').extract_first()
      try:
          content = res3_handle.xpath("//div[@id='js_content']").xpath("string(.)").extract_first()
          content = content.strip()
      except AttributeError:  # the page has no js_content node
          content = None
     item3["Title"] = title
     item3["Content"] = content
     item3.update(item2)
     yield item3
Example #30
    def second_parse(self, response):
        obj = json.loads(response.text)
        data = self.get_home_data(obj)
        for index, url in enumerate(data['house_url']):
            # TODO: deduplicate pages that have already been crawled
            item = Item()
            item.fields["AreaName"] = Field()
            item.fields["PlateName"] = Field()
            item.fields["HouseUrl"] = Field()
            item.fields["BuildedTime"] = Field()
            item.fields["TimeToRelease"] = Field()
            item.fields["PropertyAddress"] = Field()
            # item.fields["Floor"] = Field()
            item.fields["TotalPrice"] = Field()
            item.fields["BuildingSquare"] = Field()
            item.fields["PropertyCommunity"] = Field()
            item.fields["HasElevator"] = Field()

            item['AreaName'] = data['area'][index]
            item['PlateName'] = data['road'][index]
            flag = data['house_url']
            if flag:
                house_url = flag[index]
                if house_url:
                    item['HouseUrl'] = self.base_url + house_url
                    item['BuildedTime'] = data['build_time'][index]
                    item['TimeToRelease'] = time.strftime(
                        "%Y-%m-%d",
                        time.localtime(
                            int(str(data['release_time'][index])[:-3])))
                    item['PropertyAddress'] = data['addr'][index]
                    # item['Floor'] = data['floor'][index]
                    item['TotalPrice'] = data['total_price'][index]
                    item['BuildingSquare'] = data['build_size'][index]
                    item['PropertyCommunity'] = data['community'][index]
                    item['HasElevator'] = data['elevator'][index]
                    try:
                        next_url = "https://www.dafangya.com" + url
                    except TypeError:  # url may be None
                        continue
                    yield scrapy.Request(url=next_url,
                                         callback=self.third_parse,
                                         meta={'item': item},
                                         dont_filter=True)