Esempio n. 1
0
            detail_list = soup.select('div.details li')
            if len(detail_list) == 0:
                soup = bs(response.m_response.content, 'html5lib')
                detail_list = soup.select('div.details li')
            mileage = detail_list[0].select('span')[0].text.replace('万公里', '')
            first_borad_date = detail_list[1].select('span')[0].text
            gear = detail_list[2].select('span')[0].text.split('/')[0]
            displacement = detail_list[2].select('span')[0].text.split('/')[1]
            price = soup.select('div.car-price ins')[0].text.replace('¥', '')
            crawl_date = time.strftime('%Y-%m-%d', time.localtime(time.time()))

            item = dict()
            item['car'] = car
            item['mileage'] = mileage
            item['first_borad_date'] = first_borad_date
            item['gear'] = gear
            item['displacement'] = displacement
            item['price'] = price
            item['crawl_date'] = crawl_date

            item['province'] = response.request.meta['province']
            item['city'] = response.request.meta['city']
            item['brand'] = response.request.meta['brand']
            item['cars_line'] = response.request.meta['cars_line']
            yield item


if __name__ == '__main__':
    # Build the used-car spider, attach console + text output pipelines,
    # and kick off the crawl. `set_pipeline` is fluent and returns the spider.
    spider = (SpiderCore(Car_Processor())
              .set_pipeline(ConsolePipeline())
              .set_pipeline(TextPipelineCar())
              .start())
Esempio n. 2
0
                                           time.localtime(time.time()))
                item = dict()
                item['avg_price'] = avg_price
                item['estate'] = estate
                item['area'] = area
                item['layout'] = layout
                item['total_price'] = total_price
                item['crawl_date'] = crawl_date

                item['province'] = response.request.meta['province']
                item['city'] = response.request.meta['city']
                item['district'] = response.request.meta['district']
                item['url'] = response.request.url
                yield item

        next_page = soup.select('a#PageControl1_hlk_next')
        if len(next_page) > 0:
            url = response.nice_join(next_page[0]['href'])
            request = Request(url=url,
                              priority=2,
                              callback=self.process_page_2)
            request.meta['province'] = response.request.meta['province']
            request.meta['city'] = response.request.meta['city']
            request.meta['district'] = response.request.meta['district']
            yield request


if __name__ == '__main__':
    # Run the real-estate spider in test mode with console-only output.
    spider = (SpiderCore(Fang_Processor(), test=True)
              .set_pipeline(ConsolePipeline())
              .start())
Esempio n. 3
0
                                           time.localtime(time.time()))
                item = dict()
                item['avg_price'] = avg_price
                item['estate'] = estate
                item['area'] = area
                item['layout'] = layout
                item['total_price'] = total_price
                item['crawl_date'] = crawl_date

                item['province'] = response.request.meta['province']
                item['city'] = response.request.meta['city']
                item['district'] = response.request.meta['district']
                item['url'] = response.request.url
                yield item

        next_page = soup.select('a#PageControl1_hlk_next')
        if len(next_page) > 0:
            url = response.nice_join(next_page[0]['href'])
            request = Request(url=url,
                              priority=2,
                              callback=self.process_page_2)
            request.meta['province'] = response.request.meta['province']
            request.meta['city'] = response.request.meta['city']
            request.meta['district'] = response.request.meta['district']
            yield request


if __name__ == '__main__':
    # Full crawl of the real-estate processor, writing items to both the
    # console and the text-file pipeline.
    spider = (SpiderCore(Fang_Processor())
              .set_pipeline(ConsolePipeline())
              .set_pipeline(TextPipelineFang())
              .start())
Esempio n. 4
0
                              callback=self.process_detail)
            request.meta['title'] = title
            request.meta['shortDes'] = shortDes
            request.meta['img_name'] = img_name
            yield request

    @checkResponse
    def process_pic(self, response):
        """Forward the response payload to the 'save' pipeline unchanged."""
        # The content is passed through as-is (no decoding), so binary
        # image data survives intact.
        image_bytes = response.m_response.content
        yield pipeItem(['save'], image_bytes)

    @checkResponse
    def process_detail(self, response):
        """Extract the publish datetime and source name from a news detail page.

        Yields a dict with 'date_time' and 'newsFrom' to the 'console' and
        'test' pipelines.
        """
        soup = bs(response.m_response.content, 'lxml')

        # Strip the "来源" (source) label — it appears both with and without
        # a trailing space on the page — then split the remaining fields.
        meta_text = soup.select('div.zxxwleft p.zxxw2')[0].text
        meta_text = meta_text.replace('来源: ', '').replace('来源:', '')
        fields = meta_text.split(' ')

        news_source = fields[0].strip()
        # Date and time are separate tokens; a stray '|' may trail the time.
        published_at = fields[1].strip() + ' ' + fields[2].strip().replace(
            '|', '')

        result = dict()
        result['date_time'] = published_at
        result['newsFrom'] = news_source

        yield pipeItem(['console', 'test'], result)


if __name__ == '__main__':
    # Register three named pipelines — console output, picture saving and
    # test capture — then start the crawl.
    (SpiderCore(TEST_Processor())
     .set_pipeline(ConsolePipeline(), 'console')
     .set_pipeline(PicPipeline(), 'save')
     .set_pipeline(TestPipeline(), 'test')
     .start())
Esempio n. 5
0
                result_mobile = result.find(
                    lambda tag: tag.name == 'p' and '电话:' in tag.text).text
                m_result = dict()
                m_result['result_name'] = result_name
                m_result['result_mobile'] = result_mobile.replace('电话:', '')
                m_result['city_name'] = response.request.meta['city_name']
                m_result['category1_name'] = response.request.meta[
                    'category1_name']
                m_result['category2_name'] = response.request.meta['city_name']
                yield m_result
            next_page = soup.find(
                lambda tag: tag.name == 'a' and '下一页' in tag.text)
            if next_page:
                url_splits = response.request.url.split('/')
                url_splits[-1] = next_page['href']
                url = '/'.join(url_splits)
                request = Request(url=url,
                                  priority=1,
                                  callback=self.process_page_1)
                request.meta['city_name'] = response.request.meta['city_name']
                request.meta['category1_name'] = response.request.meta[
                    'category1_name']
                request.meta['category2_name'] = response.request.meta[
                    'category2_name']
                yield request


if __name__ == '__main__':
    # Throttle requests to one every 0.5 s; emit items to the text-file
    # pipeline and the console.
    (SpiderCore(Bendibao_Processor(), time_sleep=0.5)
     .set_pipeline(TextPipelineBendibao())
     .set_pipeline(ConsolePipeline())
     .start())
Esempio n. 6
0
File: main.py Progetto: tuian/Sasila
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# import sys
# import os
#
# sys.path.append(os.path.dirname(os.getcwd()))
from car_processor import Car_Processor
from fang_processor import Fang_Processor
from sasila.system_normal.pipeline.console_pipeline import ConsolePipeline
from sasila.system_normal.spider.spider_core import SpiderCore
from sasila.system_normal.manager import manager
from sasila import system_web

# One spider per processor; both report crawled items to the console.
spider_car = SpiderCore(Car_Processor()).set_pipeline(ConsolePipeline())
spider_fang = SpiderCore(Fang_Processor()).set_pipeline(ConsolePipeline())

# Register every spider with the manager, then launch the web frontend.
for crawler in (spider_car, spider_fang):
    manager.set_spider(crawler)

system_web.start()
Esempio n. 7
0
 def test_car_processor(self):
     """Crawl in test mode and expect each captured item to hold 11 fields."""
     capture = TestPipeline()
     core = SpiderCore(Car_Processor(), test=True)
     core.set_pipeline(ConsolePipeline()).set_pipeline(capture).start()
     self.assertEqual(len(capture.result), 11, '爬取结果,11个字段')
Esempio n. 8
0
 def test_car_processor(self):
     """Crawl the TEST processor and check the captured publish date.

     NOTE(review): the hard-coded '2017' makes this assertion brittle —
     kept as-is to preserve the original test's semantics.
     """
     capture = TestPipeline()
     core = SpiderCore(TEST_Processor(), test=True)
     core.set_pipeline(ConsolePipeline(), 'console')
     core.set_pipeline(PicPipeline(), 'save')
     core.set_pipeline(capture, 'test')
     core.start()
     self.assertIn('2017', capture.result['date_time'])
Esempio n. 9
0
 def test_car_processor(self):
     """Crawl in test mode and verify the captured province is Shanghai."""
     capture = TestPipeline()
     core = SpiderCore(Car_Processor(), test=True)
     core.set_pipeline(ConsolePipeline()).set_pipeline(capture).start()
     self.assertEqual(capture.result['province'], '上海', '爬取结果,省份为上海')