Example 1
 def get_city(self, response):
     if not response.m_response:
         logger.error(response.request.url)
         yield response.request
         return
     if '<script>window.location.href=' in response.m_response.content:
         logger.error(response.m_response.content + "\n" + response.request.url)
         yield response.request
         return
     if response.m_response.content == "":
         request = Request(
                 url="http://www.qichacha.com/search_index?key=%25E5%25B0%258F%25E9%25A2%259D%25E8%25B4%25B7%25E6%25AC%25BE&ajaxflag=1&province=" +
                     response.request.meta["province_id"] + "&",
                 callback="get_all_page", priority=1)
         request.meta["city_name"] = ""
         request.meta["city_id"] = ""
         request.meta["province_name"] = response.request.meta["province_name"]
         request.meta["province_id"] = response.request.meta["province_id"]
         yield request
     else:
         soup = bs(response.m_response.content, "lxml")
         city_list = soup.select("a")
         for city in city_list:
             city_name = city.string.strip()
             city_id = city["data-value"].strip()
             request = Request(
                     url="http://www.qichacha.com/search_index?key=%25E5%25B0%258F%25E9%25A2%259D%25E8%25B4%25B7%25E6%25AC%25BE&ajaxflag=1&province=" +
                         response.request.meta["province_id"] + "&city=" + city_id + "&",
                     callback="get_all_page", priority=1)
             request.meta["city_name"] = city_name
             request.meta["city_id"] = city_id
             request.meta["province_name"] = response.request.meta["province_name"]
             request.meta["province_id"] = response.request.meta["province_id"]
             yield request
Example 2
    def process_page(self, response):
        soup = bs(response.m_response.content, 'lxml')

        zhu_div_list = soup.select('div.zxleft ul li')
        for zhu_div in zhu_div_list:
            detail_url = zhu_div.select('a')[0]['href']
            img_url = zhu_div.select('a img')[0]['src']
            title = zhu_div.select('a img')[0]['alt'].strip()
            shortDes = zhu_div.select('p.zxleft32 a')[0].text

            # Hash a timestamp plus a random number to build a unique
            # file name for the downloaded image.
            md5 = hashlib.md5()
            rand_name = str(time.time()) + str(random.random())
            md5.update(rand_name.encode(encoding='utf-8'))
            img_name = md5.hexdigest() + '.jpg'

            request = Request(url=img_url,
                              priority=1,
                              callback=self.process_pic)
            request.meta['img_name'] = img_name
            yield request

            request = Request(url=detail_url,
                              priority=1,
                              callback=self.process_detail)
            request.meta['title'] = title
            request.meta['shortDes'] = shortDes
            request.meta['img_name'] = img_name
            yield request
Example 3
 def process_page_3(self, response):
     soup = bs(response.m_response.content, 'lxml')
     car_info_list = soup.select('div#a2 ul#viewlist_ul li a.carinfo')
     for car_info in car_info_list:
         url = 'http://www.che168.com' + car_info['href']
         request = Request(url=url,
                           priority=4,
                           callback=self.process_page_4)
         request.meta['province'] = response.request.meta['province']
         request.meta['city'] = response.request.meta['city']
         request.meta['brand'] = response.request.meta['brand']
         request.meta['cars_line'] = response.request.meta['cars_line']
         yield request
     next_page = soup.find(
         lambda tag: tag.name == 'a' and '下一页' in tag.text)
     if next_page:
         url = 'http://www.che168.com' + next_page['href']
         request = Request(url=url,
                           priority=3,
                           callback=self.process_page_3)
         request.meta['province'] = response.request.meta['province']
         request.meta['city'] = response.request.meta['city']
         request.meta['brand'] = response.request.meta['brand']
         request.meta['cars_line'] = response.request.meta['cars_line']
         yield request
Example 4
    def get_all_page(self, response):
        if not response.m_response:
            logger.error(response.request.url)
            yield response.request
            return
        if '<script>window.location.href=' in response.m_response.content:
            logger.error(response.m_response.content + "\n" + response.request.url)
            yield response.request
        else:
            soup = bs(response.m_response.content, "lxml")
            try:
                temp_page = soup.find(lambda tag: tag.name == 'a' and '>' == tag.text).parent.findNextSibling()
                if temp_page:
                    page = temp_page.select_one("a")
                    if page:
                        total_page = int(page.string.strip().replace("...", ""))
                    else:
                        total_page = 1
                else:
                    temp_page = soup.find(lambda tag: tag.name == 'a' and '>' == tag.text).parent.findPreviousSibling()
                    if temp_page:
                        page = temp_page.select_one("a")
                        if page:
                            total_page = int(page.string.strip().replace("...", ""))
                        else:
                            total_page = 1
                    else:
                        total_page = 1
            except Exception:
                # Pager missing or unparseable; fall back to a single page.
                total_page = 1

            now_page = 1
            while now_page <= total_page:
                if response.request.meta["city_id"] == "":
                    request = Request(
                            url="http://www.qichacha.com/search_index?key=%25E5%25B0%258F%25E9%25A2%259D%25E8%25B4%25B7%25E6%25AC%25BE&ajaxflag=1&province=" +
                                response.request.meta["province_id"] + "&p=" + str(now_page) + "&",
                            callback="get_content", priority=2)
                    request.meta["city_name"] = response.request.meta["city_name"]
                    request.meta["city_id"] = response.request.meta["city_id"]
                    request.meta["province_name"] = response.request.meta["province_name"]
                    request.meta["province_id"] = response.request.meta["province_id"]
                    yield request
                else:
                    request = Request(
                            url="http://www.qichacha.com/search_index?key=%25E5%25B0%258F%25E9%25A2%259D%25E8%25B4%25B7%25E6%25AC%25BE&ajaxflag=1&p=" + str(
                                    now_page) + "&province=" +
                                response.request.meta["province_id"] + "&city=" + response.request.meta[
                                    "city_id"] + "&",
                            callback="get_content", priority=2)
                    request.meta["city_name"] = response.request.meta["city_name"]
                    request.meta["city_id"] = response.request.meta["city_id"]
                    request.meta["province_name"] = response.request.meta["province_name"]
                    request.meta["province_id"] = response.request.meta["province_id"]
                    yield request
                now_page += 1
Example 5
 def process(self, response):
     city_crawl_list = {u'成都', u'南京', u'苏州', u'无锡', u'南昌', u'济南', u'青岛', u'广州', u'东莞'}
     soup = bs('''<a href="http://shop1.fang.com/" style="width:40px;padding:4px 0 4px 8px;">北京</a>
                  <a href="http://shop.sh.fang.com/" style="width:40px;padding:4px 0 4px 8px;">上海</a>
                  <a href="http://shop.gz.fang.com/" style="width:40px;padding:4px 0 4px 8px;">广州</a>
                  <a href="http://shop.sz.fang.com/" style="width:40px;padding:4px 0 4px 8px;">深圳</a>
                  <a href="http://shop.tj.fang.com/" style="width:40px;padding:4px 0 4px 8px;">天津</a>
                  <a href="http://shop.cq.fang.com/" style="width:40px;padding:4px 0 4px 8px;">重庆</a>
                  <a href="http://shop.cd.fang.com/" style="width:40px;padding:4px 0 4px 8px;">成都</a>
                  <a href="http://shop.suzhou.fang.com/" style="width:40px;padding:4px 0 4px 8px;">苏州</a>
                  <a href="http://shop.wuhan.fang.com/" style="width:40px;padding:4px 0 4px 8px;">武汉</a>
                  <a href="http://shop.xian.fang.com/" style="width:40px;padding:4px 0 4px 8px;">西安</a>
                  <a href="http://shop.dg.fang.com/" style="width:40px;padding:4px 0 4px 8px;">东莞</a>
                  <a href="http://shop.km.fang.com/" style="width:40px;padding:4px 0 4px 8px;">昆明</a>
                  <a href="http://shop.hz.fang.com/" style="width:40px;padding:4px 0 4px 8px;">杭州</a>
                  <a href="http://shop.jn.fang.com/" style="width:40px;padding:4px 0 4px 8px;">济南</a>
                  <a href="http://shop.wuxi.fang.com/" style="width:40px;padding:4px 0 4px 8px;">无锡</a>
                  <a href="http://shop.zz.fang.com/" style="width:40px;padding:4px 0 4px 8px;">郑州</a>
                  <a href="http://shop.nc.fang.com/" style="width:40px;padding:4px 0 4px 8px;">南昌</a>
                  <a href="http://shop.qd.fang.com/" style="width:40px;padding:4px 0 4px 8px;">青岛</a>
                  <a href="http://shop.sjz.fang.com/" style="width:40px;padding:4px 0 4px 8px;">石家庄</a>
                  <a href="http://shop.nanjing.fang.com/" style="width:40px;padding:4px 0 4px 8px;">南京</a>
                  <a href="http://shop.dl.fang.com/" style="width:40px;padding:4px 0 4px 8px;">大连</a>''', 'lxml')
     city_list = soup.select('a')
     for city in city_list:
         city_name = city.text
         if city_name in city_crawl_list:
             url = city['href']
             request = Request(url=url, priority=1, callback=self.process_page_1)
             request.meta['city'] = city_name
             yield request
Example 6
class FirstProcessor(BaseProcessor):
    spider_id = 'test'
    spider_name = 'test'
    allowed_domains = ['mzitu.com']
    start_requests = [Request(url="http://www.mzitu.com/")]

    def process(self, response):
        soup = bs(response.m_response.content, 'lxml')
        a_list = soup.select("a")
        for a in a_list:
            if "href" in a.attrs:
                url = response.nice_join(a["href"])
                if response.is_url(url):
                    yield Request(url=url, callback=self.procces2)

    def procces2(self, response):
        if response.m_response:
            soup = bs(response.m_response.content, 'lxml')
            yield soup.title
            a_list = soup.select("a")
            for a in a_list:
                if "href" in a.attrs:
                    url = response.nice_join(a["href"])
                    if response.is_url(url):
                        yield Request(url=url, callback=self.procces2)
        else:
            print(response.request.url)
Example 7
 def process_page_1(self, response):
     if '下暂无网点信息' not in response.m_response.content:
         soup = bs(response.m_response.content, 'lxml')
         results = soup.select('ul.catalist li')
         for result in results:
             result_name = result.select("div.infoschema h3 a")[0].text
             result_mobile = result.find(
                 lambda tag: tag.name == 'p' and '电话:' in tag.text).text
             m_result = dict()
             m_result['result_name'] = result_name
             m_result['result_mobile'] = result_mobile.replace('电话:', '')
             m_result['city_name'] = response.request.meta['city_name']
             m_result['category1_name'] = response.request.meta[
                 'category1_name']
             m_result['category2_name'] = response.request.meta[
                 'category2_name']
             yield m_result
         next_page = soup.find(
             lambda tag: tag.name == 'a' and '下一页' in tag.text)
         if next_page:
             url_splits = response.request.url.split('/')
             url_splits[-1] = next_page['href']
             url = '/'.join(url_splits)
             request = Request(url=url,
                               priority=1,
                               callback=self.process_page_1)
             request.meta['city_name'] = response.request.meta['city_name']
             request.meta['category1_name'] = response.request.meta[
                 'category1_name']
             request.meta['category2_name'] = response.request.meta[
                 'category2_name']
             yield request
Example 8
def request_from_dict(d, processor=None):
    """Create Request object from a dict.

    If a spider is given, it will try to resolve the callbacks looking at the
    spider for methods with the same name.
    """
    cb = d['callback']
    if cb and processor:
        cb = _get_method(processor, cb)
    eb = d['errback']
    if eb and processor:
        eb = _get_method(processor, eb)
    return Request(
        url=to_native_str(d['url']),
        data=d['data'],
        json=d['json'],
        allow_redirects=d['allow_redirects'],
        duplicate_remove=d['duplicate_remove'],
        timeout=d['timeout'],
        callback=cb,
        errback=eb,
        method=d['method'],
        headers=d['headers'],
        cookies=d['cookies'],
        meta=d['meta'],
        priority=d['priority'],
    )
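
The `_get_method` helper used above is not shown among these examples; a minimal sketch, assuming it simply resolves the saved callback name to a bound method on the processor and fails loudly when it is missing:

def _get_method(obj, name):
    # Assumed helper: resolve a callback/errback name to a bound method.
    name = str(name)
    try:
        return getattr(obj, name)
    except AttributeError:
        raise ValueError("Method %r not found in %s" % (name, obj))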
Example 9
class MezituProcessor(BaseProcessor):
    spider_id = 'mzitu'
    spider_name = 'mzitu'
    allowed_domains = ['mzitu.com', 'meizitu.net']
    start_requests = [Request(url='http://www.mzitu.com/xinggan/')]

    rules = (
        Rule(LinkExtractor(
            regex_str=r"http://i.meizitu.net/\d{4}/\d{2}/[0-9a-z]+.jpg"),
             callback="save",
             priority=3),
        Rule(LinkExtractor(regex_str=r"http://www.mzitu.com/\d+"), priority=1),
        Rule(LinkExtractor(regex_str=r"http://www.mzitu.com/\d+/\d+"),
             priority=2),
        Rule(LinkExtractor(regex_str=r"http://www.mzitu.com/xinggan/page/\d+"),
             priority=0),
    )

    def save(self, response):
        if response.m_response:
            if not os.path.exists("img"):
                os.mkdir("img")
            with open("img/" + str(uuid.uuid1()) + ".jpg", 'wb') as fs:
                fs.write(response.m_response.content)
                print("download success!")
Example 10
 def get_pic(self, response):
     if response.m_response:
         li_soup = bs(response.m_response.content, "lxml")
         next_link = li_soup.find(lambda tag: tag.name == 'a' and '下一页»' in tag.text)
         if next_link is not None:
             total_page = int(next_link.find_previous_sibling().text)
             for page in range(1, total_page + 1):
                 yield Request(url=response.request.url + "/" + str(page), callback=self.download_pic, priority=2)
Example 11
 def process(self, response):
     soup = bs(response.m_response.content, 'lxml')
     a_list = soup.select("a")
     for a in a_list:
         if "href" in a.attrs:
             url = response.nice_join(a["href"])
             if response.is_url(url):
                 yield Request(url=url, callback=self.procces2)
Example 12
 def get_page_content(self, response):
     if response.m_response:
         soup = bs(response.m_response.content, 'lxml')
         li_list = soup.select("div.postlist ul#pins li")
         for li in li_list:
             yield Request(url=li.select_one("a").attrs["href"],
                           callback=self.get_pic,
                           priority=1)
Example 13
    def process_page_2(self, response):
        soup = bs(response.m_response.content, 'lxml')
        detail_list = soup.select('div.houseList dl')
        for detail in detail_list:
            estate = detail.select('p.mt15 span.spName')[0].text
            detail_str = detail.select('p.mt10')[0].text
            temp_list = [temp.strip() for temp in detail_str.split('/')]

            if '购物中心/百货' not in detail_str and '层' in detail_str:
                m_type = temp_list[0].replace('类型:', '')
                floor = temp_list[1]
                total_floor = temp_list[2].replace('层', '')
            elif '购物中心/百货' not in detail_str and '层' not in detail_str:
                m_type = temp_list[0].strip().replace('类型:', '')
                floor = '未知'
                total_floor = '未知'
            elif '购物中心/百货' in detail_str and '层' not in detail_str:
                m_type = temp_list[0].replace('类型:', '') + temp_list[1]
                floor = '未知'
                total_floor = '未知'
            elif '购物中心/百货' in detail_str and '层' in detail_str:
                m_type = temp_list[0].replace('类型:', '') + temp_list[1]
                floor = temp_list[2]
                total_floor = temp_list[3].replace('层', '')
            else:
                logger.error('unexpected detail_str: ' + detail_str.strip())
                continue

            area = detail.select('div.area')[0].text.replace('㎡', '').replace(
                '建筑面积', '')
            total_price = detail.select(
                'div.moreInfo p.mt5 span.price')[0].text
            crawl_date = time.strftime('%Y-%m-%d', time.localtime(time.time()))

            item = dict()
            item['estate'] = estate
            item['floor'] = floor
            item['total_floor'] = total_floor
            item['type'] = m_type
            item['area'] = area
            item['total_price'] = total_price
            item['crawl_date'] = crawl_date

            item['city'] = response.request.meta['city']
            item['district'] = response.request.meta['district']
            item['url'] = response.request.url
            yield item

        next_page = soup.select('a#PageControl1_hlk_next')
        if len(next_page) > 0:
            url = response.nice_join(next_page[0]['href']) + '/'
            request = Request(url=url,
                              priority=2,
                              callback=self.process_page_2)
            request.meta['city'] = response.request.meta['city']
            request.meta['district'] = response.request.meta['district']
            yield request
Example 14
 def process(self, response):
     if response.m_response:
         soup = bs(response.m_response.content, "lxml")
         total_page = int(
             soup.select_one(
                 "a.next.page-numbers").find_previous_sibling().text)
         for page in range(1, total_page + 1):
             yield Request(url="http://www.mzitu.com/xinggan/page/" +
                           str(page),
                           callback=self.get_page_content)
Example 15
 def process_page_1(self, response):
     soup = bs(response.m_response.content, 'lxml')
     district_list = soup.select('div.qxName a')
     district_list.pop(0)
     for district in district_list:
         district_name = district.text
         url = response.request.url + district['href']
         request = Request(url=url, priority=2, callback=self.process_page_2)
         request.meta['city'] = response.request.meta['city']
         request.meta['district'] = district_name
         yield request
Example 16
    def process(self, response):
        soup = bs(response.m_response.content, 'lxml')

        page_list = soup.select('div.zxpage a')
        total_page = int(page_list[-2].text)
        for page in range(1, total_page + 1):
            yield Request(url='http://www.zhuwang.cc/list-58-%d.html' % page,
                          callback=self.process_page,
                          priority=0,
                          duplicate_remove=False)
Example 17
class FirstProcessor(BaseProcessor):
    spider_id = 'test'
    spider_name = 'test'
    allowed_domains = ['mzitu.com']
    start_requests = [Request(url="http://www.mzitu.com/")]

    def process(self, response):
        soup = bs(response.m_response.content, 'lxml')
        a_list = soup.select("a")
        for a in a_list:
            if "href" in a.attrs:
                url = response.nice_join(a["href"])
                yield {'url': url}
Example 18
 def process_page_1(self, response):
     brand_list = list(
         json.loads(response.m_response.content.decode('gb2312')))
     for brand in brand_list:
         brand_dict = dict(brand)
         brand_name = brand_dict['name']
         url = response.nice_join(brand_dict['url']) + '/'
         request = Request(url=url,
                           priority=2,
                           callback=self.process_page_2)
         request.meta['province'] = response.request.meta['province']
         request.meta['city'] = response.request.meta['city']
         request.meta['brand'] = brand_name
         yield request
Example 19
class FeProcessor(BaseProcessor):
    spider_id = 'fe'
    spider_name = 'fe'
    allowed_domains = ['58.com']
    start_requests = [Request(url='http://www.58.com/daikuan/changecity/')]

    rules = (
        Rule(LinkExtractor(regex_str=r"http://[a-z]*?.58.com/daikuan/"), priority=0),
        Rule(LinkExtractor(regex_str=r"/daikuan/pn\d+/"), priority=1),
        Rule(LinkExtractor(css_str="table.small-tbimg a.t"), priority=3, callback='save'),
    )

    def save(self, response):
        if response.m_response:
            print(bs(response.m_response.content, 'lxml').title.string)
Example 20
 def process_page_2(self, response):
     soup = bs(response.m_response.content, 'lxml')
     cars_line_list = soup.select(
         'div#series div.content-area dl.model-list dd a')
     for cars_line in cars_line_list:
         cars_line_name = cars_line.text
         url = 'http://www.che168.com' + cars_line['href']
         request = Request(url=url,
                           priority=3,
                           callback=self.process_page_3)
         request.meta['province'] = response.request.meta['province']
         request.meta['city'] = response.request.meta['city']
         request.meta['brand'] = response.request.meta['brand']
         request.meta['cars_line'] = cars_line_name
         yield request
Example 21
class MezituProcessor(BaseProcessor):
    spider_id = 'mzitu'
    spider_name = 'mzitu'
    allowed_domains = ['mzitu.com', 'meizitu.net']
    start_requests = [Request(url='http://www.mzitu.com/xinggan')]

    def process(self, response):
        if response.m_response:
            soup = bs(response.m_response.content, "lxml")
            total_page = int(
                soup.select_one(
                    "a.next.page-numbers").find_previous_sibling().text)
            for page in range(1, total_page + 1):
                yield Request(url="http://www.mzitu.com/xinggan/page/" +
                              str(page),
                              callback=self.get_page_content)

    def get_page_content(self, response):
        if response.m_response:
            soup = bs(response.m_response.content, 'lxml')
            li_list = soup.select("div.postlist ul#pins li")
            for li in li_list:
                yield Request(url=li.select_one("a").attrs["href"],
                              callback=self.get_pic,
                              priority=1)

    def get_pic(self, response):
        if response.m_response:
            li_soup = bs(response.m_response.content, "lxml")
            next_link = li_soup.find(
                lambda tag: tag.name == 'a' and '下一页»' in tag.text)
            if next_link is not None:
                total_page = int(next_link.find_previous_sibling().text)
                for page in range(1, total_page + 1):
                    yield Request(url=response.request.url + "/" + str(page),
                                  callback=self.download_pic,
                                  priority=2)

    def download_pic(self, response):
        if response.m_response:
            href = bs(response.m_response.content,
                      "lxml").select_one("div.main-image img").attrs["src"]
            yield Request(url=href, callback=self.download, priority=3)

    def download(self, response):
        if response.m_response:
            if response.m_response.status_code == 200:
                yield response.m_response.content
Example 22
 def process(self, response):
     soup = bs(response.m_response.content, 'lxml')
     category1 = soup.select('div.navlink')
     for category in category1:
         category1_name = category.select('div.title h2')[0].text
         category_2 = category.select('ul.topic li a')
         for category_2_one in category_2:
             url = response.nice_join(category_2_one['href']) + '/'
             category_2_name = category_2_one.text
             request = Request(url=url,
                               priority=1,
                               callback=self.process_page_1)
             request.meta['city_name'] = response.request.meta['city_name']
             request.meta['category1_name'] = category1_name
             request.meta['category2_name'] = category_2_name
             yield request
Example 23
 def process(self, response):
     rules = getattr(self, 'rules', ())
     for rule in rules:
         links = rule.link_extractor.extract_links(response)
         if links:
             for link in links:
                 request = Request(url=link,
                                   callback=rule.callback,
                                   priority=rule.priority)
                 request = rule.process_request(request)
                 yield request
                 if rule.only_first:
                     break
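
The `Rule` objects this loop consumes are never defined on this page; a minimal sketch of a container exposing the attributes read above, assuming `process_request` defaults to the identity:

class Rule(object):
    # Assumed sketch: plain holder for the fields process() reads.
    def __init__(self, link_extractor, callback=None, priority=0,
                 only_first=False, process_request=None):
        self.link_extractor = link_extractor
        self.callback = callback
        self.priority = priority
        self.only_first = only_first
        # Identity hook by default, so rule.process_request(request)
        # always hands back a Request.
        self.process_request = process_request or (lambda request: request)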
Example 24
 def process(self, response):
     if not response.m_response:
         logger.error(response.request.url)
         yield response.request
         return
     if '<script>window.location.href=' in response.m_response.content:
         logger.error(response.m_response.content + "\n" + response.request.url)
         yield response.request
         return
     soup = bs(response.m_response.content, "lxml")
     province_list = soup.select_one("dl#provinceOld").select("div.pull-left")[1].select("dd a")
     for province in province_list:
         province_name = province.string.strip()
         province_id = province["data-value"].strip()
         request = Request(
                 url="http://www.qichacha.com/search_getCityListHtml?province=" + province_id + "&q_type=1",
                 callback="get_city", priority=0)
         request.meta["province_name"] = province_name
         request.meta["province_id"] = province_id
         yield request
Example 25
 def process(self, response):
     soup = bs(response.m_response.content, 'lxml')
     province_list = {u'山西'}
     province_div_list = soup.select('div#c02 ul li')
     for province_div in province_div_list:
         province_name = province_div.select('strong')[0].text
         if province_name != '其他':
             if province_name in province_list:
                 city_list = province_div.select('a')
                 for city in city_list:
                     city_name = city.text
                     url = city['href']
                     request = Request(url=url,
                                       priority=1,
                                       callback=self.process_page_1)
                     request.meta['province'] = province_name
                     request.meta['city'] = city_name
                     yield request
Example 26
 def process(self, response):
     soup = bs(response.m_response.content, 'lxml')
     province_div_list = soup.select(
         'div.city-list div.cap-city > div.fn-clear')
     for province_div in province_div_list:
         province_name = province_div.select('span.capital a')[0].text
         city_list = province_div.select('div.city a')
         for city in city_list:
             city_name = city.text
             pinyin = city['href'].strip('/').split('/')[0]
             request = Request(
                 url='http://www.che168.com/handler/usedcarlistv5.ashx?action=brandlist&area=%s' % pinyin,
                 priority=1,
                 callback=self.process_page_1)
             request.meta['province'] = province_name
             request.meta['city'] = city_name
             yield request
Example 27
    def process_page_2(self, response):
        soup = bs(response.m_response.content, 'lxml')
        avg_price_list = soup.select('div.newcardR dl')
        if len(avg_price_list) > 1:  # index 1 is read below
            avg_price = avg_price_list[1].select('dd b')[0].text
        else:
            avg_price = '未知'
        detail_list = soup.select('div.houseList dl')
        for detail in detail_list:
            if len(detail.select('p.mt10 a span')) != 0:
                estate = detail.select('p.mt10 a span')[0].text
                area = detail.select('div.area p')[0].text.replace('㎡', '')
                layout = detail.select('p.mt12')[0].text.split('|')[0].strip()
                total_price = detail.select(
                    'div.moreInfo p.mt5 span.price')[0].text
                crawl_date = time.strftime('%Y-%m-%d',
                                           time.localtime(time.time()))
                item = dict()
                item['avg_price'] = avg_price
                item['estate'] = estate
                item['area'] = area
                item['layout'] = layout
                item['total_price'] = total_price
                item['crawl_date'] = crawl_date

                item['province'] = response.request.meta['province']
                item['city'] = response.request.meta['city']
                item['district'] = response.request.meta['district']
                item['url'] = response.request.url
                yield item

        next_page = soup.select('a#PageControl1_hlk_next')
        if len(next_page) > 0:
            url = response.nice_join(next_page[0]['href'])
            request = Request(url=url,
                              priority=2,
                              callback=self.process_page_2)
            request.meta['province'] = response.request.meta['province']
            request.meta['city'] = response.request.meta['city']
            request.meta['district'] = response.request.meta['district']
            yield request
Example 28
class CityLocationProcessor(BaseProcessor):
    spider_id = 'city'
    spider_name = 'city'
    allowed_domains = ['supfree.net']
    start_requests = [Request(url='http://jingwei.supfree.net/')]

    rules = (
        Rule(LinkExtractor(regex_str=r"kongzi\.asp\?id=\d+"), priority=0),
        Rule(LinkExtractor(regex_str=r"mengzi\.asp\?id=\d+"),
             priority=1,
             only_first=True,
             callback='save'),
    )

    def save(self, response):
        if response.m_response:
            soup = bs(response.m_response.content, 'lxml')
            name = soup.select("div.cdiv p")[0].string.strip().split(' ')
            if len(name) > 2:
                province = name[0]
                city = name[1]
                area = name[2]
            elif len(name) > 1:
                province = name[0]
                city = name[0]
                area = name[1]
            else:
                province = name[0]
                city = name[0]
                area = name[0]
            lo = soup.select("div.cdiv p")[1].select("span")[0].string.strip()
            la = soup.select("div.cdiv p")[1].select("span")[1].string.strip()
            data = ','.join([province, city, area, lo, la])
            print(data)
            with open('city.txt', 'a+') as fs:
                fs.write(data + '\n')
Example 29
 def download_pic(self, response):
     if response.m_response:
         href = bs(response.m_response.content, "lxml").select_one("div.main-image img").attrs["src"]
         yield Request(url=href, callback=self.download, priority=3)
Example 30
class Fang_Processor(BaseProcessor):
    spider_id = 'fang_spider'
    spider_name = 'fang_spider'
    allowed_domains = ['fang.com']
    start_requests = [
        Request(url='http://esf.gz.fang.com/newsecond/esfcities.aspx',
                priority=0)
    ]

    @checkResponse
    def process(self, response):
        soup = bs(response.m_response.content, 'lxml')
        province_list = {u'山西'}
        province_div_list = soup.select('div#c02 ul li')
        for province_div in province_div_list:
            province_name = province_div.select('strong')[0].text
            if province_name != '其他':
                if province_name in province_list:
                    city_list = province_div.select('a')
                    for city in city_list:
                        city_name = city.text
                        url = city['href']
                        request = Request(url=url,
                                          priority=1,
                                          callback=self.process_page_1)
                        request.meta['province'] = province_name
                        request.meta['city'] = city_name
                        yield request

    @checkResponse
    def process_page_1(self, response):
        soup = bs(response.m_response.content, 'lxml')
        district_list = soup.select('div.qxName a')
        district_list.pop(0)
        for district in district_list:
            district_name = district.text
            url = response.request.url + district['href']
            request = Request(url=url,
                              priority=2,
                              callback=self.process_page_2)
            request.meta['province'] = response.request.meta['province']
            request.meta['city'] = response.request.meta['city']
            request.meta['district'] = district_name
            yield request

    @checkResponse
    def process_page_2(self, response):
        soup = bs(response.m_response.content, 'lxml')
        avg_price_list = soup.select('div.newcardR dl')
        if len(avg_price_list) > 1:  # index 1 is read below
            avg_price = avg_price_list[1].select('dd b')[0].text
        else:
            avg_price = '未知'
        detail_list = soup.select('div.houseList dl')
        for detail in detail_list:
            if len(detail.select('p.mt10 a span')) != 0:
                estate = detail.select('p.mt10 a span')[0].text
                area = detail.select('div.area p')[0].text.replace('㎡', '')
                layout = detail.select('p.mt12')[0].text.split('|')[0].strip()
                total_price = detail.select(
                    'div.moreInfo p.mt5 span.price')[0].text
                crawl_date = time.strftime('%Y-%m-%d',
                                           time.localtime(time.time()))
                item = dict()
                item['avg_price'] = avg_price
                item['estate'] = estate
                item['area'] = area
                item['layout'] = layout
                item['total_price'] = total_price
                item['crawl_date'] = crawl_date

                item['province'] = response.request.meta['province']
                item['city'] = response.request.meta['city']
                item['district'] = response.request.meta['district']
                item['url'] = response.request.url
                yield item

        next_page = soup.select('a#PageControl1_hlk_next')
        if len(next_page) > 0:
            url = response.nice_join(next_page[0]['href'])
            request = Request(url=url,
                              priority=2,
                              callback=self.process_page_2)
            request.meta['province'] = response.request.meta['province']
            request.meta['city'] = response.request.meta['city']
            request.meta['district'] = response.request.meta['district']
            yield request
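
The `@checkResponse` decorator applied throughout this processor is not shown on this page; a plausible minimal sketch, assuming it wraps a generator callback and re-queues the request when the download failed, which is the guard that Examples 1, 4 and 24 write out by hand:

import logging
from functools import wraps

logger = logging.getLogger(__name__)

def checkResponse(func):
    # Assumed sketch: if there is no downloaded response, log the URL and
    # retry the request instead of running the callback body.
    @wraps(func)
    def wrapper(self, response):
        if not response.m_response:
            logger.error(response.request.url)
            yield response.request
        else:
            for item in func(self, response):
                yield item
    return wrapper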