Example #1
    def process_page_2(self, response):
        soup = bs(response.m_response.content, 'lxml')
        detail_list = soup.select('div.houseList dl')
        for detail in detail_list:
            estate = detail.select('p.mt15 span.spName')[0].text
            detail_str = detail.select('p.mt10')[0].text

            temp_list = detail_str.split('/')
            temp_list = [temp.strip() for temp in temp_list]

            if '购物中心/百货' not in detail_str and '层' in detail_str:
                m_type = temp_list[0].replace('类型:', '')
                floor = temp_list[1]
                total_floor = temp_list[2].replace('层', '')
            elif '购物中心/百货' not in detail_str and '层' not in detail_str:
                m_type = temp_list[0].strip().replace('类型:', '')
                floor = '未知'
                total_floor = '未知'
            elif '购物中心/百货' in detail_str and '层' not in detail_str:
                m_type = temp_list[0].replace('类型:', '') + temp_list[1]
                floor = '未知'
                total_floor = '未知'
            elif '购物中心/百货' in detail_str and '层' in detail_str:
                m_type = temp_list[0].replace('类型:', '') + temp_list[1]
                floor = temp_list[2]
                total_floor = temp_list[3].replace('层', '')
            else:
                logger.error('unexpected detail_str: ' + detail_str.strip())
                continue

            area = detail.select('div.area')[0].text.replace('㎡', '').replace(
                '建筑面积', '')
            total_price = detail.select(
                'div.moreInfo p.mt5 span.price')[0].text
            crawl_date = time.strftime('%Y-%m-%d', time.localtime(time.time()))

            item = dict()
            item['estate'] = estate
            item['floor'] = floor
            item['total_floor'] = total_floor
            item['type'] = m_type
            item['area'] = area
            item['total_price'] = total_price
            item['crawl_date'] = crawl_date

            item['city'] = response.request.meta['city']
            item['district'] = response.request.meta['district']
            item['url'] = response.request.url
            yield item

        next_page = soup.select('a#PageControl1_hlk_next')
        if len(next_page) > 0:
            url = response.nice_join(next_page[0]['href']) + '/'
            request = Request(url=url,
                              priority=2,
                              callback=self.process_page_2)
            request.meta['city'] = response.request.meta['city']
            request.meta['district'] = response.request.meta['district']
            yield request
Example #2
 def process_page_1(self, response):
     brand_list = list(json.loads(response.m_response.content.decode('gb2312')))
     for brand in brand_list:
         brand_dict = dict(brand)
         brand_name = brand_dict['name']
         url = response.nice_join(brand_dict['url']) + '/'
         request = Request(url=url, priority=2, callback=self.process_page_2)
         request.meta['province'] = response.request.meta['province']
         request.meta['city'] = response.request.meta['city']
         request.meta['brand'] = brand_name
         yield request
Example #3
 def process_page_1(self, response):
     soup = bs(response.m_response.content, 'lxml')
     district_list = soup.select('div.qxName a')
     district_list.pop(0)
     for district in district_list:
         district_name = district.text
         url = response.request.url + district['href']
         request = Request(url=url, priority=2, callback=self.process_page_2)
         request.meta['province'] = response.request.meta['province']
         request.meta['city'] = response.request.meta['city']
         request.meta['district'] = district_name
         yield request
Example #4
 def process(self, response):
     soup = bs(response.m_response.content, 'lxml')
     province_div_list = soup.select('div.city-list div.cap-city > div.fn-clear')
     for province_div in province_div_list:
         province_name = province_div.select('span.capital a')[0].text
         city_list = province_div.select('div.city a')
         for city in city_list:
             city_name = city.text
             pinyin = city['href'].strip('/').split('/')[0]
             request = Request(
                     url='http://www.che168.com/handler/usedcarlistv5.ashx?action=brandlist&area=%s' % pinyin,
                     priority=1, callback=self.process_page_1)
             request.meta['province'] = province_name
             request.meta['city'] = city_name
             yield request
Example #5
def request_from_dict(d, processor=None):
    """Create Request object from a dict.

    If a processor is given, it will try to resolve the callbacks by looking
    up methods of the same name on the processor.
    """
    cb = d['callback']
    if cb and processor:
        cb = _get_method(processor, cb)
    eb = d['errback']
    if eb and processor:
        eb = _get_method(processor, eb)
    return Request(
            url=to_native_str(d['url']),
            data=d['data'],
            json=d['json'],
            allow_redirects=d['allow_redirects'],
            duplicate_remove=d['duplicate_remove'],
            timeout=d['timeout'],
            callback=cb,
            errback=eb,
            method=d['method'],
            headers=d['headers'],
            cookies=d['cookies'],
            meta=d['meta'],
            priority=d['priority'])
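A minimal usage sketch, assuming the fields shown above: the dict mirrors every key request_from_dict reads; the example values and my_processor are hypothetical.

# Hypothetical serialized request; every key below matches a d[...] access
# in request_from_dict above.
d = {
    'url': 'http://example.com/',
    'data': None,
    'json': None,
    'allow_redirects': True,
    'duplicate_remove': True,
    'timeout': 10,
    'callback': 'process',      # resolved to a processor method if one is given
    'errback': None,
    'method': 'GET',
    'headers': None,
    'cookies': None,
    'meta': {'city': 'demo'},
    'priority': 0,
}
# request = request_from_dict(d, processor=my_processor)  # my_processor: hypothetical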
Example #6
class FirstProcessor(BaseProcessor):
    spider_id = 'test'
    spider_name = 'test'
    allowed_domains = ['mzitu.com']
    start_requests = [Request(url="http://www.mzitu.com/")]

    def process(self, response):
        soup = bs(response.m_response.content, 'lxml')
        a_list = soup.select("a")
        for a in a_list:
            if "href" in a.attrs:
                url = response.nice_join(a["href"])
                if response.is_url(url):
                    yield Request(url=url, callback=self.process2)

    def process2(self, response):
        if response.m_response:
            soup = bs(response.m_response.content, 'lxml')
            yield soup.title
            a_list = soup.select("a")
            for a in a_list:
                if "href" in a.attrs:
                    url = response.nice_join(a["href"])
                    if response.is_url(url):
                        yield Request(url=url, callback=self.process2)
        else:
            print(response.request.url)
Example #7
 def process(self, response):
     soup = bs(response.m_response.content, 'lxml')
     province_list = {u'四川', u'江苏', u'江西', u'山东', u'广东', u'山西'}
     province_div_list = soup.select('div#c02 ul li')
     for province_div in province_div_list:
         province_name = province_div.select('strong')[0].text
         if province_name != '其他':
             if province_name in province_list:
                 city_list = province_div.select('a')
                 for city in city_list:
                     city_name = city.text
                     url = city['href']
                     request = Request(url=url, priority=1, callback=self.process_page_1)
                     request.meta['province'] = province_name
                     request.meta['city'] = city_name
                     yield request
Example #8
 def process(self, response):
     soup = bs(response.m_response.content, 'lxml')
     a_list = soup.select("a")
     for a in a_list:
         if "href" in a.attrs:
             url = response.nice_join(a["href"])
             if response.is_url(url):
                 yield Request(url=url, callback=self.process2)
Example #9
 def get_page_content(self, response):
     if response.m_response:
         soup = bs(response.m_response.content, 'lxml')
         li_list = soup.select("div.postlist ul#pins li")
         for li in li_list:
             yield Request(url=li.select_one("a").attrs["href"],
                           callback=self.get_pic,
                           priority=1)
Example #10
 def process(self, response):
     if response.m_response:
         soup = bs(response.m_response.content, "lxml")
         total_page = int(
             soup.select_one(
                 "a.next.page-numbers").find_previous_sibling().text)
         for page in range(1, total_page + 1):
             yield Request(url="http://www.mzitu.com/xinggan/page/" +
                           str(page),
                           callback=self.get_page_content)
Example #11
 def get_pic(self, response):
     if response.m_response:
         li_soup = bs(response.m_response.content, "lxml")
         next_link = li_soup.find(
             lambda tag: tag.name == 'a' and '下一页»' in tag.text)
         if next_link is not None:
             total_page = int(next_link.find_previous_sibling().text)
             for page in range(1, total_page + 1):
                 yield Request(url=response.request.url + "/" + str(page),
                               callback=self.download_pic,
                               priority=2)
Example #12
 def process(self, response):
     if not response.m_response:
         logger.error(response.request.url)
         yield response.request
         return
     if '<script>window.location.href=' in response.m_response.content:
         logger.error(response.m_response.content + "\n" +
                      response.request.url)
         yield response.request
         return
     soup = bs(response.m_response.content, "lxml")
     province_list = soup.select_one("dl#provinceOld").select(
         "div.pull-left")[1].select("dd a")
     for province in province_list:
         province_name = province.string.strip()
         province_id = province["data-value"].strip()
         request = Request(
             url="http://www.qichacha.com/search_getCityListHtml?province="
             + province_id + "&q_type=1",
             callback="get_city",
             priority=0)
         request.meta["province_name"] = province_name
         request.meta["province_id"] = province_id
         yield request
Example #13
 def process(self, response):
     rules = getattr(self, 'rules', ())
     for rule in rules:
         links = rule.link_extractor.extract_links(response)
         if links:
             for link in links:
                 request = Request(url=link, callback=rule.callback, priority=rule.priority)
                 request = rule.process_request(request)
                 yield request
                 if rule.only_first:
                     break
Example #14
 def process(self, response):
     city_crawl_list = {
         u'成都', u'南京', u'苏州', u'无锡', u'南昌', u'济南', u'青岛', u'广州', u'东莞'
     }
     soup = bs(
         '''<a href="http://shop1.fang.com/" style="width:40px;padding:4px 0 4px 8px;">北京</a>
                  <a href="http://shop.sh.fang.com/" style="width:40px;padding:4px 0 4px 8px;">上海</a>
                  <a href="http://shop.gz.fang.com/" style="width:40px;padding:4px 0 4px 8px;">广州</a>
                  <a href="http://shop.sz.fang.com/" style="width:40px;padding:4px 0 4px 8px;">深圳</a>
                  <a href="http://shop.tj.fang.com/" style="width:40px;padding:4px 0 4px 8px;">天津</a>
                  <a href="http://shop.cq.fang.com/" style="width:40px;padding:4px 0 4px 8px;">重庆</a>
                  <a href="http://shop.cd.fang.com/" style="width:40px;padding:4px 0 4px 8px;">成都</a>
                  <a href="http://shop.suzhou.fang.com/" style="width:40px;padding:4px 0 4px 8px;">苏州</a>
                  <a href="http://shop.wuhan.fang.com/" style="width:40px;padding:4px 0 4px 8px;">武汉</a>
                  <a href="http://shop.xian.fang.com/" style="width:40px;padding:4px 0 4px 8px;">西安</a>
                  <a href="http://shop.dg.fang.com/" style="width:40px;padding:4px 0 4px 8px;">东莞</a>
                  <a href="http://shop.km.fang.com/" style="width:40px;padding:4px 0 4px 8px;">昆明</a>
                  <a href="http://shop.hz.fang.com/" style="width:40px;padding:4px 0 4px 8px;">杭州</a>
                  <a href="http://shop.jn.fang.com/" style="width:40px;padding:4px 0 4px 8px;">济南</a>
                  <a href="http://shop.wuxi.fang.com/" style="width:40px;padding:4px 0 4px 8px;">无锡</a>
                  <a href="http://shop.zz.fang.com/" style="width:40px;padding:4px 0 4px 8px;">郑州</a>
                  <a href="http://shop.nc.fang.com/" style="width:40px;padding:4px 0 4px 8px;">南昌</a>
                  <a href="http://shop.qd.fang.com/" style="width:40px;padding:4px 0 4px 8px;">青岛</a>
                  <a href="http://shop.sjz.fang.com/" style="width:40px;padding:4px 0 4px 8px;">石家庄</a>
                  <a href="http://shop.nanjing.fang.com/" style="width:40px;padding:4px 0 4px 8px;">南京</a>
                  <a href="http://shop.dl.fang.com/" style="width:40px;padding:4px 0 4px 8px;">大连</a>''',
         'lxml')
     city_list = soup.select('a')
     for city in city_list:
         city_name = city.text
         if city_name in city_crawl_list:
             url = city['href']
             request = Request(url=url,
                               priority=1,
                               callback=self.process_page_1)
             request.meta['city'] = city_name
             yield request
Example #15
    def process_page_2(self, response):
        soup = bs(response.m_response.content, 'lxml')
        avg_price_list = soup.select('div.newcardR dl')
        if len(avg_price_list) > 0:
            avg_price = avg_price_list[1].select('dd b')[0].text
        else:
            avg_price = '未知'
        detail_list = soup.select('div.houseList dl')
        for detail in detail_list:
            if len(detail.select('p.mt10 a span')) != 0:
                estate = detail.select('p.mt10 a span')[0].text
                area = detail.select('div.area p')[0].text.replace('㎡', '')
                layout = detail.select('p.mt12')[0].text.split('|')[0].strip()
                total_price = detail.select('div.moreInfo p.mt5 span.price')[0].text
                crawl_date = time.strftime('%Y-%m-%d', time.localtime(time.time()))
                item = dict()
                item['avg_price'] = avg_price
                item['estate'] = estate
                item['area'] = area
                item['layout'] = layout
                item['total_price'] = total_price
                item['crawl_date'] = crawl_date

                item['province'] = response.request.meta['province']
                item['city'] = response.request.meta['city']
                item['district'] = response.request.meta['district']
                item['url'] = response.request.url
                yield item

        next_page = soup.select('a#PageControl1_hlk_next')
        if len(next_page) > 0:
            url = response.nice_join(next_page[0]['href'])
            request = Request(url=url, priority=2, callback=self.process_page_2)
            request.meta['province'] = response.request.meta['province']
            request.meta['city'] = response.request.meta['city']
            request.meta['district'] = response.request.meta['district']
            yield request
Example #16
class MezituProcessor(BaseProcessor):
    spider_id = 'mzitu'
    spider_name = 'mzitu'
    allowed_domains = ['mzitu.com', 'meizitu.net']
    start_requests = [Request(url='http://www.mzitu.com/xinggan')]

    def process(self, response):
        if response.m_response:
            soup = bs(response.m_response.content, "lxml")
            total_page = int(
                soup.select_one(
                    "a.next.page-numbers").find_previous_sibling().text)
            for page in range(1, total_page + 1):
                yield Request(url="http://www.mzitu.com/xinggan/page/" +
                              str(page),
                              callback=self.get_page_content)

    def get_page_content(self, response):
        if response.m_response:
            soup = bs(response.m_response.content, 'lxml')
            li_list = soup.select("div.postlist ul#pins li")
            for li in li_list:
                yield Request(url=li.select_one("a").attrs["href"],
                              callback=self.get_pic,
                              priority=1)

    def get_pic(self, response):
        if response.m_response:
            li_soup = bs(response.m_response.content, "lxml")
            next_link = li_soup.find(
                lambda tag: tag.name == 'a' and '下一页»' in tag.text)
            if next_link is not None:
                total_page = int(next_link.find_previous_sibling().text)
                for page in range(1, total_page + 1):
                    yield Request(url=response.request.url + "/" + str(page),
                                  callback=self.download_pic,
                                  priority=2)

    def download_pic(self, response):
        if response.m_response:
            href = bs(response.m_response.content,
                      "lxml").select_one("div.main-image img").attrs["src"]
            yield Request(url=href, callback=self.download, priority=3)

    def download(self, response):
        if response.m_response:
            if response.m_response.status_code == 200:
                yield response.m_response.content
Example #17
 def process_page_2(self, response):
     soup = bs(response.m_response.content, 'lxml')
     cars_line_list = soup.select('div#series div.content-area dl.model-list dd a')
     for cars_line in cars_line_list:
         cars_line_name = cars_line.text
         url = 'http://www.che168.com' + cars_line['href']
         request = Request(url=url, priority=3, callback=self.process_page_3)
         request.meta['province'] = response.request.meta['province']
         request.meta['city'] = response.request.meta['city']
         request.meta['brand'] = response.request.meta['brand']
         request.meta['cars_line'] = cars_line_name
         yield request
Example #18
class FeProcessor(BaseProcessor):
    spider_id = 'fe'
    spider_name = 'fe'
    allowed_domains = ['58.com']
    start_requests = [Request(url='http://www.58.com/daikuan/changecity/')]

    rules = (
        Rule(LinkExtractor(regex_str=r"http://[a-z]*?.58.com/daikuan/"),
             priority=0),
        Rule(LinkExtractor(regex_str=r"/daikuan/pn\d+/"), priority=1),
        Rule(LinkExtractor(css_str="table.small-tbimg a.t"),
             priority=3,
             callback='save'),
    )

    def save(self, response):
        if response.m_response:
            print(bs(response.m_response.content, 'lxml').title.string)
Example #19
class MezituProcessor(BaseProcessor):
    spider_id = 'mzitu'
    spider_name = 'mzitu'
    allowed_domains = ['mzitu.com', 'meizitu.net']
    start_requests = [Request(url='http://www.mzitu.com/xinggan/')]

    rules = (
        Rule(LinkExtractor(regex_str=r"http://i.meizitu.net/\d{4}/\d{2}/[0-9a-z]+.jpg"),
             callback="save", priority=3),
        Rule(LinkExtractor(regex_str=r"http://www.mzitu.com/\d+"), priority=1),
        Rule(LinkExtractor(regex_str=r"http://www.mzitu.com/\d+/\d+"), priority=2),
        Rule(LinkExtractor(regex_str=r"http://www.mzitu.com/xinggan/page/\d+"), priority=0),
    )

    def save(self, response):
        if response.m_response:
            if not os.path.exists("img"):
                os.mkdir("img")
            with open("img/" + str(uuid.uuid1()) + ".jpg", 'wb') as fs:
                fs.write(response.m_response.content)
                print("download success!")
Example #20
class CityLocationProcessor(BaseProcessor):
    spider_id = 'city'
    spider_name = 'city'
    allowed_domains = ['supfree.net']
    start_requests = [Request(url='http://jingwei.supfree.net/')]

    rules = (
        Rule(LinkExtractor(regex_str=r"kongzi\.asp\?id=\d+"), priority=0),
        Rule(LinkExtractor(regex_str=r"mengzi\.asp\?id=\d+"),
             priority=1,
             only_first=True,
             callback='save'),
    )

    def save(self, response):
        if response.m_response:
            soup = bs(response.m_response.content, 'lxml')
            name = soup.select("div.cdiv p")[0].string.strip().split(' ')
            if len(name) > 2:
                province = name[0]
                city = name[1]
                area = name[2]
            elif len(name) > 1:
                province = name[0]
                city = name[0]
                area = name[1]
            else:
                province = name[0]
                city = name[0]
                area = name[0]
            lo = soup.select("div.cdiv p")[1].select("span")[0].string.strip()
            la = soup.select("div.cdiv p")[1].select("span")[1].string.strip()
            data = province + ',' + city + ',' + area + ',' + lo + ',' + la
            print(data)
            with open('city.txt', 'a+') as fs:
                fs.write(data + '\n')
Example #21
    def get_all_page(self, response):
        if not response.m_response:
            logger.error(response.request.url)
            yield response.request
            return
        if '<script>window.location.href=' in response.m_response.content:
            logger.error(response.m_response.content + "\n" +
                         response.request.url)
            yield response.request
        else:
            soup = bs(response.m_response.content, "lxml")
            try:
                next_a = soup.find(
                    lambda tag: tag.name == 'a' and '>' == tag.text)
                temp_page = next_a.parent.findNextSibling()
                if temp_page:
                    page = temp_page.select_one("a")
                    if page:
                        total_page = int(page.string.strip().replace(
                            "...", ""))
                    else:
                        total_page = 1
                else:
                    temp_page = next_a.parent.findPreviousSibling()
                    if temp_page:
                        page = temp_page.select_one("a")
                        if page:
                            total_page = int(page.string.strip().replace(
                                "...", ""))
                        else:
                            total_page = 1
                    else:
                        total_page = 1
            except Exception:
                total_page = 1

            now_page = 1
            while now_page <= total_page:
                if response.request.meta["city_id"] == "":
                    request = Request(
                        url=
                        "http://www.qichacha.com/search_index?key=%25E5%25B0%258F%25E9%25A2%259D%25E8%25B4%25B7%25E6%25AC%25BE&ajaxflag=1&province="
                        + response.request.meta["province_id"] + "&p=" +
                        str(now_page) + "&",
                        callback="get_content",
                        priority=2)
                    request.meta["city_name"] = response.request.meta[
                        "city_name"]
                    request.meta["city_id"] = response.request.meta["city_id"]
                    request.meta["province_name"] = response.request.meta[
                        "province_name"]
                    request.meta["province_id"] = response.request.meta[
                        "province_id"]
                    yield request
                else:
                    request = Request(
                        url=
                        "http://www.qichacha.com/search_index?key=%25E5%25B0%258F%25E9%25A2%259D%25E8%25B4%25B7%25E6%25AC%25BE&ajaxflag=1&p="
                        + str(now_page) + "&province=" +
                        response.request.meta["province_id"] + "&city=" +
                        response.request.meta["city_id"] + "&",
                        callback="get_content",
                        priority=2)
                    request.meta["city_name"] = response.request.meta[
                        "city_name"]
                    request.meta["city_id"] = response.request.meta["city_id"]
                    request.meta["province_name"] = response.request.meta[
                        "province_name"]
                    request.meta["province_id"] = response.request.meta[
                        "province_id"]
                    yield request
                now_page += 1
Example #22
class Fang_Processor(BaseProcessor):
    spider_id = 'fang_spider'
    spider_name = 'fang_spider'
    allowed_domains = ['fang.com']
    start_requests = [Request(url='http://esf.gz.fang.com/newsecond/esfcities.aspx', priority=0)]

    @checkResponse
    def process(self, response):
        soup = bs(response.m_response.content, 'lxml')
        province_list = {u'四川', u'江苏', u'江西', u'山东', u'广东', u'山西'}
        province_div_list = soup.select('div#c02 ul li')
        for province_div in province_div_list:
            province_name = province_div.select('strong')[0].text
            if province_name != '其他':
                if province_name in province_list:
                    city_list = province_div.select('a')
                    for city in city_list:
                        city_name = city.text
                        url = city['href']
                        request = Request(url=url, priority=1, callback=self.process_page_1)
                        request.meta['province'] = province_name
                        request.meta['city'] = city_name
                        yield request

    @checkResponse
    def process_page_1(self, response):
        soup = bs(response.m_response.content, 'lxml')
        district_list = soup.select('div.qxName a')
        district_list.pop(0)
        for district in district_list:
            district_name = district.text
            url = response.request.url + district['href']
            request = Request(url=url, priority=2, callback=self.process_page_2)
            request.meta['province'] = response.request.meta['province']
            request.meta['city'] = response.request.meta['city']
            request.meta['district'] = district_name
            yield request

    @checkResponse
    def process_page_2(self, response):
        soup = bs(response.m_response.content, 'lxml')
        avg_price_list = soup.select('div.newcardR dl')
        if len(avg_price_list) > 0:
            avg_price = avg_price_list[1].select('dd b')[0].text
        else:
            avg_price = '未知'
        detail_list = soup.select('div.houseList dl')
        for detail in detail_list:
            if len(detail.select('p.mt10 a span')) != 0:
                estate = detail.select('p.mt10 a span')[0].text
                area = detail.select('div.area p')[0].text.replace('㎡', '')
                layout = detail.select('p.mt12')[0].text.split('|')[0].strip()
                total_price = detail.select('div.moreInfo p.mt5 span.price')[0].text
                crawl_date = time.strftime('%Y-%m-%d', time.localtime(time.time()))
                item = dict()
                item['avg_price'] = avg_price
                item['estate'] = estate
                item['area'] = area
                item['layout'] = layout
                item['total_price'] = total_price
                item['crawl_date'] = crawl_date

                item['province'] = response.request.meta['province']
                item['city'] = response.request.meta['city']
                item['district'] = response.request.meta['district']
                item['url'] = response.request.url
                yield item

        next_page = soup.select('a#PageControl1_hlk_next')
        if len(next_page) > 0:
            url = response.nice_join(next_page[0]['href'])
            request = Request(url=url, priority=2, callback=self.process_page_2)
            request.meta['province'] = response.request.meta['province']
            request.meta['city'] = response.request.meta['city']
            request.meta['district'] = response.request.meta['district']
            yield request
Example #23
 def get_city(self, response):
     if not response.m_response:
         logger.error(response.request.url)
         yield response.request
         return
     if '<script>window.location.href=' in response.m_response.content:
         logger.error(response.m_response.content + "\n" +
                      response.request.url)
         yield response.request
         return
     if response.m_response.content == "":
         request = Request(
             url=
             "http://www.qichacha.com/search_index?key=%25E5%25B0%258F%25E9%25A2%259D%25E8%25B4%25B7%25E6%25AC%25BE&ajaxflag=1&province="
             + response.request.meta["province_id"] + "&",
             callback="get_all_page",
             priority=1)
         request.meta["city_name"] = ""
         request.meta["city_id"] = ""
         request.meta["province_name"] = response.request.meta[
             "province_name"]
         request.meta["province_id"] = response.request.meta["province_id"]
         yield request
     else:
         soup = bs(response.m_response.content, "lxml")
         city_list = soup.select("a")
         for city in city_list:
             city_name = city.string.strip()
             city_id = city["data-value"].strip()
             request = Request(
                 url=
                 "http://www.qichacha.com/search_index?key=%25E5%25B0%258F%25E9%25A2%259D%25E8%25B4%25B7%25E6%25AC%25BE&ajaxflag=1&province="
                 + response.request.meta["province_id"] + "&city=" +
                 city_id + "&",
                 callback="get_all_page",
                 priority=1)
             request.meta["city_name"] = city_name
             request.meta["city_id"] = city_id
             request.meta["province_name"] = response.request.meta[
                 "province_name"]
             request.meta["province_id"] = response.request.meta[
                 "province_id"]
             yield request
Example #24
class QccProcessor(BaseProcessor):
    spider_id = 'qcc'
    spider_name = 'qcc'
    allowed_domains = ['qichacha.com']

    start_requests = [
        Request(
            url=
            'http://www.qichacha.com/search?key=%E5%B0%8F%E9%A2%9D%E8%B4%B7%E6%AC%BE'
        )
    ]

    def process(self, response):
        if not response.m_response:
            logger.error(response.request.url)
            yield response.request
            return
        if '<script>window.location.href=' in response.m_response.content:
            logger.error(response.m_response.content + "\n" +
                         response.request.url)
            yield response.request
            return
        soup = bs(response.m_response.content, "lxml")
        province_list = soup.select_one("dl#provinceOld").select(
            "div.pull-left")[1].select("dd a")
        for province in province_list:
            province_name = province.string.strip()
            province_id = province["data-value"].strip()
            request = Request(
                url="http://www.qichacha.com/search_getCityListHtml?province="
                + province_id + "&q_type=1",
                callback="get_city",
                priority=0)
            request.meta["province_name"] = province_name
            request.meta["province_id"] = province_id
            yield request

    def get_city(self, response):
        if not response.m_response:
            logger.error(response.request.url)
            yield response.request
            return
        if '<script>window.location.href=' in response.m_response.content:
            logger.error(response.m_response.content + "\n" +
                         response.request.url)
            yield response.request
            return
        if response.m_response.content == "":
            request = Request(
                url=
                "http://www.qichacha.com/search_index?key=%25E5%25B0%258F%25E9%25A2%259D%25E8%25B4%25B7%25E6%25AC%25BE&ajaxflag=1&province="
                + response.request.meta["province_id"] + "&",
                callback="get_all_page",
                priority=1)
            request.meta["city_name"] = ""
            request.meta["city_id"] = ""
            request.meta["province_name"] = response.request.meta[
                "province_name"]
            request.meta["province_id"] = response.request.meta["province_id"]
            yield request
        else:
            soup = bs(response.m_response.content, "lxml")
            city_list = soup.select("a")
            for city in city_list:
                city_name = city.string.strip()
                city_id = city["data-value"].strip()
                request = Request(
                    url=
                    "http://www.qichacha.com/search_index?key=%25E5%25B0%258F%25E9%25A2%259D%25E8%25B4%25B7%25E6%25AC%25BE&ajaxflag=1&province="
                    + response.request.meta["province_id"] + "&city=" +
                    city_id + "&",
                    callback="get_all_page",
                    priority=1)
                request.meta["city_name"] = city_name
                request.meta["city_id"] = city_id
                request.meta["province_name"] = response.request.meta[
                    "province_name"]
                request.meta["province_id"] = response.request.meta[
                    "province_id"]
                yield request

    def get_all_page(self, response):
        if not response.m_response:
            logger.error(response.request.url)
            yield response.request
            return
        if '<script>window.location.href=' in response.m_response.content:
            logger.error(response.m_response.content + "\n" +
                         response.request.url)
            yield response.request
        else:
            soup = bs(response.m_response.content, "lxml")
            try:
                next_a = soup.find(
                    lambda tag: tag.name == 'a' and '>' == tag.text)
                temp_page = next_a.parent.findNextSibling()
                if temp_page:
                    page = temp_page.select_one("a")
                    if page:
                        total_page = int(page.string.strip().replace(
                            "...", ""))
                    else:
                        total_page = 1
                else:
                    temp_page = next_a.parent.findPreviousSibling()
                    if temp_page:
                        page = temp_page.select_one("a")
                        if page:
                            total_page = int(page.string.strip().replace(
                                "...", ""))
                        else:
                            total_page = 1
                    else:
                        total_page = 1
            except Exception:
                total_page = 1

            now_page = 1
            while now_page <= total_page:
                if response.request.meta["city_id"] == "":
                    request = Request(
                        url=
                        "http://www.qichacha.com/search_index?key=%25E5%25B0%258F%25E9%25A2%259D%25E8%25B4%25B7%25E6%25AC%25BE&ajaxflag=1&province="
                        + response.request.meta["province_id"] + "&p=" +
                        str(now_page) + "&",
                        callback="get_content",
                        priority=2)
                    request.meta["city_name"] = response.request.meta[
                        "city_name"]
                    request.meta["city_id"] = response.request.meta["city_id"]
                    request.meta["province_name"] = response.request.meta[
                        "province_name"]
                    request.meta["province_id"] = response.request.meta[
                        "province_id"]
                    yield request
                else:
                    request = Request(
                        url=
                        "http://www.qichacha.com/search_index?key=%25E5%25B0%258F%25E9%25A2%259D%25E8%25B4%25B7%25E6%25AC%25BE&ajaxflag=1&p="
                        + str(now_page) + "&province=" +
                        response.request.meta["province_id"] + "&city=" +
                        response.request.meta["city_id"] + "&",
                        callback="get_content",
                        priority=2)
                    request.meta["city_name"] = response.request.meta[
                        "city_name"]
                    request.meta["city_id"] = response.request.meta["city_id"]
                    request.meta["province_name"] = response.request.meta[
                        "province_name"]
                    request.meta["province_id"] = response.request.meta[
                        "province_id"]
                    yield request
                now_page += 1

    def get_content(self, response):
        if not response.m_response:
            logger.error(response.request.url)
            yield response.request
            return
        if '<script>window.location.href=' in response.m_response.content:
            logger.error(response.m_response.content + "\n" +
                         response.request.url)
            yield response.request
            return
        soup = bs(response.m_response.content, "lxml")
        content_list = soup.select("table.m_srchList tbody tr")
        for content in content_list:
            try:
                result_item = dict()
                result_item["province"] = response.request.meta[
                    "province_name"]
                result_item["city"] = response.request.meta["city_name"]
                result_item["company_name"] = content.select(
                    "td")[1].text.split('\n')[0].strip()
                result_item["company_man"] = content.select(
                    "td")[1].text.split('\n')[1].strip().replace("企业法人:", "")
                result_item["company_telephone"] = content.select(
                    "td")[1].text.split('\n')[2].strip().replace("联系方式:", "")
                result_item["company_address"] = content.select(
                    "td")[1].text.split('\n')[3].strip()
                if "地址:" in result_item["company_address"]:
                    result_item["company_address"] = result_item[
                        "company_address"].replace("地址:", "")
                else:
                    result_item["company_address"] = ""
                result_item["company_registered_capital"] = content.select(
                    "td")[2].text.strip()
                result_item["company_registered_time"] = content.select(
                    "td")[3].text.strip()
                result_item["company_status"] = content.select(
                    "td")[4].text.strip()
                result_item["source"] = "企查查"
                result_item["update_time"] = time.strftime(
                    '%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
                yield result_item
            except Exception:
                print(traceback.format_exc())
Example #25
 def download_pic(self, response):
     if response.m_response:
         href = bs(response.m_response.content,
                   "lxml").select_one("div.main-image img").attrs["src"]
         yield Request(url=href, callback=self.download, priority=3)
Example #26
class Fang_Shop_Processor(BaseProcessor):
    spider_id = 'fang_shop_spider'
    spider_name = 'fang_shop_spider'
    allowed_domains = ['fang.com']
    start_requests = [Request(url='http://shop.fang.com', priority=0)]

    @checkResponse
    def process(self, response):
        city_crawl_list = {
            u'成都', u'南京', u'苏州', u'无锡', u'南昌', u'济南', u'青岛', u'广州', u'东莞'
        }
        soup = bs(
            '''<a href="http://shop1.fang.com/" style="width:40px;padding:4px 0 4px 8px;">北京</a>
                     <a href="http://shop.sh.fang.com/" style="width:40px;padding:4px 0 4px 8px;">上海</a>
                     <a href="http://shop.gz.fang.com/" style="width:40px;padding:4px 0 4px 8px;">广州</a>
                     <a href="http://shop.sz.fang.com/" style="width:40px;padding:4px 0 4px 8px;">深圳</a>
                     <a href="http://shop.tj.fang.com/" style="width:40px;padding:4px 0 4px 8px;">天津</a>
                     <a href="http://shop.cq.fang.com/" style="width:40px;padding:4px 0 4px 8px;">重庆</a>
                     <a href="http://shop.cd.fang.com/" style="width:40px;padding:4px 0 4px 8px;">成都</a>
                     <a href="http://shop.suzhou.fang.com/" style="width:40px;padding:4px 0 4px 8px;">苏州</a>
                     <a href="http://shop.wuhan.fang.com/" style="width:40px;padding:4px 0 4px 8px;">武汉</a>
                     <a href="http://shop.xian.fang.com/" style="width:40px;padding:4px 0 4px 8px;">西安</a>
                     <a href="http://shop.dg.fang.com/" style="width:40px;padding:4px 0 4px 8px;">东莞</a>
                     <a href="http://shop.km.fang.com/" style="width:40px;padding:4px 0 4px 8px;">昆明</a>
                     <a href="http://shop.hz.fang.com/" style="width:40px;padding:4px 0 4px 8px;">杭州</a>
                     <a href="http://shop.jn.fang.com/" style="width:40px;padding:4px 0 4px 8px;">济南</a>
                     <a href="http://shop.wuxi.fang.com/" style="width:40px;padding:4px 0 4px 8px;">无锡</a>
                     <a href="http://shop.zz.fang.com/" style="width:40px;padding:4px 0 4px 8px;">郑州</a>
                     <a href="http://shop.nc.fang.com/" style="width:40px;padding:4px 0 4px 8px;">南昌</a>
                     <a href="http://shop.qd.fang.com/" style="width:40px;padding:4px 0 4px 8px;">青岛</a>
                     <a href="http://shop.sjz.fang.com/" style="width:40px;padding:4px 0 4px 8px;">石家庄</a>
                     <a href="http://shop.nanjing.fang.com/" style="width:40px;padding:4px 0 4px 8px;">南京</a>
                     <a href="http://shop.dl.fang.com/" style="width:40px;padding:4px 0 4px 8px;">大连</a>''',
            'lxml')
        city_list = soup.select('a')
        for city in city_list:
            city_name = city.text
            if city_name in city_crawl_list:
                url = city['href']
                request = Request(url=url,
                                  priority=1,
                                  callback=self.process_page_1)
                request.meta['city'] = city_name
                yield request

    @checkResponse
    def process_page_1(self, response):
        soup = bs(response.m_response.content, 'lxml')
        district_list = soup.select('div.qxName a')
        district_list.pop(0)
        for district in district_list:
            district_name = district.text
            url = response.request.url + district['href']
            request = Request(url=url,
                              priority=2,
                              callback=self.process_page_2)
            request.meta['city'] = response.request.meta['city']
            request.meta['district'] = district_name
            yield request

    @checkResponse
    def process_page_2(self, response):
        soup = bs(response.m_response.content, 'lxml')
        detail_list = soup.select('div.houseList dl')
        for detail in detail_list:
            estate = detail.select('p.mt15 span.spName')[0].text
            detail_str = detail.select('p.mt10')[0].text

            temp_list = detail_str.split('/')
            temp_list = [temp.strip() for temp in temp_list]

            if '购物中心/百货' not in detail_str and '层' in detail_str:
                m_type = temp_list[0].replace('类型:', '')
                floor = temp_list[1]
                total_floor = temp_list[2].replace('层', '')
            elif '购物中心/百货' not in detail_str and '层' not in detail_str:
                m_type = temp_list[0].strip().replace('类型:', '')
                floor = '未知'
                total_floor = '未知'
            elif '购物中心/百货' in detail_str and '层' not in detail_str:
                m_type = temp_list[0].replace('类型:', '') + temp_list[1]
                floor = '未知'
                total_floor = '未知'
            elif '购物中心/百货' in detail_str and '层' in detail_str:
                m_type = temp_list[0].replace('类型:', '') + temp_list[1]
                floor = temp_list[2]
                total_floor = temp_list[3].replace('层', '')
            else:
                logger.error('unexpected detail_str: ' + detail_str.strip())
                continue

            area = detail.select('div.area')[0].text.replace('㎡', '').replace(
                '建筑面积', '')
            total_price = detail.select(
                'div.moreInfo p.mt5 span.price')[0].text
            crawl_date = time.strftime('%Y-%m-%d', time.localtime(time.time()))

            item = dict()
            item['estate'] = estate
            item['floor'] = floor
            item['total_floor'] = total_floor
            item['type'] = m_type
            item['area'] = area
            item['total_price'] = total_price
            item['crawl_date'] = crawl_date

            item['city'] = response.request.meta['city']
            item['district'] = response.request.meta['district']
            item['url'] = response.request.url
            yield item

        next_page = soup.select('a#PageControl1_hlk_next')
        if len(next_page) > 0:
            url = response.nice_join(next_page[0]['href']) + '/'
            request = Request(url=url,
                              priority=2,
                              callback=self.process_page_2)
            request.meta['city'] = response.request.meta['city']
            request.meta['district'] = response.request.meta['district']
            yield request
Example #27
 def process_page_3(self, response):
     soup = bs(response.m_response.content, 'lxml')
     car_info_list = soup.select('div#a2 ul#viewlist_ul li a.carinfo')
     for car_info in car_info_list:
         url = 'http://www.che168.com' + car_info['href']
         request = Request(url=url,
                           priority=4,
                           callback=self.process_page_4)
         request.meta['province'] = response.request.meta['province']
         request.meta['city'] = response.request.meta['city']
         request.meta['brand'] = response.request.meta['brand']
         request.meta['cars_line'] = response.request.meta['cars_line']
         yield request
     next_page = soup.find(
         lambda tag: tag.name == 'a' and '下一页' in tag.text)
     if next_page:
         url = 'http://www.che168.com' + next_page['href']
         request = Request(url=url,
                           priority=3,
                           callback=self.process_page_3)
         request.meta['province'] = response.request.meta['province']
         request.meta['city'] = response.request.meta['city']
         request.meta['brand'] = response.request.meta['brand']
         request.meta['cars_line'] = response.request.meta['cars_line']
         yield request
Example #28
class Car_Processor(BaseProcessor):
    spider_id = 'car_spider'
    spider_name = 'car_spider'
    allowed_domains = ['che168.com']
    start_requests = [Request(url='http://www.che168.com', priority=0)]

    @checkResponse
    def process(self, response):
        soup = bs(response.m_response.content, 'lxml')
        province_div_list = soup.select(
            'div.city-list div.cap-city > div.fn-clear')
        for province_div in province_div_list:
            province_name = province_div.select('span.capital a')[0].text
            city_list = province_div.select('div.city a')
            for city in city_list:
                city_name = city.text
                pinyin = city['href'].strip('/').split('/')[0]
                request = Request(
                    url=
                    'http://www.che168.com/handler/usedcarlistv5.ashx?action=brandlist&area=%s'
                    % pinyin,
                    priority=1,
                    callback=self.process_page_1)
                request.meta['province'] = province_name
                request.meta['city'] = city_name
                yield request

    @checkResponse
    def process_page_1(self, response):
        brand_list = list(
            json.loads(response.m_response.content.decode('gb2312')))
        for brand in brand_list:
            brand_dict = dict(brand)
            brand_name = brand_dict['name']
            url = response.nice_join(brand_dict['url']) + '/'
            request = Request(url=url,
                              priority=2,
                              callback=self.process_page_2)
            request.meta['province'] = response.request.meta['province']
            request.meta['city'] = response.request.meta['city']
            request.meta['brand'] = brand_name
            yield request

    @checkResponse
    def process_page_2(self, response):
        soup = bs(response.m_response.content, 'lxml')
        cars_line_list = soup.select(
            'div#series div.content-area dl.model-list dd a')
        for cars_line in cars_line_list:
            cars_line_name = cars_line.text
            url = 'http://www.che168.com' + cars_line['href']
            request = Request(url=url,
                              priority=3,
                              callback=self.process_page_3)
            request.meta['province'] = response.request.meta['province']
            request.meta['city'] = response.request.meta['city']
            request.meta['brand'] = response.request.meta['brand']
            request.meta['cars_line'] = cars_line_name
            yield request

    @checkResponse
    def process_page_3(self, response):
        soup = bs(response.m_response.content, 'lxml')
        car_info_list = soup.select('div#a2 ul#viewlist_ul li a.carinfo')
        for car_info in car_info_list:
            url = 'http://www.che168.com' + car_info['href']
            request = Request(url=url,
                              priority=4,
                              callback=self.process_page_4)
            request.meta['province'] = response.request.meta['province']
            request.meta['city'] = response.request.meta['city']
            request.meta['brand'] = response.request.meta['brand']
            request.meta['cars_line'] = response.request.meta['cars_line']
            yield request
        next_page = soup.find(
            lambda tag: tag.name == 'a' and '下一页' in tag.text)
        if next_page:
            url = 'http://www.che168.com' + next_page['href']
            request = Request(url=url,
                              priority=3,
                              callback=self.process_page_3)
            request.meta['province'] = response.request.meta['province']
            request.meta['city'] = response.request.meta['city']
            request.meta['brand'] = response.request.meta['brand']
            request.meta['cars_line'] = response.request.meta['cars_line']
            yield request

    @checkResponse
    def process_page_4(self, response):
        soup = bs(response.m_response.content, 'lxml')
        # <html><head><title>Object moved</title></head><body>
        # <h2>Object moved to <a href="/CarDetail/wrong.aspx?errorcode=5&amp;backurl=/&amp;infoid=21415515">here</a>.</h2>
        # </body></html>
        if len(soup.select('div.car-title h2')) != 0:
            car = soup.select('div.car-title h2')[0].text
            detail_list = soup.select('div.details li')
            if len(detail_list) == 0:
                soup = bs(response.m_response.content, 'html5lib')
                detail_list = soup.select('div.details li')
            mileage = detail_list[0].select('span')[0].text.replace('万公里', '')
            first_board_date = detail_list[1].select('span')[0].text
            gear = detail_list[2].select('span')[0].text.split('/')[0]
            displacement = detail_list[2].select('span')[0].text.split('/')[1]
            price = soup.select('div.car-price ins')[0].text.replace('¥', '')
            crawl_date = time.strftime('%Y-%m-%d', time.localtime(time.time()))

            item = dict()
            item['car'] = car
            item['mileage'] = mileage
            item['first_board_date'] = first_board_date
            item['gear'] = gear
            item['displacement'] = displacement
            item['price'] = price
            item['crawl_date'] = crawl_date

            item['province'] = response.request.meta['province']
            item['city'] = response.request.meta['city']
            item['brand'] = response.request.meta['brand']
            item['cars_line'] = response.request.meta['cars_line']
            yield item