Example #1
    def start_requests(self):
        for url in self.start_urls:
            if 'chengjiao' in url:
                yield Request(url, callback=self._parse_sold)
            elif 'ershoufang' in url:
                yield Request(url, callback=self._parse_sale)
            else:
                print(url + ' error!!')

    def _reload_sold(self, response, sold_houses):
        if response.request.meta.get('download_times'):
            download_times = response.request.meta['download_times']
            logger.error(*self.lfm.crawled(
                'Spider', self.name, '({0}) re-downloading, time:'.format(
                    response.request.headers.getRawHeaders('User-Agent')[0]), {
                        'function': 'attempt #{0}'.format(download_times),
                        'request': response.request,
                        'time': time.perf_counter(),  # time.clock() was removed in Python 3.8
                    }))
            download_times = download_times + 1
        else:
            download_times = 1

        if download_times < 4:
            return Request(response.url,
                           callback=self._parse_sold,
                           meta={
                               'download_times': download_times,
                               'header_flag': True,
                               'last_header': response.request.headers
                           })
        else:
            logger.error(*self.lfm.crawled(
                'Spider', self.name,
                'retry count exceeded the maximum; assuming this page has no data, time:', {
                    'function': 'attempt #{0}'.format(download_times),
                    'request': response.request,
                    'time': time.perf_counter(),
                }))
            return None
Example #3
    def start_requests(self):

        self.start_urls = [
        'http://www.cffex.com.cn/sj/ccpm/201810/08/IH.xml',
        'http://www.cffex.com.cn/sj/ccpm/201810/09/IH.xml',
        'http://www.cffex.com.cn/sj/ccpm/201810/10/IH.xml',
        'http://www.cffex.com.cn/sj/ccpm/201810/11/IH.xml',
        'http://www.cffex.com.cn/sj/ccpm/201810/12/IH.xml',

        'http://www.cffex.com.cn/sj/ccpm/201810/15/IH.xml',
        'http://www.cffex.com.cn/sj/ccpm/201810/16/IH.xml',
        'http://www.cffex.com.cn/sj/ccpm/201810/17/IH.xml',
        'http://www.cffex.com.cn/sj/ccpm/201810/18/IH.xml',
        'http://www.cffex.com.cn/sj/ccpm/201810/19/IH.xml',

        'http://www.cffex.com.cn/sj/ccpm/201810/22/IH.xml',
        'http://www.cffex.com.cn/sj/ccpm/201810/23/IH.xml',
        'http://www.cffex.com.cn/sj/ccpm/201810/24/IH.xml',
        'http://www.cffex.com.cn/sj/ccpm/201810/25/IH.xml',
        'http://www.cffex.com.cn/sj/ccpm/201810/26/IH.xml',

        'http://www.cffex.com.cn/sj/ccpm/201810/29/IH.xml',
        'http://www.cffex.com.cn/sj/ccpm/201810/30/IH.xml',
        'http://www.cffex.com.cn/sj/ccpm/201810/31/IH.xml'

        ]

        for url in self.start_urls:
            yield Request(url, callback=self._parse)
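
The list above hardcodes the trading days of October 2018. A minimal sketch (my own illustration, not part of the original spider) that builds the same URLs from a date range, skipping weekends only; exchange holidays such as the first week of October would still need a trading calendar:

from datetime import date, timedelta

def cffex_urls(start, end, contract='IH'):
    # yield one position-ranking URL per weekday in [start, end]
    day = start
    while day <= end:
        if day.weekday() < 5:  # Monday..Friday; holidays are NOT filtered
            yield 'http://www.cffex.com.cn/sj/ccpm/{0:%Y%m}/{0:%d}/{1}.xml'.format(day, contract)
        day += timedelta(days=1)

# the same range as the hardcoded list above
urls = list(cffex_urls(date(2018, 10, 8), date(2018, 10, 31)))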
Example #4
    def start_requests(self):
        self.start_urls = [
            # "https://sh.lianjia.com/xiaoqu/biyun/",
            "https://sh.lianjia.com/xiaoqu/caolu/"
        ]
        for url in self.start_urls:
            yield Request(url, callback=self._parse)
Example #5
    def start_requests(self):
        start_url = list()

        for i in range(20, 30):
            i = str(i)
            u = self.url + i
            start_url.append(u)

        for url in start_url:
            yield Request(url, callback=self._parse)
Example #6
    def start_requests(self):
        start_url = list()
        for i in range(1, self._maxnum + 1):
            if i == 1:
                url = self._url
            else:
                url = self._url + "pg" + str(i)
            start_url.append(url)
        # yield outside the build loop, otherwise early URLs are requested repeatedly
        for url in start_url:
            yield Request(url, callback=self._parse, headers=self.headers)
Example #7
    def _parse(self, response):
        selector = etree.HTML(response.body)
        # total number of communities in this district
        total_number = selector.xpath("/html/body/div[4]/div[1]/div[2]/h2/span/text()")[0]
        self.total_number_community = total_number
        # collect the URLs of the towns under this district
        part_zone = selector.xpath("/html/body/div[3]/div[1]/dl[2]/dd/div/div[2]/a")
        for a in part_zone:
            path = a.get('href')
            name = path.split('/')[-2]
            new_url = urljoin(self.base_url, path)
            yield Request(new_url, callback=self._parse2, meta={"zone_name": name})
Example #8
def start_request_03():
    start_url = list()
    url = 'https://www.smzdm.com/homepage/json_more?p='

    for i in range(20, 30):
        i = str(i)
        u = url + i
        start_url.append(u)

    for url in start_url:
        #print(url)
        yield Request(url)
Example #9
    def start_requests(self):
        self.start_urls = [
            'https://sh.lianjia.com/xiaoqu/anshan/',  # 157 156
            'https://sh.lianjia.com/xiaoqu/dongwaitan/',  # 144 141
            'https://sh.lianjia.com/xiaoqu/huangxinggongyuan/',  # 159 159
            'https://sh.lianjia.com/xiaoqu/kongjianglu/',
            'https://sh.lianjia.com/xiaoqu/wujiaochang/',
            'https://sh.lianjia.com/xiaoqu/xinjiangwancheng/',
            'https://sh.lianjia.com/xiaoqu/zhoujiazuilu/',
            'https://sh.lianjia.com/xiaoqu/zhongyuan1/'
        ]
        for url in self.start_urls:
            yield Request(url, callback=self._parse)
Example #10
    def _parse(self, response):
        selector = etree.HTML(response.body)
        # total number of listing pages for the town
        page_number = selector.xpath(
            "//div[@class='page-box house-lst-page-box']/@page-data")
        self.total_page_number = json.loads(page_number[0])["totalPage"]
        total_xiaoqu_number = selector.xpath(
            "/html/body/div[4]/div[1]/div[2]/h2/span/text()")[0]
        logger.debug("total page count of %s is %d" % (self.name, self.total_page_number))
        self.result["total_xiaoqu_number"] = [total_xiaoqu_number]

        for i in range(1, self.total_page_number + 1):
            url = response.request.url + 'pg' + str(i)
            yield Request(url, callback=self._parse2, meta={"page_num": i})
Example #11
    def _parse_sold(self, response):
        selector = etree.HTML(response.body)

        try:
            sold_houses = self._xpath_filter(
                selector.xpath("//ul[@class='listContent']")).xpath('./li')
            total_num = selector.xpath(
                '//div[@class="total fl"]/span/text()')[0]

            if int(total_num) == 0:
                # yield, not `return value`: inside a generator a returned
                # Request would be silently dropped
                retry = self._reload_sold(response, sold_houses)
                if retry is not None:
                    yield retry
            else:
                self._resolve_sold(sold_houses, response.url)
                if int(total_num) > len(sold_houses):

                    if not re.search('pg', response.url):
                        print("sold:" + self.name + ': ' + response.url +
                              ': ' + str(total_num) + "===" +
                              str(len(sold_houses)))
                        page_number = selector.xpath(
                            "//div[@class='page-box house-lst-page-box']/@page-data"
                        )
                        total_page_number = json.loads(
                            page_number[0])["totalPage"]
                        base_name = response.url.split('/')[-2]
                        for pg in range(2, total_page_number + 1):
                            url = response.url.replace(
                                base_name, 'pg' + str(pg) + base_name)
                            yield Request(url, callback=self._parse_sold)
                    else:
                        pg = re.findall(r'pg\d+', response.url)[0]
                        print("sold:" + self.name + '_' + pg + ': ' +
                              response.url + ': ' + str(total_num) + "===" +
                              str(len(sold_houses)))
        except Exception as e:
            logger.error(*self.lfm.error(
                'Spider', self.name, 'error while parsing house data', {
                    'request': response.request,
                    'function': 'total_num={0} sold_houses={1}'.format(
                        int(total_num), len(sold_houses))
                }),
                         extra={
                             'exception': e,
                             'time': time.perf_counter()  # time.clock() was removed in Python 3.8
                         })
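
Examples #1 and #11 implement retry-by-meta-counter by hand. A condensed sketch of that pattern, assuming a Scrapy-style Request API (the dont_filter flag and the MAX_RETRIES name are my assumptions, not part of the original code):

MAX_RETRIES = 3

def retry_request(response, callback):
    # count attempts in request.meta; give up after MAX_RETRIES
    times = response.request.meta.get('download_times', 0) + 1
    if times > MAX_RETRIES:
        return None  # treat the page as having no data
    return Request(response.url,
                   callback=callback,
                   dont_filter=True,  # let the scheduler accept the duplicate URL
                   meta={'download_times': times})

# inside a generator callback, remember to yield the result:
#     retry = retry_request(response, self._parse_sold)
#     if retry is not None:
#         yield retry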
Example #12
    def _parse(self, response):
        selector = etree.HTML(response.body)
        # collect the name and URL of every district
        all_zone = selector.xpath("/html/body/div[3]/div[1]/dl[2]/dd/div/div/a")

        for one_zone in all_zone:
            # the URL of one district
            path = one_zone.get('href')
            # skip the districts we do not want to crawl
            if path not in ["/xiaoqu/chongming/", "/xiaoqu/shanghaizhoubian/"]:
                name = path.split('/')[-2]
                new_url = urljoin(self.base_url, path)
                self.all_zones[name] = new_url
                yield Request(new_url,
                              callback=self._parse2,
                              meta={"total_zone_name": name})
Example #13
    def _parse_getAllCommunity(self, response):
        selector = etree.HTML(response.body)
        # total number of listing pages for the town
        page_number = selector.xpath(
            "//div[@class='page-box house-lst-page-box']/@page-data")
        self.total_page_number = json.loads(page_number[0])["totalPage"]
        total_xiaoqu_number = selector.xpath(
            "/html/body/div[4]/div[1]/div[2]/h2/span/text()")[0]
        self.result["total_xiaoqu_number"] = [total_xiaoqu_number]
        # logger.critical("total page count of %s is %d" % (self.name, self.total_page_number))

        for i in range(1, self.total_page_number + 1):
            url = self._start_urls[0] + '/pg' + str(i)
            yield Request(url,
                          callback=self._parse_getCommunityInfo,
                          meta={"page_num": i})
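
Several of these snippets read the page count from the page-data attribute that Lianjia renders on its pager div. A standalone sketch of just that step, assuming the attribute carries JSON of the form {"totalPage": ..., "curPage": ...}:

import json
from lxml import etree

html = '''<div class="page-box house-lst-page-box"
               page-data='{"totalPage": 27, "curPage": 1}'></div>'''
selector = etree.HTML(html)
page_data = selector.xpath(
    "//div[@class='page-box house-lst-page-box']/@page-data")[0]
total_pages = json.loads(page_data)['totalPage']
print(total_pages)  # 27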
Example #14
    def start_requests(self):
        start_url = list()

        for i in range(1, self._maxnum):
            if i == 1:
                url = self._url
            else:
                i = str(i)
                url = self._url + "pg" + i
            start_url.append(url)

        for url in start_url:
            yield Request(
                url,
                callback=self._parse,
                headers=self.headers,
                #meta={"download_redirect":True}
            )
Example #15
    def _parse(self, response):
        # web_body = BeautifulSoup(response.body, "html.parser")
        selector = etree.HTML(response.body)
        total_zone = selector.xpath(
            "/html/body/div[3]/div/div[1]/dl[2]/dd/div[1]/div/a")
        total_urls = {}
        for a in total_zone:
            path = a.get('href')
            if path not in [
                    "/ershoufang/chongming/", "/ershoufang/shanghaizhoubian/",
                    "/ershoufang/jinshan/"
            ]:
                name = path.split("/")[-2]
                # print(name)
                new_url = urljoin(self.base_url, path)
                # print(new_url)
                total_urls[name] = new_url
        for name, url in total_urls.items():
            yield Request(url,
                          callback=self._parse2,
                          headers=self.headers,
                          meta={'part_name': name})
Example #16
    def start_requests(self):
        # for url in self.start_urls:
        # note: start_urls is passed as-is, so it must be a single URL string here
        yield Request(self.start_urls, callback=self._parse)
Example #17
    def start_requests(self):

        self.start_urls = [
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190102.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190103.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190104.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190107.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190108.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190109.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190110.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190111.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190114.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190115.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190116.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190117.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190118.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190121.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190122.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190123.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190124.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190125.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190128.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190129.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190130.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190131.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190201.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190211.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190212.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190213.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190214.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190215.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190218.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190219.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190220.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190221.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190222.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190225.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190226.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190227.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190228.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190301.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190304.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190305.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190306.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190307.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190308.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190311.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190312.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190313.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190314.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190315.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190318.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190319.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190320.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190321.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190322.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190325.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190326.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190327.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190328.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190329.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190401.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190402.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190403.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190404.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190408.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190409.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190410.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190411.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190412.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190415.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190416.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190417.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190418.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190419.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190422.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190423.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190424.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190425.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190426.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190429.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190430.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190506.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190507.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190508.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190509.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190510.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190513.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190514.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190515.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190516.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190517.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190520.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190521.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190522.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190523.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190524.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190527.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190528.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190529.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190530.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190531.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190603.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190604.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190605.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190606.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190610.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190611.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190612.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190613.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190614.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190617.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190618.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190619.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190620.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190621.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190624.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190625.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190626.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190627.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190628.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190701.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190702.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190703.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190704.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190705.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190708.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190709.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190710.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190711.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190712.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190715.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190716.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190717.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190718.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190719.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190722.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190723.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190724.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190725.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190726.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190729.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190730.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190731.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190801.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190802.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190805.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190806.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190807.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190808.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190809.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190812.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190813.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190814.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190815.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190816.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190819.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190820.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190821.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190822.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190823.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190826.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190827.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190828.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190829.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190830.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190902.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190903.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190904.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190905.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190906.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190909.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190910.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190911.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190912.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190916.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190917.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190918.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190919.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190920.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190923.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190924.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190925.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190926.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190927.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20190930.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20191008.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20191009.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20191010.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20191011.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20191014.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20191015.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20191016.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20191017.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20191018.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20191021.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20191022.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20191023.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20191024.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20191025.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20191028.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20191029.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20191030.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20191031.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20191101.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20191104.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20191105.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20191106.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20191107.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20191108.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20191111.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20191112.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20191113.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20191114.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20191115.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20191118.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20191119.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20191120.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20191121.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20191122.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20191125.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20191126.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20191127.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20191128.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20191129.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20191202.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20191203.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20191204.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20191205.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20191206.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20191209.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20191210.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20191211.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20191212.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20191213.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20191216.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20191217.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20191218.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20191219.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20191220.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20191223.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20191224.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20191225.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20191226.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20191227.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20191230.dat',
            'http://www.shfe.com.cn/data/dailydata/kx/pm20191231.dat'
        ]

        # self.start_urls = [
        # 'http://www.shfe.com.cn/data/dailydata/kx/pm20191008.dat',
        # 'http://www.shfe.com.cn/data/dailydata/kx/pm20191009.dat',
        # 'http://www.shfe.com.cn/data/dailydata/kx/pm20191010.dat',
        # 'http://www.shfe.com.cn/data/dailydata/kx/pm20191011.dat',
        # 'http://www.shfe.com.cn/data/dailydata/kx/pm20191012.dat',
        #
        # 'http://www.shfe.com.cn/data/dailydata/kx/pm20181015.dat',
        # 'http://www.shfe.com.cn/data/dailydata/kx/pm20181016.dat',
        # 'http://www.shfe.com.cn/data/dailydata/kx/pm20181017.dat',
        # 'http://www.shfe.com.cn/data/dailydata/kx/pm20181018.dat',
        # 'http://www.shfe.com.cn/data/dailydata/kx/pm20181019.dat',
        #
        # 'http://www.shfe.com.cn/data/dailydata/kx/pm20181022.dat',
        # 'http://www.shfe.com.cn/data/dailydata/kx/pm20181023.dat',
        # 'http://www.shfe.com.cn/data/dailydata/kx/pm20181024.dat',
        # 'http://www.shfe.com.cn/data/dailydata/kx/pm20181025.dat',
        # 'http://www.shfe.com.cn/data/dailydata/kx/pm20181026.dat',
        #
        # 'http://www.shfe.com.cn/data/dailydata/kx/pm20181029.dat',
        # 'http://www.shfe.com.cn/data/dailydata/kx/pm20181030.dat',
        # 'http://www.shfe.com.cn/data/dailydata/kx/pm20181031.dat'

        # ]

        for url in self.start_urls:
            yield Request(url, callback=self._parse)
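
As with Example #3, the hand-written list above enumerates weekdays minus exchange holidays. A sketch (my illustration, not the original author's code) that generates the same URL shape for every weekday of 2019; note it cannot know about holidays, so it would produce some URLs the hardcoded list deliberately omits:

from datetime import date, timedelta

def shfe_urls(start, end):
    # yield one daily-data URL per weekday in [start, end]
    day = start
    while day <= end:
        if day.weekday() < 5:  # weekends skipped; holidays are NOT filtered
            yield 'http://www.shfe.com.cn/data/dailydata/kx/pm{0:%Y%m%d}.dat'.format(day)
        day += timedelta(days=1)

urls = list(shfe_urls(date(2019, 1, 2), date(2019, 12, 31)))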
Example #18
    def start_requests(self):
        yield Request(self.start_url, callback=self._parse)
Example #19
    def start_requests(self):
        for url in self.start_urls:
            yield Request(url, callback=self._parse_getAllCommunity)
Example #20
    def start_requests(self):
        yield Request(self.start_urls[0], callback=self._parse_sold)
Example #21
    def _parse_sold(self, response):
        selector = etree.HTML(response.body)

        try:
            sold_houses = self._xpath_filter(
                selector.xpath("//ul[@class='listContent']")).xpath('./li')
            total_num = selector.xpath(
                '//div[@class="total fl"]/span/text()')[0]
            print("sold:" + self.name + ': ' + response.url + ': ' +
                  str(total_num) + "===" + str(len(sold_houses)))
            # page_number = selector.xpath("//div[@class='page-box house-lst-page-box']/@page-data")
            # if page_number:
            #     total_page_number = json.loads(page_number[0])["totalPage"]
            #     print(total_page_number)
            # else:
            #     print(page_number)
            if int(total_num) == 0:
                # yield, not `return value`: inside a generator a returned
                # Request would be silently dropped
                retry = self._reload_sold(response, sold_houses)
                if retry is not None:
                    yield retry
            else:
                self._resolve_sold(sold_houses, response.url)
                if int(total_num) > len(sold_houses):
                    # self.result_items += len(sold_houses)
                    if not re.search('pg', response.url):
                        print("sold:" + self.name + ': ' + response.url +
                              ': ' + str(total_num) + "===" +
                              str(len(sold_houses)))
                        page_number = selector.xpath(
                            "//div[@class='page-box house-lst-page-box']/@page-data"
                        )
                        if page_number:
                            total_page_number = json.loads(
                                page_number[0])["totalPage"]
                            base_name = response.url.split('/')[-2]
                            for pg in range(2, total_page_number + 1):
                                url = response.url.replace(
                                    base_name, 'pg' + str(pg) + base_name)
                                yield Request(url, callback=self._parse_sold)
                        else:
                            retry = self._reload_sold(response, sold_houses)
                            if retry is not None:
                                yield retry
                    else:
                        pg = re.findall(r'pg\d+', response.url)[0]
                        print("sold:" + self.name + '_' + pg + ': ' +
                              response.url + ': ' + str(total_num) + "===" +
                              str(len(sold_houses)))
        except Exception as e:
            logger.error(*self.lfm.error(
                'Spider', self.name, 'error while parsing house data:', {
                    'request': response.request,
                    'function': 'total_num={0} sold_houses={1}'.format(
                        int(total_num), len(sold_houses)),
                    'exception': e
                }),
                         extra={'time': ', time: %6.3f' % time.perf_counter()})  # time.clock() was removed in Python 3.8

        if len(self.serect_price) > 0:
            while len(self.serect_price) != 0:
                item = self.serect_price.popitem()
                url = item[1]['sold_house_url']
                yield Request(url,
                              callback=self._get_secret_price,
                              meta={'title': item[0]})
Example #22
def request_errback(content):
    print("request_and_response errback")
    print(content)
    return content


def agent_print(content):
    print("agent_print")
    print(type(content))
    print(content)


request = Request(url=url,
                  callback=request_callback,
                  method='get',
                  headers=headers,
                  errback=request_errback,
                  meta={"download_timeout": 2})

settings = Setting()

spider = Spider1.update_settings(settings)

httphandler = HTTPDownloadHandler(settings)
agent = httphandler.download_request(request, spider)
agent.addCallback(agent_print)
agent.addErrback(request_errback)
agent.addBoth(lambda _: reactor.stop())

reactor.run()
Example #23
    def _parse_sold(self, response):
        selector = BeautifulSoup(response.body, "html.parser")
        try:
            base_xpath = './div[@class="info"]'
            total_num = selector.find('div', class_="total fl").span.text

            sold_houses = selector.find('ul', class_='listContent')
            if int(total_num) == 0:
                if response.request.meta.get('download_times'):
                    download_times = response.request.meta['download_times']
                    logger.warning(*self.lfm.crawled(
                        'Spider', self.name, '({0}) re-downloading, time:'.format(
                            response.request.headers.getRawHeaders(
                                'User-Agent')[0]), {
                                    'function': 'attempt #{0}'.format(download_times),
                                    'request': response.request,
                                    'time': time.perf_counter(),  # time.clock() was removed in Python 3.8
                                }))
                    download_times = download_times + 1
                else:
                    download_times = 1

                if download_times < 4:
                    return Request(response.url,
                                   callback=self._parse_sold,
                                   meta={
                                       'download_times': download_times,
                                       'header_flag': True,
                                       'last_header': response.request.headers
                                   })
                else:
                    logger.warning(*self.lfm.crawled(
                        'Spider', self.name,
                        'retry count exceeded the maximum; assuming this page has no data', {
                            'function': 'attempt #{0}'.format(download_times),
                            'request': response.request,
                            'time': time.perf_counter(),
                        }))
            print("sold:" + self.name + ': ' + response.url + ': ' +
                  str(total_num) + "===" + str(len(sold_houses)))
            return None

            # sold_houses = self._xpath_filter(seletor.xpath("//ul[@class='listContent']")).xpath('./li')
            # total_num = seletor.xpath('//div[@class="total fl"]/span/text()')
            # total_num = seletor.xpath("/html/body/div[5]/div[1]/div[2]/div[1]/span/text()")[0]
            #
            #

            # for sold_house in sold_houses:
            #     sold_title = \
            #     self._xpath_filter(sold_house.xpath(base_xpath + '/div[@class="title"]/a/text()'))
            #     print("community name: " + sold_title)
            #
            #     sold_address = \
            #     self._xpath_filter(sold_house.xpath(base_xpath + '/div[@class="address"]/div[@class="houseInfo"]/text()'))
            #     print("community address: " + sold_address)
            #
            #     sold_dealDate = \
            #     self._xpath_filter(sold_house.xpath(base_xpath + '/div[@class="address"]/div[@class="dealDate"]/text()'))
            #     print("deal date: " + sold_dealDate)
            #
            #     sold_totalPrice = \
            #     self._xpath_filter(sold_house.xpath(base_xpath + '/div[@class="address"]/div[@class="totalPrice"]/span/text()'))
            #     print("deal price: " + sold_totalPrice)
            #
            #     sold_unitPrice = \
            #     self._xpath_filter(sold_house.xpath(base_xpath + '/div[@class="flood"]/div[@class="unitPrice"]/span/text()'))
            #     print("average unit price: " + sold_unitPrice)
            #
            #     sold_positionInfo = \
            #     self._xpath_filter(sold_house.xpath(base_xpath + '/div[@class="flood"]/div[@class="positionInfo"]/text()'))
            #     print("floor info: " + sold_positionInfo)
            #
            #     sold_saleonborad = \
            #     self._xpath_filter(sold_house.xpath(base_xpath + '/div[@class="dealCycleeInfo"]/span[@class="dealCycleTxt"]/span[1]/text()'))
            #
            #     print("listing price: " + sold_saleonborad)
            #
            #     sold_dealcycle = \
            #     self._xpath_filter(sold_house.xpath(base_xpath + '/div[@class="dealCycleeInfo"]/span[@class="dealCycleTxt"]/span[2]/text()'))
            #     print("deal cycle: " + sold_dealcycle)

        except Exception as e:
            print(e)
            # raise Exception(e)
        return None