Ejemplo n.º 1
0
 def parse(self, html):
     """Scrape every auction detail page linked from a listing page.

     Each ``//dl/dd/a/@href`` result of ``html`` is appended to
     ``http://www.shjiapai.cn``.  The numeric id is taken from the link and,
     unless ``check_auction`` reports it as already stored, the detail page
     is downloaded, its fields are pulled out with regular expressions and
     the resulting ``Auction`` is saved with ``insert_db()``.

     A failure on one link (network error, missing field, ...) is logged
     and the loop continues with the next link.

     Args:
         html: Parsed listing page; only its ``xpath()`` method is used.

     Returns:
         None.
     """
     # Attribute name -> pattern whose first group holds the value.  All
     # patterns are searched in the detail page with re.S | re.M.
     field_patterns = (
         ('auction_name', '楼盘名称.*?">(.*?)</td'),
         ('start_auction_price', '预计售价.*?">(.*?)</td'),
         ('floor', '层.*?">(.*?)楼</td'),
         ('area', '户型面积.*?">(.*?)</td'),
         ('build_type', '物业类型.*?">(.*?)</td'),
         ('info', '其它.*?>(.*?)</div'),
     )
     for auction_url in html.xpath("//dl/dd/a/@href"):
         try:
             # Raw string: '\d' in a plain literal is an invalid escape.
             auction_id = re.search(r'id/(\d+)\.html',
                                    auction_url).group(1)
             # Check before downloading so known auctions cost no request.
             if check_auction(source=source, auction_id=auction_id):
                 log.info("数据已存在")
                 continue

             url = 'http://www.shjiapai.cn' + auction_url
             # Bounded wait so one unresponsive server cannot stall the loop.
             auction_res = requests.get(url, headers=self.headers,
                                        timeout=30)
             con = auction_res.text

             auction = Auction(source=source, auction_type=auction_type)
             auction.source_html = con
             auction.auction_id = auction_id
             auction.city = '上海'
             auction.html_type = '房产'
             for attribute, pattern in field_patterns:
                 # A missing field makes re.search return None; the
                 # resulting AttributeError skips this auction (see except).
                 value = re.search(pattern, con, re.S | re.M).group(1)
                 setattr(auction, attribute, value)
             auction.insert_db()
         except Exception as e:
             # Keep going with the remaining links, but record the cause.
             log.error("{}解析失败: {}".format(auction_url, e))
Ejemplo n.º 2
0
    def get_detail(self, id_, auction_time, html_type, auction_type, province,
                   city, region):
        """Download one auction detail page, parse it and store the record.

        The page ``http://www1.rmfysszc.gov.cn/Handle/<id_>.shtml`` is
        fetched and parsed with one of three recipes, selected by the marker
        text found in the HTML: ``GetRecord()``, ``bmnumber()`` or neither.
        When parsing succeeds the populated ``Auction`` is saved with
        ``insert_db()``.  Download errors and parsing errors are logged
        (with different messages) and swallowed.

        Args:
            id_: Page identifier; concatenated into the URL with ``+`` and
                stored as ``auction_id``.
            auction_time: Raw auction time, converted by ``self.get_date``.
            html_type: Stored on the record unchanged.
            auction_type: Passed to ``Auction`` and stored unchanged.
            province: Stored on the record unchanged.
            city: Stored on the record unchanged.
            region: Stored on the record unchanged.

        Returns:
            None.
        """
        auction = Auction(source=source, auction_type=auction_type)
        auction.html_type = html_type
        auction.auction_type = auction_type
        auction.province = province
        auction.city = city
        auction.region = region
        detail_url = 'http://www1.rmfysszc.gov.cn/Handle/' + id_ + '.shtml'

        try:
            # Bounded wait so one unresponsive server cannot hang the crawl.
            response = requests.get(detail_url, headers=self.headers,
                                    timeout=30)
            html = response.content.decode()
            auction.source_html = html
        except Exception as e:
            log.error('详情页请求错误,url="{}",e="{}"'.format(detail_url, e))
            return

        try:
            tree = etree.HTML(html)
            if 'GetRecord()' in html:
                self._fill_from_bg1_layout(
                    auction, tree,
                    date_format="%Y.%m.%d",
                    info_paths=('string(//*[@id="bdjs11"])',
                                'string(//*[@id="jjjl"])'),
                    tolerate_bad_assess_value=True)
            elif 'bmnumber()' in html:
                self._fill_from_bg1_layout(
                    auction, tree,
                    date_format="%Y-%m-%d",
                    info_paths=('string(//*[@id="bdjs"])',),
                    tolerate_bad_assess_value=False)
            else:
                self._fill_from_list_layout(auction, tree)
            auction.auction_id = id_
            auction.auction_time = self.get_date(date=auction_time)
            auction.insert_db()
        except Exception as e:
            log.error('解析错误,url="{}",e="{}"'.format(detail_url, e))

    @staticmethod
    def _first_text(tree, path):
        """Return the first result of XPath ``path``; IndexError if none."""
        return tree.xpath(path)[0]

    @staticmethod
    def _first_text_or_none(tree, path):
        """Return the first result of XPath ``path``, or None if none."""
        try:
            return tree.xpath(path)[0]
        except IndexError:
            return None

    @staticmethod
    def _string_value(tree, path):
        """Evaluate a ``string(...)`` XPath expression and return its text."""
        # The encode/decode round trip is kept from the original code;
        # presumably it converts lxml's str subclass into a plain str
        # -- TODO confirm before removing.
        return tree.xpath(path).encode().decode()

    @staticmethod
    def _text_after_label(text):
        """Return what follows the first ': ' up to the end of that line."""
        return re.search(': (.*?)$', text, re.S | re.M).group(1)

    def _fill_from_bg1_layout(self, auction, tree, date_format, info_paths,
                              tolerate_bad_assess_value):
        """Populate ``auction`` from a page built around the ``bg1`` tables.

        Args:
            auction: Record whose attributes are assigned.
            tree: Parsed page exposing ``xpath()``.
            date_format: ``strptime`` format of the announcement date.
            info_paths: ``string(...)`` XPath expressions whose results
                become the ``info`` list, in order.
            tolerate_bad_assess_value: When true, a failing
                ``self.get_float`` on the assess value yields None instead
                of aborting the page.
        """
        first = self._first_text
        after_label = self._text_after_label
        left_table = '//*[@id="bg1"]/div[1]/table'
        right_table = '//*[@id="bg1"]/div[2]/table'

        auction.auction_name = first(tree, '//div[@id="Title"]/h1/text()')
        auction.start_auction_price = self.get_float(
            first(tree, '//*[@id="price"]/div[1]/span/text()'))

        assess_value = first(tree, left_table + '/tr[1]/td/span[2]/text()')
        if tolerate_bad_assess_value:
            try:
                auction.assess_value = self.get_float(assess_value)
            except Exception:
                auction.assess_value = None
        else:
            auction.assess_value = self.get_float(assess_value)

        auction.earnest_money = self.get_float(
            first(tree, left_table + '/tr[2]/td/span[2]/text()'))
        announcement_date = after_label(
            first(tree, left_table + '/tr[3]/td/span/text()'))
        auction.announcement_date = datetime.datetime.strptime(
            announcement_date, date_format)
        auction.auction_level = after_label(
            first(tree, left_table + '/tr[4]/td/span/text()'))

        auction.court = after_label(
            first(tree, right_table + '/tr[1]/td/span/text()'))
        auction.contacts = after_label(
            first(tree, right_table + '/tr[2]/td/span/text()'))
        auction.phone_number = after_label(
            first(tree, right_table + '/tr[3]/td/span/text()'))

        auction.info = [self._string_value(tree, path) for path in info_paths]
        auction.build_type = self._first_text_or_none(
            tree, '//*[@id="bdjs11"]/table[1]/tr[2]/td[4]/text()')

    def _fill_from_list_layout(self, auction, tree):
        """Populate ``auction`` from a page that has neither marker text.

        Values are read from the ``ul/li`` items under
        ``/html/body/div[6]/table/tr/td`` and from the ``xmgg``, ``bdxx``
        and ``tjzl`` elements.
        """
        first = self._first_text
        items = '/html/body/div[6]/table/tr/td/ul'

        auction.auction_name = first(tree, '//*[@id="xmgg"]/div/div[1]/text()')
        auction.assess_value = self.get_float(
            first(tree, items + '/li[3]/span/text()'))

        announcement_date = first(tree, items + '/li[2]/span/text()')
        try:
            auction.announcement_date = datetime.datetime.strptime(
                announcement_date, "%Y-%m-%d")
        except ValueError:
            # Second accepted spelling of the date; a failure here aborts
            # the page, as before.
            auction.announcement_date = datetime.datetime.strptime(
                announcement_date, "%Y/%m/%d")

        auction.court = first(tree, items + '/li[1]/span/text()')
        auction.contacts = first(tree, items + '/li[4]/span/text()')
        auction.phone_number = first(tree, items + '/li[5]/span/text()')

        auction.info = [
            self._string_value(tree, 'string(//*[@id="bdxx"]/div)'),
            self._string_value(tree, 'string(//*[@id="tjzl"]/div/div[2])'),
        ]
        auction.build_type = self._first_text_or_none(
            tree, '//*[@id="bdxx"]/div/div[2]/table/tr[2]/td[3]/text()')