コード例 #1
0
ファイル: douban.py プロジェクト: triangle959/JoJoZu_backend
 def parse_item(self, response):
     """Parse a Douban group topic page into a ``DoubanItem``.

     Expects ``response.meta`` to carry ``title``, ``update_time``
     (month-day only, no year), ``replay_num`` and ``list_url`` from the
     listing-page request.

     Yields:
         DoubanItem: one populated item per topic page.
     """
     selector = Selector(response)
     item = DoubanItem()
     item["url"] = response.url
     item["title"] = response.meta.get('title')
     item["author"] = selector.xpath(
         '//h3/span[1]/a/text()').extract_first()
     item["content"] = selector.xpath(
         '//div[@class="topic-richtext"]').extract_first()
     item["image"] = selector.xpath(
         '//div[@class="topic-richtext"]//img/@src').extract()
     item["create_time"] = selector.xpath(
         '//h3/span[2]/text()').extract_first()
     item["text"] = selector.xpath('//div[@class="topic-richtext"]').xpath(
         "normalize-space(.)").extract_first()

     # Detect the lease type (whole rent / shared rent): the title wins,
     # the body text is only consulted when the title says nothing.
     lease = None
     for text in (item["title"], item["text"]):
         if "整租" in text:
             lease = "整租"
             break
         if "合租" in text:
             lease = "合租"
             break
     item["lease"] = lease

     # Douban only exposes month-day for the update time; recover the year
     # from create_time ("YYYY-MM-..."). If the create month is greater
     # than the update month, the update happened after a year rollover.
     # FIX: the original compared the month fields as strings, which is
     # wrong for non-zero-padded months (e.g. "2" > "10" is True).
     create_parts = item["create_time"].split('-')
     update_raw = response.meta.get('update_time')
     if int(create_parts[1]) > int(update_raw.split('-')[0]):
         year = str(int(create_parts[0]) + 1)
     else:
         year = create_parts[0]
     item['update_time'], item['update_timestamp'] = time_standard(
         year + '-' + update_raw)

     item["replay_num"] = response.meta.get('replay_num')
     # Map the originating listing URL back to its city name.
     for city_name, url_list in city_url.items():
         if response.meta.get('list_url') in url_list:
             item['city'] = city_name
             break
     yield item
コード例 #2
0
ファイル: beike.py プロジェクト: triangle959/JoJoZu_backend
    def parse_item(self, response):
        """Parse a Beike (贝壳) rental detail page into a ``ScrapyJojozuItem``.

        On any scraping error the traceback is printed and a partially
        filled item is still yielded (best-effort, as before).

        Yields:
            ScrapyJojozuItem: one item per detail page.
        """
        # FIX: acquire the lock *before* the try block. Originally a
        # failed acquire() would still hit the finally and release a lock
        # that was never held, then NameError on the unbound `item`.
        self.lock.acquire()
        item = ScrapyJojozuItem()
        try:
            item["title"] = response.xpath(
                '//p[@class="content__title"]/text()').extract_first()
            item['lease'] = response.xpath(
                '//ul[@class="content__aside__list"]/li[1]/text()'
            ).extract_first(default="")
            item['type'] = response.xpath(
                '//ul[@class="content__aside__list"]/li[2]/text()'
            ).extract_first().split(' ')[0]
            image_list = response.xpath(
                '//ul[@class="content__article__slide--small content__article__slide_dot"]//img/@src'
            ).extract()
            item["image"] = [i.replace("https", "http") for i in image_list]
            item['payment_method'] = response.xpath(
                '//ul[@class="table_row"]//li[1]/text()').extract_first()
            item['cost'] = int(
                response.xpath(
                    '//ul[@class="table_row"]//li[2]/text()').extract_first())
            item['cash_pledge'] = response.xpath(
                '//ul[@class="table_row"]//li[3]/text()').extract_first()
            address = item["title"].split(' ')[0].split('·')[-1]
            # TODO: the city should really be passed in via meta from the
            # listing page (original comment: "city 需根据列表页传meta进来").
            # The URL path encodes the city code; parse it once instead of
            # four times.
            path = urlparse(response.url)[2]
            for code, city in (('SZ', '深圳'), ('GZ', '广州'),
                               ('SH', '上海'), ('BJ', '北京')):
                if code in path:
                    item['city'] = city
                    break

            # NOTE(review): if no city code matched, item['city'] below
            # raises KeyError — same behavior as the original.
            location = self.a.get_geocoder(address, item['city'])
            item['area'] = self.a.get_area(location, item['city'])
            item['location'] = self.a.get_place(location, item['city'])

            item['had_agent'] = 1
            item['service_charge'] = response.xpath(
                '//ul[@class="table_row"]//li[4]/text()').extract_first()
            item['agent_cost'] = response.xpath(
                '//ul[@class="table_row"]//li[5]/text()').extract_first()

            item['support'] = [
                i.strip() for i in response.xpath(
                    '//ul[@class="content__article__info2"]/li[@class="fl oneline  "]/text()'
                ).extract() if i.strip()
            ]
            raw_desc = ''.join(
                response.xpath(
                    '//div[@class="content__article__info"]/ul[1]//text()'
                ).extract())
            # NOTE(review): the two space-replace calls may target two
            # different whitespace code points in the original source —
            # confirm before merging them.
            item['description'] = raw_desc.replace(" ", "").replace(
                " ", "").replace("\n\n", "\n")
            # FIX: raw string for the date regex (avoids invalid-escape
            # warnings; matched text is unchanged).
            item['update_time'], item['update_timestamp'] = time_standard(
                response.xpath('//div[@class="content__subtitle"]').re(
                    r"\d+-\d+-\d+")[0])
            item['url'] = response.url
            item['source'] = "贝壳"
        except Exception:
            traceback.print_exc()
        finally:
            self.lock.release()
        yield item
コード例 #3
0
    def item_parse(self, response):
        """Parse a Fang.com (房天下) rental detail page into a ``ScrapyJojozuItem``.

        Yields:
            ScrapyJojozuItem: one item per detail page.
        """
        print("inter item_parse")
        item = ScrapyJojozuItem()
        item["title"] = response.xpath(
            '//div[@class="title"]/text()').extract_first().replace(
                " ", "").replace(" ", "").replace("\n", "").replace("\r", "")
        # 租赁方式 / 户型 share the same node list — extract it once
        # (originally queried twice).
        tt_texts = response.xpath('//div[@class="tt"]/text()').extract()
        item["lease"] = tt_texts[0]
        item["type"] = tt_texts[1]
        # 图片为ArrayList — prepend the scheme when it is missing.
        item["image"] = [
            i if 'http:' in i else 'http:' + i for i in response.xpath(
                '//div[@class="cont-sty1 clearfix"]//img/@src').extract()
        ]
        # 付款方式 / 押金 used byte-identical expressions — compute once.
        # NOTE(review): cash_pledge therefore always equals payment_method;
        # this mirrors the original, but it may be a latent scraping bug —
        # verify the deposit has its own selector on the page.
        fee_text = response.xpath(
            '//div[@class="trl-item sty1"]/text()').extract_first().replace(
                '元/月', "").replace('(', "").replace(')', "")
        item["payment_method"] = fee_text
        # 月租金
        item["cost"] = int(
            response.xpath(
                '//div[@class="trl-item sty1"]/i/text()').extract_first())
        # 押金
        item["cash_pledge"] = fee_text
        # 区域
        item["area"] = response.xpath(
            '//div[@class="rcont"]/a/text()').extract()[0]
        # 是否有中介
        item["had_agent"] = 1
        # 服务费 / 中介费 are not published on this site.
        item["service_charge"] = "服务费未知"
        item["agent_cost"] = "中介费未知"
        # 最近地铁站 — run the station regex once (originally twice) and
        # fall back to the raw first link text when no "线…站" pattern hits.
        location = response.xpath('//div[@class="rcont"]/a/text()')
        stations = location.re('线(.*?)站')
        item["location"] = stations[0] if stations else location.extract_first()
        # 设施 — the amenity list lives in an inline JS variable.
        item["support"] = re.search(r"var peitao = '(.*?)';",
                                    response.text).group(1)
        item["description"] = response.xpath(
            '//li[@class="font14 fyld"]/div[@class="fyms_con floatl gray3"]'
        ).xpath('string(.)').extract_first()
        # 更新时间以及更新时间戳
        item['update_time'], item['update_timestamp'] = time_standard(
            response.xpath('//div[@class="gray9 fybh-zf"]/span[2]/text()').
            extract_first().replace("更新时间", "").replace(" ", ""))

        item["url"] = response.url
        # 来源渠道
        item["source"] = "房天下"
        # The URL subdomain encodes the city.
        for marker, city in (("sz.zu", '深圳'), ("gz.zu", '广州'),
                             ("sh.zu", '上海'), ("bj.zu", '北京')):
            if marker in response.url:
                item['city'] = city
                break
        yield item
コード例 #4
0
ファイル: anjuke.py プロジェクト: triangle959/JoJoZu_backend
 def parse_item(self, response):
     """Parse an Anjuke (安居客) rental detail page into a ``ScrapyJojozuItem``.

     Anjuke obfuscates digits with a per-page embedded web font; the page
     HTML is first rewritten by decoding that font's cmap before scraping.

     Yields:
         ScrapyJojozuItem: one item per page (skipped entirely when the
         page redirected or when parsing failed before the item existed).
     """
     if response.status == 302:
         return
     item = None
     print('开始加锁')
     # FIX: acquire before the try block so the finally never releases a
     # lock that was never held.
     self.lock.acquire()
     try:
         # The obfuscation font ships as a base64 data URI inside a
         # src:url('...') declaration.  (Raw string: behavior unchanged.)
         font_src = re.search(r"src:url\('(.*?)'\)", response.text).group(1)
         font_face = font_src.split("base64,")
         # 字体文件生成: every page returns a different font, so persist
         # the fresh one before decoding.
         if 'ttf' in font_face[0] or 'woff' in font_face[0]:
             with open('anjuke.ttf', 'wb') as f:
                 f.write(base64.b64decode(font_face[1]))
         # NOTE(review): if the prefix check above fails, this loads a
         # stale anjuke.ttf from a previous page — confirm that is wanted.
         font = TTFont('anjuke.ttf')
         font.saveXML('anjuke.xml')
         # Build entity -> real-digit mapping from the font cmap: the
         # glyph name contains (digit + 1); the key is the unicode code
         # point the page uses as an HTML entity.
         cmap = font['cmap'].getBestCmap()
         digit_pat = re.compile(r'(\d+)')  # hoisted out of the loop
         mapdict = {}
         for code_point in cmap:
             real_digit = int(digit_pat.search(cmap[code_point])[1]) - 1
             entity = '&#x' + hex(code_point)[2:] + ';'
             mapdict[entity] = real_digit
         print(mapdict)
         # Substitute the obfuscated entities, then parse the fixed HTML.
         right_html = response.text
         for entity, digit in mapdict.items():
             right_html = right_html.replace(entity, str(digit))
         soup = BeautifulSoup(right_html, 'lxml')
         item = ScrapyJojozuItem()
         item['title'] = soup.find('h3', attrs={
             'class': 'house-title'
         }).text.replace('\n', "")
         # The basic-info block holds both the layout and the lease type
         # (originally searched twice).
         basic_info = soup.find('div', attrs={'class': 'title-basic-info'})
         item['type'] = basic_info.find_all(
             'span', attrs={'class': 'info-tag'})[1].get_text().replace(
                 '\n', "").replace(' ', "")
         item['lease'] = basic_info.find('li', attrs={'class': 'rent'}).text
         item['image'] = [
             img.get('data-src') for img in soup.find(
                 'div', id="room_pic_wrap", attrs={
                     'class': 'switch_list'
                 }).find_all('img')
         ]
         # The fee row holds both the payment method and the price
         # (originally searched twice).
         fee_row = soup.find('li', attrs={'class': 'full-line'})
         item['payment_method'] = fee_row.find(
             'span', attrs={'class': 'type'}).text
         # Strip the trailing "元/月" unit before converting.
         item['cost'] = int(
             fee_row.find('span', attrs={'class': 'price'}).text[:-3])
         # The deposit is not scraped separately; mirror the payment
         # method (matches the original, including its commented-out
         # alternative).
         item['cash_pledge'] = item['payment_method']
         # Targeted links: first is the district, the rest the location
         # breadcrumb (originally re-queried).
         links = soup.find_all('a', class_='link', attrs={'target': True})
         try:
             item['area'] = links[0].text
         except IndexError:
             # FIX: narrowed from a bare except; only a missing link list
             # is expected here.
             print(response.url)
         item['had_agent'] = 1
         item['service_charge'] = "服务费未知"
         item['agent_cost'] = "中介费未知"
         item['location'] = ",".join(a.text for a in links[1:])
         item['support'] = [
             li.find('div').text for li in soup.find_all(
                 'li', attrs={'class': re.compile('peitao-item(.*)has')})
         ]
         item['description'] = soup.find(
             'div', attrs={'class': 'auto-general'}).text
         item['update_time'], item['update_timestamp'] = time_standard(
             soup.find('div', attrs={
                 'class': "right-info"
             }).find('b').text)
         item['url'] = response.url
         item['source'] = "安居客"
         # The URL host encodes the city; parse it once instead of 4x.
         host = urlparse(response.url)[1]
         for code, city in (('sz', '深圳'), ('gz', '广州'),
                            ('sh', '上海'), ('bj', '北京')):
             if code in host:
                 item['city'] = city
                 break
     except Exception:
         traceback.print_exc()
     finally:
         print("释放锁")
         self.lock.release()
     # FIX: yield moved out of the finally clause — the original raised
     # NameError there whenever the exception fired before `item` was
     # bound.  A partially filled item is still yielded, as before.
     if item is not None:
         yield item