예제 #1
0
    def parse_bobae(self, response):
        """Parse a Bobaedream free-board listing page into ``CommunityItem`` objects.

        One item is yielded per ``<tr itemtype="http://schema.org/Article">``
        row, carrying ``source``, ``category``, ``title``, ``url``, ``date``
        (formatted ``%Y-%m-%d %H:%M:%S``) and ``hits``.

        The date cell holds either ``HH:MM`` (combined with today's date) or a
        year-less ``MM/DD`` value (combined with the current year at midnight).
        A ``MM/DD`` value that would fall after today is moved back one year,
        so posts from last December crawled in January are not dated in the
        future.

        Raises:
            IndexError: if a row lacks one of the expected cells.
            ValueError: if the date or hit-count text cannot be parsed.
        """
        # Loop invariants: compile the pattern and read the clock once per page.
        time_only = re.compile('[0-9]{2}:[0-9]{2}')
        now = datetime.now()

        rows = response.xpath(
            '//tbody/tr[@itemtype="http://schema.org/Article"]')
        for index, sel in enumerate(rows, 1):
            item = CommunityItem()

            # Strip padding so surrounding whitespace cannot defeat the
            # anchored match or the strict strptime format below.
            date_text = sel.xpath(
                'td[@class="date"]/text()').extract()[0].strip()
            if time_only.match(date_text):
                # Time only: combine it with today's date.
                posted = datetime.strptime(
                    now.strftime('%y/%m/%d') + ' ' + date_text + ':00',
                    "%y/%m/%d %H:%M:%S")
            else:
                posted = datetime.strptime(
                    now.strftime('%y/') + date_text + ' 00:00:00',
                    "%y/%m/%d %H:%M:%S")
                if posted > now:
                    # A month/day later than today belongs to the previous year.
                    try:
                        posted = posted.replace(year=posted.year - 1)
                    except ValueError:
                        # Feb 29 has no counterpart in the previous year;
                        # keep the date as parsed.
                        pass

            item['source'] = '보배드림'
            item['category'] = 'free'
            title = sel.xpath('td[@class="pl14"]/a/text()').extract()[0]
            item['title'] = title.encode('utf-8')
            item['url'] = "http://www.bobaedream.co.kr" + sel.xpath(
                'td[@class="pl14"]/a/@href').extract()[0]
            item['date'] = posted.strftime("%Y-%m-%d %H:%M:%S")
            item['hits'] = int(
                sel.xpath('td[@class="count"]/text()').extract()[0])

            # Progress output; single-argument print() behaves the same on
            # Python 2 and Python 3.
            print('-' * 50)
            print(item['title'])
            print('보배드림' + str(index) + "번째 글 크롤링 완료")

            yield item
예제 #2
0
    def parse(self, response):
        """Parse a Lianjia community (xiaoqu) listing page.

        Yields one ``CommunityItem`` per ``<li>`` entry with ``uid``,
        ``name``, ``area``, ``district``, ``subway``, ``station``, ``time``,
        ``count``, ``price`` and ``volume``. Any field whose selector or
        pattern finds nothing is set to ``''``.

        After the items, if the ``page-data`` JSON on ``.page-box`` reports
        ``curPage < totalPage``, a ``scrapy.Request`` for the next page is
        yielded with this method as its callback.
        """
        # Loop invariant. Captures "<digits>号线<rest>"; the digit group may
        # be empty.
        subway_pattern = r'\D*(\d*)%s(.*)' % u'号线'

        for sel in response.xpath('/html/body/div[4]/div[1]/ul/li'):
            item = CommunityItem()

            uid = sel.xpath('a/@href').re(r"/xiaoqu/(\d+)/")
            item['uid'] = uid[0] if uid else ''

            name = sel.css('div.title > a::text').extract()
            item['name'] = name[0] if name else ''

            area = sel.css('div.positionInfo > a.district::text').extract()
            item['area'] = area[0] if area else ''

            district = sel.css(
                'div.positionInfo > a.bizcircle::text').extract()
            item['district'] = district[0] if district else ''

            # With more than one tag the second span is searched; otherwise
            # whatever span text is present.
            if len(sel.css('div.tagList > span')) > 1:
                subway = sel.css(
                    'div.tagList > span:nth-child(2)::text').re(subway_pattern)
            else:
                subway = sel.css('div.tagList > span::text').re(subway_pattern)
            # A successful match always contributes both groups.
            item['subway'] = subway[0] if subway else ''
            item['station'] = subway[1] if subway else ''

            # The value is read from the second text node of .positionInfo.
            position_texts = sel.css('.positionInfo::text')
            if len(position_texts) > 1:
                minutes = position_texts[1].re(r'(\d+).*')
            else:
                minutes = []
            item['time'] = minutes[0] if minutes else ''

            count = sel.css('div.sellCount > a > span::text').extract()
            item['count'] = count[0] if count else ''

            price = sel.css('div.totalPrice > span::text').extract()
            item['price'] = price[0] if price else ''

            # '90.*(\d+)' is greedy and leaves only the final digit for the
            # group; \D* keeps the whole number that follows "90".
            house_links = sel.css('div.houseInfo > a::text')
            volume = house_links[0].re(r'90\D*(\d+)') if house_links else []
            item['volume'] = volume[0] if volume else ''

            yield item

        # Next page.
        page_data = response.css('.page-box::attr(page-data)').extract()
        if not page_data:
            return
        page = json.loads(page_data[0])

        if page['curPage'] < page['totalPage']:
            next_page = page['curPage'] + 1
            # Listing URLs look like .../xiaoqu/dongcheng/pg<N>. Rewrite only
            # that token instead of every digit run in the URL.
            page_token = re.compile(r'pg\d+')
            if page_token.search(response.url):
                url = page_token.sub(
                    'pg' + str(next_page), response.url, count=1)
            else:
                # No page token yet: append one.
                # NOTE(review): assumes the URL carries no query string.
                url = response.url.rstrip('/') + '/pg' + str(next_page)
            yield scrapy.Request(url, callback=self.parse)
예제 #3
0
    def parse_pann(self, response):
        """Yield a ``CommunityItem`` for each table row that has a linked title.

        Every item gets ``source="pann"``, ``category="20s"`` and the text of
        the first ``td/a`` node as ``title``. Rows with no such text are
        skipped.
        """
        # '///tbody/tr' is not valid XPath 1.0 grammar; '//tbody/tr' is the
        # well-formed spelling of the same descendant search.
        for sel in response.xpath('//tbody/tr'):
            titles = sel.xpath('td/a/text()').extract()
            if not titles:
                # Indexing an empty result would raise IndexError and abort
                # the remaining rows of the page.
                continue

            item = CommunityItem()
            item['source'] = "pann"
            item['category'] = "20s"
            item['title'] = titles[0]

            # Single-argument print() behaves the same on Python 2 and 3.
            print('-' * 50)
            print(item['title'])

            yield item
예제 #4
0
    def parse_bobae(self, response):
        """Yield a ``CommunityItem`` per article row of a Bobaedream listing.

        Each ``<tr itemtype="http://schema.org/Article">`` row produces an
        item with ``source``, ``category``, ``title`` and an absolute ``url``.

        Raises:
            IndexError: if a row has no ``td.pl14 > a`` title link.
        """
        rows = response.xpath(
            '//tbody/tr[@itemtype="http://schema.org/Article"]')
        for sel in rows:
            item = CommunityItem()
            item['source'] = "bobae"
            item['category'] = "free"
            item['title'] = sel.xpath(
                'td[@class="pl14"]/a/text()').extract()[0]
            # The host was previously misspelled "bobaedrea.co.kr", which
            # made every generated link point at the wrong domain.
            item['url'] = 'http://www.bobaedream.co.kr' + sel.xpath(
                'td[@class="pl14"]/a/@href').extract()[0]
            # TODO: item['date'] is not populated by this callback.

            # Single-argument print() behaves the same on Python 2 and 3.
            print('=' * 50)
            print(item['title'])

            yield item
예제 #5
0
     def pasrse_clien(self, response):
          """Build a ``CommunityItem`` from the ``#div_content`` block of a Clien page.

          Sets ``source``, ``category``, ``title``, ``url`` and ``date``.
          ``date`` is the stripped text of the first matching node, or ``''``
          when nothing matches.

          NOTE(review): the method name looks like a typo of ``parse_clien``;
          it is kept unchanged so existing references keep working.

          Raises:
               IndexError: if the title or link node is missing.
          """
          # All lookups are absolute paths under the eighth child div.
          entry = '//*[@id="div_content"]/div[8]'

          for sel in response.xpath('//*[@id="div_content"]'):
               item = CommunityItem()

               # The key was misspelled 'souce'; the other fields and sibling
               # spiders use 'source'.
               item['source'] = "클리앙"
               item['category'] = "free"
               # Select the text node, not the <span> element, so the title
               # does not carry HTML markup.
               item['title'] = sel.xpath(
                    entry + '/div[2]/a[1]/span/text()').extract()[0]
               # The href was sliced with [:2], keeping only its first two
               # characters; the whole href is needed for a usable link.
               item['url'] = "https://www.clien.net" + sel.xpath(
                    entry + '/div[2]/a[1]/@href').extract()[0]
               # Store extracted text rather than the SelectorList itself.
               date_texts = sel.xpath(
                    entry + '/div[5]/span/span/text()').extract()
               item['date'] = date_texts[0].strip() if date_texts else ''

               print('=' * 50)
               print(item['title'])

               yield item
예제 #6
0
파일: main.py 프로젝트: chaoliu/insights
    def parse(self, response):
        """Parse a Lianjia community (xiaoqu) listing page.

        Yields one ``CommunityItem`` per ``#house-lst > li`` entry with
        ``uid``, ``name``, ``area``, ``district``, ``subway``, ``station``,
        ``time``, ``buildingType``, ``count``, ``price`` and ``volume``. Any
        field whose selector or pattern finds nothing is set to ``''``.

        After the items, if the ``page-data`` JSON on ``.page-box`` reports
        ``curPage < totalPage``, a ``scrapy.Request`` for the next page is
        yielded with this method as its callback.
        """
        # Loop invariant. Captures "<digits>号线<rest>"; the digit group may
        # be empty.
        subway_pattern = r'\D*(\d*)%s(.*)' % u'号线'

        for sel in response.xpath('//*[@id="house-lst"]/li'):
            item = CommunityItem()

            uid = sel.xpath('div[2]/h2/a/@href').re(r"/xiaoqu/(\d+)/")
            item['uid'] = uid[0] if uid else ''

            name = sel.xpath('div[2]/h2/a/text()').extract()
            item['name'] = name[0] if name else ''

            area = sel.xpath(
                'div[2]/div[1]/div[2]/div/a[1]/text()').extract()
            item['area'] = area[0] if area else ''

            district = sel.xpath(
                'div[2]/div[1]/div[2]/div/a[2]/text()').extract()
            item['district'] = district[0] if district else ''

            subway = sel.css('.fang-subway-ex span::text').re(subway_pattern)
            # A successful match always contributes both groups.
            item['subway'] = subway[0] if subway else ''
            item['station'] = subway[1] if subway else ''

            # The first .con text node is the building type; the number in
            # the second one is stored as 'time'.
            con_texts = sel.css('.con::text')
            if len(con_texts) > 1:
                minutes = con_texts[1].re(r'(\d+).*')
            else:
                minutes = []
            item['time'] = minutes[0] if minutes else ''
            item['buildingType'] = con_texts[0].extract() if con_texts else ''

            count = sel.css('.square .num::text').extract()
            item['count'] = count[0] if count else ''

            price = sel.css('.price .num::text').extract()
            item['price'] = price[0] if price else ''

            # '30.*(\d+)' is greedy and leaves only the final digit for the
            # group; \D* keeps the whole number that follows "30".
            volume_nodes = sel.css('.laisuzhou::text')
            volume = volume_nodes[0].re(r'30\D*(\d+)') if volume_nodes else []
            item['volume'] = volume[0] if volume else ''

            yield item

        # Next page.
        page_data = response.css('.page-box::attr(page-data)').extract()
        if not page_data:
            return
        page = json.loads(page_data[0])

        # Strictly less-than: with '<=' the last page would still schedule a
        # request for page totalPage + 1, which does not exist.
        if page['curPage'] < page['totalPage']:
            next_page = page['curPage'] + 1
            # Listing URLs look like .../xiaoqu/dongcheng/pg<N>. Rewrite only
            # that token instead of every digit run in the URL.
            page_token = re.compile(r'pg\d+')
            if page_token.search(response.url):
                url = page_token.sub(
                    'pg' + str(next_page), response.url, count=1)
            else:
                # No page token yet: append one.
                # NOTE(review): assumes the URL carries no query string.
                url = response.url.rstrip('/') + '/pg' + str(next_page)
            yield scrapy.Request(url, callback=self.parse)
예제 #7
0
    def parse_clien(self, response):
        """Yield a ``CommunityItem`` per ``<tr class="mytr">`` row of a Clien board.

        Each item carries ``source``, ``category``, ``title``, ``url``,
        ``date`` (formatted ``%Y-%m-%d %H:%M:%S``, read from the ``title``
        attribute of the row's ``td/span``) and ``hits`` (the integer text of
        the fifth cell).

        Raises:
            IndexError: if a row lacks one of the expected cells.
            ValueError: if the timestamp or hit count cannot be parsed.
        """
        for sel in response.xpath('//tbody/tr[@class="mytr"]'):
            item = CommunityItem()

            item['source'] = '클량'
            item['category'] = 'free'
            item['title'] = sel.xpath(
                'td[@class="post_subject"]/a/text()').extract()[0]
            # The first two characters of the href are dropped before it is
            # appended to the /cs2 base.
            href = sel.xpath('td[@class="post_subject"]/a/@href').extract()[0]
            item['url'] = 'http://www.clien.net/cs2' + href[2:]

            posted = datetime.strptime(
                sel.xpath('td/span/@title').extract()[0], "%Y-%m-%d %H:%M:%S")
            item['date'] = posted.strftime("%Y-%m-%d %H:%M:%S")

            cells = sel.xpath('td')
            item['hits'] = int(cells[4].xpath('text()').extract()[0])

            # No time.sleep() here: a blocking sleep inside a Scrapy callback
            # stalls the whole reactor for every row and does not throttle
            # any request. Use the DOWNLOAD_DELAY / AutoThrottle settings to
            # slow the crawl instead.
            yield item
예제 #8
0
    def parse_clien(self, response):
        """Turn each post row of a Clien listing page into a ``CommunityItem``.

        For every node matched by the fixed row XPath the item receives
        ``source``, ``category``, the UTF-8 encoded ``title``, an absolute
        ``url``, a ``date`` normalised to ``%Y-%m-%d %H:%M:%S`` and ``hits``
        as stripped text. A progress message is printed per row.
        """
        subject_link = 'div/div[@class="list-title"]/a[@class="list-subject"]'
        timestamp_text = (
            'div/div[@class="list-time"]/span/span[@class="timestamp"]/text()')

        rows = response.xpath('//html/body/div[2]/div/div[1]/div[5]/div/div')
        for index, row in enumerate(rows, 1):
            item = CommunityItem()
            item['source'] = '클리앙'
            item['category'] = 'free'

            raw_title = row.xpath(subject_link + '/text()').extract()[0]
            item['title'] = raw_title.strip().encode('utf-8')

            href = row.xpath(subject_link + '/@href').extract()[0]
            item['url'] = 'http://www.clien.net' + href

            # Fetch the date and re-emit it in the canonical format.
            stamp = row.xpath(timestamp_text).extract()[0]
            item['date'] = datetime.strptime(
                stamp, "%Y-%m-%d %H:%M:%S").strftime("%Y-%m-%d %H:%M:%S")

            # Fetch the hit count; it is stored as text, not converted.
            item['hits'] = row.xpath(
                'div/div[2]/span/text()').extract()[0].strip()

            print('=' * 50)
            print(item['title'])
            print("클리앙 " + str(index) + "번째 글 크롤링 완료")

            yield item