def parse_bobae(self, response): index = 0 for sel in response.xpath( '//tbody/tr[@itemtype="http://schema.org/Article"]'): item = CommunityItem() date_now = datetime.now() date_str_tmp = sel.xpath('td[@class="date"]/text()').extract()[0] prog = re.compile('[0-9]{2}:[0-9]{2}') if prog.match(date_str_tmp): date_str = date_now.strftime( '%y/%m/%d') + ' ' + date_str_tmp + ':00' else: date_str = date_now.strftime( '%y/') + date_str_tmp + ' ' + '00:00:00' dateTmp = datetime.strptime(date_str, "%y/%m/%d %H:%M:%S") item['source'] = '보배드림' item['category'] = 'free' title = sel.xpath('td[@class="pl14"]/a/text()').extract()[0] item['title'] = title.encode('utf-8') item['url'] = "http://www.bobaedream.co.kr" + sel.xpath( 'td[@class="pl14"]/a/@href').extract()[0] item['date'] = dateTmp.strftime("%Y-%m-%d %H:%M:%S") item['hits'] = int( sel.xpath('td[@class="count"]/text()').extract()[0]) index += 1 print '-' * 50 print item['title'] print '보배드림' + str(index) + "번째 글 크롤링 완료" yield item
def parse(self, response):
    """Parse a Lianjia xiaoqu (community) listing page, yield one item per
    entry, then follow pagination by rewriting the page number in the URL.

    NOTE(review): Python 2 code ('号线'.decode('utf-8')). The subway regex
    captures the digits before the literal "号线" (subway line number) and
    the text after it (presumably the station name — confirm against pages).
    """
    for sel in response.xpath('/html/body/div[4]/div[1]/ul/li'):
        item = CommunityItem()
        # Community id parsed from the detail-page URL, e.g. /xiaoqu/123456/
        uid = sel.xpath('a/@href').re("/xiaoqu/(\d+)/")
        item['uid'] = '' if len(uid) == 0 else uid[0]
        name = sel.css('div.title > a::text').extract()
        item['name'] = '' if len(name) == 0 else name[0]
        item['area'] = sel.css(
            'div.positionInfo > a.district::text').extract()[0]
        item['district'] = sel.css(
            'div.positionInfo > a.bizcircle::text').extract()[0]
        #
        # When the tag list has more than one span, the subway tag is the
        # second one; otherwise scan all spans for the "号线" pattern.
        if len(sel.css('div.tagList > span')) > 1:
            subway = sel.css('div.tagList > span:nth-child(2)::text').re(
                '\D*(\d*)%s(.*)' % '号线'.decode('utf-8'))
        else:
            subway = sel.css('div.tagList > span::text').re(
                '\D*(\d*)%s(.*)' % '号线'.decode('utf-8'))
        # school ?
        #
        item['subway'] = '' if len(subway) == 0 else subway[0]
        item['station'] = '' if len(subway) == 0 else subway[1]
        # time = sel.css('.positionInfo::text')[1].re('(\d+).*')
        # item['time'] = '' if len(time) == 0 else time[0]
        # item['buildingType'] = sel.css('.con::text')[0].extract()
        #
        #
        #
        item['count'] = sel.css(
            'div.sellCount > a > span::text').extract()[0]
        # price = sel.css('div.totalPrice > span::text').extract()
        # item['price'] = '' if len(price) == 0 else price[0]
        item['volume'] = sel.css('div.houseInfo > a::text')[0].re(
            '90.*(\d+).*')[0]
        yield item
    # next page
    page = response.css('.page-box::attr(page-data)').extract()[0]
    page = json.loads(page)
    if page['curPage'] < page['totalPage']:
        next = page['curPage'] + 1
        # Replace the first run of digits in the URL with the next page number.
        replace_reg = re.compile(r'\d+')
        url = replace_reg.sub(str(next), response.url)
        # url = 'http://bj.lianjia.com/xiaoqu/dongcheng/pg' + str(next)
        # print url
        yield scrapy.Request(url, callback=self.parse)
def parse_pann(self, response): for sel in response.xpath('///tbody/tr'): item = CommunityItem() item['source'] = "pann" item['category'] = "20s" item['title'] = sel.xpath('td/a/text()').extract()[0] print '-'*50 print item['title'] yield item
def parse_bobae(self, response): for sel in response.xpath('//tbody/tr[@itemtype="http://schema.org/Article"]'): item = CommunityItem() item['source'] = "bobae" item['category'] = "free" item['title'] = sel.xpath('td[@class="pl14"]/a/text()').extract()[0] item['url'] = 'http://www.bobaedrea.co.kr' + sel.xpath('td[@class="pl14"]/a/@href').extract()[0] # dateTmp = datetime.strptime(sel.xpath('div[@class="list_time"]/span/span[@class="timestamp"]').extract()[0], "%Y-%m-%d %H:%M:%S") # item['date'] = dateTmp.strftime("%Y-%m-%d %H:%M:%S") print '='*50 print item['title'] yield item
def pasrse_clien(self, response):
    """Crawl the Clien free board (div_content layout) and yield title/url items.

    NOTE(review): the method name misspells 'parse'; it is kept as-is because
    callers may reference it by this exact name (e.g. callback=self.pasrse_clien).

    Bug fixes:
    - item key 'souce' -> 'source': every other parser in this file writes
      item['source'], and the misspelled key is not a CommunityItem field.
    - the href was sliced with [:2], keeping only its first two characters;
      the full href must be appended to the host, as the other Clien parser
      in this file does.
    """
    for sel in response.xpath('//*[@id="div_content"]'):
        item = CommunityItem()
        item['source'] = "클리앙"
        item['category'] = "free"
        item['title'] = sel.xpath(
            '//*[@id="div_content"]/div[8]/div[2]/a[1]/span').extract()[0]
        item['url'] = "https://www.clien.net" + sel.xpath(
            '//*[@id="div_content"]/div[8]/div[2]/a[1]/@href').extract()[0]
        # NOTE(review): this stores a SelectorList, not text — it probably
        # needs .extract(); left unchanged pending confirmation of the
        # item schema and page markup.
        item['date'] = sel.xpath('//*[@id="div_content"]/div[8]/div[5]/span/span')
        # dateTmp = datetime.strptime(sel.xpath('td/span/@title').extract()[0], "%Y-%n-%d %H:%M:%S ")
        # item['date'] = dateTmp.strftime("Y% - %m - %d %H:%M:%S")
        print ('=' * 50)
        print (item['title'])
        yield item
def parse(self, response):
    """Parse a Lianjia house-listing page, yield one item per entry, then
    follow pagination by rewriting the page number in the current URL.

    Bug fix: the pagination guard used `curPage <= totalPage`, which issues a
    request for page totalPage + 1 after the final page. The sibling parse()
    in this file uses `<`, which stops at the final page; this is aligned
    with it. Also renamed the local `next` so it no longer shadows the
    builtin.
    """
    for sel in response.xpath('//*[@id="house-lst"]/li'):
        item = CommunityItem()
        # Community id parsed from the detail-page URL, e.g. /xiaoqu/123456/
        uid = sel.xpath('div[2]/h2/a/@href').re("/xiaoqu/(\d+)/")
        item['uid'] = '' if len(uid) == 0 else uid[0]
        name = sel.xpath('div[2]/h2/a/text()').extract()
        item['name'] = '' if len(name) == 0 else name[0]
        item['area'] = sel.xpath(
            'div[2]/div[1]/div[2]/div/a[1]/text()').extract()[0]
        item['district'] = sel.xpath(
            'div[2]/div[1]/div[2]/div/a[2]/text()').extract()[0]
        # Captures the digits before the literal "号线" (subway line number)
        # and the text after it (presumably the station — confirm on pages).
        subway = sel.css('.fang-subway-ex span::text').re(
            '\D*(\d*)%s(.*)' % '号线'.decode('utf-8'))
        item['subway'] = '' if len(subway) == 0 else subway[0]
        item['station'] = '' if len(subway) == 0 else subway[1]
        time = sel.css('.con::text')[1].re('(\d+).*')
        item['time'] = '' if len(time) == 0 else time[0]
        item['buildingType'] = sel.css('.con::text')[0].extract()
        item['count'] = sel.css('.square .num::text').extract()[0]
        price = sel.css('.price .num::text').extract()
        item['price'] = '' if len(price) == 0 else price[0]
        item['volume'] = sel.css('.laisuzhou::text')[0].re('30.*(\d+).*')[0]
        yield item
    # next page: swap the first run of digits in the URL for the next page no.
    page = json.loads(response.css('.page-box::attr(page-data)').extract()[0])
    if page['curPage'] < page['totalPage']:
        next_page = page['curPage'] + 1
        replace_reg = re.compile(r'\d+')
        url = replace_reg.sub(str(next_page), response.url)
        yield scrapy.Request(url, callback=self.parse)
def parse_clien(self, response):
    """Yield one CommunityItem per row of a Clien 'mytr' board listing."""
    base_url = 'http://www.clien.net/cs2'
    for row in response.xpath('//tbody/tr[@class="mytr"]'):
        subject_link = row.xpath('td[@class="post_subject"]/a')
        posted_at = datetime.strptime(
            row.xpath('td/span/@title').extract()[0], "%Y-%m-%d %H:%M:%S")
        item = CommunityItem()
        item['source'] = '클량'
        item['category'] = 'free'
        item['title'] = subject_link.xpath('text()').extract()[0]
        # Drop the first two characters of the href before joining with the
        # board base URL (the raw href carries a relative prefix).
        item['url'] = base_url + subject_link.xpath('@href').extract()[0][2:]
        item['date'] = posted_at.strftime("%Y-%m-%d %H:%M:%S")
        # Hit count lives in the fifth <td> of the row.
        item['hits'] = int(row.xpath('td')[4].xpath('text()').extract()[0])
        #print '='*50
        #print item['title']
        # Fixed delay between rows — NOTE(review): presumably throttling;
        # a blocking sleep in a Scrapy callback stalls the whole crawler.
        time.sleep(5)
        yield item
def parse_clien(self, response): index = 0 for sel in response.xpath( '//html/body/div[2]/div/div[1]/div[5]/div/div'): item = CommunityItem() item['source'] = '클리앙' item['category'] = 'free' title = sel.xpath( 'div/div[@class="list-title"]/a[@class="list-subject"]/text()' ).extract()[0].strip() item['title'] = title.encode('utf-8') #item['title'] = sel.xpath('td[@class="post_subject"]/a/text()').extract_first() item['url'] = 'http://www.clien.net' + sel.xpath( 'div/div[@class="list-title"]/a[@class="list-subject"]/@href' ).extract()[0] #날짜 들고오기 dateTmp = datetime.strptime( sel.xpath( 'div/div[@class="list-time"]/span/span[@class="timestamp"]/text()' ).extract()[0], "%Y-%m-%d %H:%M:%S") item['date'] = dateTmp.strftime("%Y-%m-%d %H:%M:%S") #조회수 들고오기 hits = sel.xpath('div/div[2]/span/text()').extract()[0].strip() item['hits'] = hits #item['hits'] = int(td[4].xpath('text()').extract()[0]) index = index + 1 print '=' * 50 print item['title'] print "클리앙 " + str(index) + "번째 글 크롤링 완료" #print str(unicode('한글','euc-kr').encode('euc-kr')) yield item