class TroryeActivitySpider(BaseSpider):
    """Spider for activity detail pages hosted on u.tourye.com.

    ``parse_item`` converts one activity page into a ``TaoyeItem``.
    """

    name = 'ta'
    # Host names only: an entry with a trailing slash can never equal a
    # request's hostname, so it would not whitelist anything.
    allowed_domains = ['u.tourye.com']
    start_urls = get_start_urls('www.tourye.com')

    # NOTE(review): the callback is named ``parse_item`` rather than
    # ``parse`` -- confirm that requests are routed to it explicitly.
    def parse_item(self, response):
        """Build a ``TaoyeItem`` from an activity detail page.

        Args:
            response: the downloaded activity page.

        Returns:
            A populated ``TaoyeItem``.

        Raises:
            IndexError: if a mandatory node (title, picture, leader, type,
                destination, schedule, contact, counters) is missing.
            ValueError: if the schedule text does not match
                ``%Y.%m.%d %H:%M``.
        """
        hxs = HtmlXPathSelector(response)
        site_root = 'http://u.tourye.com/'
        time_format = '%Y.%m.%d %H:%M'

        def first(xpath):
            # First match of an XPath evaluated against the whole page.
            return hxs.select(xpath).extract()[0]

        title = first('//*[@id="mainarea"]/h2/a[2]/text()')
        picurl = site_root + first(
            '//*[@id="content"]/div[1]/div/div[1]/a/img/@src')
        leader_name = first(
            '//*[@id="content"]/div[1]/div/div[2]/dl/dd/a/text()')
        leader_url = site_root + first(
            '//*[@id="content"]/div[1]/div/div[2]/dl/dd/a/@href')

        info = hxs.select('//*[@id="content"]/div[1]/div/div[2]/dl/dd')

        def get_info(index):
            # First text node of the ``index``-th <dd> (0-based).
            return info[index].select('text()').extract()[0]

        # The quota row is optional: read it once and fall back to the
        # defaults (0 remaining, 9999 maximum) when it is absent or malformed
        # instead of failing on the second, previously unguarded, lookup.
        try:
            quota_fields = get_info(6).split()
        except IndexError:
            quota_fields = []
        remainnumber = quota_fields[2] if len(quota_fields) > 2 else 0
        try:
            eventnumber = int(quota_fields[0])
        except (IndexError, ValueError):
            eventnumber = 9999

        # Schedule text is "<start> - <end>"; a single timestamp serves as
        # both start and end.
        period = get_info(3).split('-')

        item = TaoyeItem()
        item['activity_link'] = response.url
        item['imgurl'] = picurl
        item['subject'] = title
        item['leaderuname'] = leader_name
        item['leaderurl'] = leader_url
        item['activitytype'] = get_info(1)
        item['destplace'] = get_info(2).split()[-1]
        item['starttimefrom'] = time.mktime(
            time.strptime(period[0].strip(), time_format))
        item['starttimeto'] = time.mktime(
            time.strptime(period[-1].strip(), time_format))
        item['contact'] = first(
            '//*[@id="content"]/div[1]/div/div[2]/dl/dd[6]/strong/text()')
        item['max_apply'] = eventnumber
        item['remainnumber'] = remainnumber
        item['views_number'] = first(
            '//*[@id="content"]/div[1]/div/div[2]/ul[1]/li[1]/text()'
        ).split()[0]
        item['applynumber'] = first(
            '//*[@id="content"]/div[1]/div/div[2]/ul[1]/li[2]/text()'
        ).split()[0]
        item['follow_number'] = first(
            '//*[@id="content"]/div[1]/div/div[2]/ul[1]/li[3]/text()'
        ).split()[0]
        # The event detail block (//*[@id="content"]/div[2]/div) is currently
        # not collected.
        return item
class Huwai517ActivitySpider(BaseSpider):
    """Spider stub for www.517huwai.com activity pages.

    ``parse`` does not extract anything yet: it prints the URL and returns
    ``None``.
    """

    name = 'ha'
    # Was the truncated 'www.517huwai.', which matches no hostname; use the
    # same host the start URLs are looked up with.
    allowed_domains = ['www.517huwai.com']
    start_urls = get_start_urls('www.517huwai.com')
    pipeline = set([
        pipelines.DetailFilterPipeline,
        pipelines.DetailSavePipeline,
    ])

    def parse(self, response):
        """Print the URL of *response*; no item is returned.

        Args:
            response: the downloaded page.
        """
        print(response.url)
        # Placeholder: the item is created but never populated or returned.
        item = ActivityItem()
class LvzhouActivitySpider(BaseSpider):
    """Spider stub for www.lvzhou.info; every page yields an empty item."""

    name = 'la'
    allowed_domains = ['www.lvzhou.info']
    start_urls = get_start_urls('www.lvzhou.info')
    pipeline = set((
        pipelines.DetailFilterPipeline,
        pipelines.DetailSavePipeline,
    ))

    def parse(self, response):
        """Return an unpopulated ``ActivityItem`` for *response*.

        Args:
            response: the downloaded page.

        Returns:
            A new ``ActivityItem`` with no fields set.
        """
        # The selector is built but not queried yet.
        selector = HtmlXPathSelector(response)
        return ActivityItem()
class ChinaWalkingActivitySpider(BaseSpider):
    """Spider for activity pages on www.chinawalking.net.cn."""

    name = 'cw'
    allowed_domains = ['www.chinawalking.net.cn']
    start_urls = get_start_urls('www.chinawalking.net.cn')
    pipeline = set((
        pipelines.DetailFilterPipeline,
        pipelines.DetailSavePipeline,
    ))

    def parse(self, response):
        """Build an ``ActivityItem`` from the ``nr_zuo`` definition list.

        Args:
            response: the downloaded activity page.

        Returns:
            An ``ActivityItem`` with the source site, link, subject, leader,
            contact and destination filled in, plus placeholder start times.

        Raises:
            IndexError: if the title or one of the <dd> rows read here is
                missing or has no text node.
        """
        selector = HtmlXPathSelector(response)
        heading = selector.select(
            '//div[@id="nr_zuo"]/dl/dt/text()').extract()[0]
        rows = selector.select('//div[@id="nr_zuo"]/dl/dd')

        def row_text(index):
            # First text node of the ``index``-th <dd> (0-based).
            return rows[index].select('text()').extract()[0]

        item = ActivityItem()
        item['source_site'] = 'http://www.chinawalking.net.cn'
        item['activity_link'] = response.url
        item['subject'] = heading
        item['leaderuname'] = row_text(0)
        item['contact'] = row_text(1)
        item['destplace'] = row_text(4)
        # Row 5 holds the schedule but is not parsed; fixed placeholders are
        # stored instead.
        item['starttimefrom'] = 0
        item['starttimeto'] = 1
        # Rows not mapped to item fields yet: 2 ways, 3 activity_strength,
        # 6 gather_location, 7 fee, 8 people_limit, 9 expired_time,
        # 10 activity_type, 11 commercial_type.
        return item
class Www8264ActivitySpider(BaseSpider):
    """Spider that turns www.8264.com activity threads into ``LvyeItem``s."""

    name = '8a'
    allowed_domains = ['www.8264.com', 'u.8264.com']
    start_urls = get_start_urls('www.8264.com')

    # Compiled once at class creation instead of once per matching row.
    _NUMBER_RE = re.compile('[0-9]+')
    _DATETIME_RE = re.compile('[0-9]+-[0-9]+-[0-9]+ [0-9]+:[0-9]+')
    _DATETIME_FORMAT = '%Y-%m-%d %H:%M'

    def parse(self, response):
        """Extract one activity from a thread page.

        Args:
            response: the downloaded thread page.

        Returns:
            A populated ``LvyeItem``, or ``None`` when the page shows the
            "deleted" marker, has no subject, reports a start time containing
            u'前' (logged as already begun), or has no parsable start time.

        Raises:
            IndexError: if a recognised <dt> label has no matching <dd> or
                the <dd> lacks the expected text node.
        """
        hxs = HtmlXPathSelector(response)

        deleted_marker = hxs.select(
            '//*[@id="postlist"]/div[1]/table/tr[1]/td[2]/div[3]/div[3]/div/em'
        ).extract()
        if deleted_marker:
            print('the activity is deleted :' + response.url)
            log.msg('the activity is deleted :' + response.url, log.WARNING)
            return

        subjects = hxs.select('//*[@id="thread_subject"]/text()').extract()
        if not subjects:
            print('no subject for activity :' + response.url)
            log.msg('no subject for activity :' + response.url, log.WARNING)
            return

        lvyeItem = LvyeItem()
        lvyeItem['subject'] = subjects[0]
        lvyeItem['activity_link'] = response.url

        images = hxs.select(
            '//*[@id="postlist"]/div[1]/table/tr[1]/td[2]/div[3]/div[3]'
            '/div[1]/div[1]/div[1]/a/img/@src').extract()
        if images:
            lvyeItem['imgurl'] = images[0]
        else:
            lvyeItem['imgurl'] = ''
            log.msg('the activity no photo :' + response.url, log.INFO)

        # <dt> labels and <dd> values are read as two parallel lists and
        # paired by position.
        labels = hxs.select(
            '//*[@id="postlist"]/div[1]/table/tr[1]/td[2]/div[3]/div[3]'
            '/div[1]/div[1]/div[2]//dt')
        values = hxs.select(
            '//*[@id="ct"]/div[2]/div[1]/table/tr[1]/td[2]/div[3]/div[3]'
            '/div[1]/div[1]/div[2]//dd')

        hdtime = ''
        for index, label in enumerate(labels):
            label_texts = label.select('text()').extract()
            if not label_texts:
                continue
            label_text = label_texts[0]

            def value(xpath, index=index):
                # First match of ``xpath`` inside the <dd> paired with
                # this label.
                return values[index].select(xpath).extract()[0]

            if u'活动类型' in label_text:
                lvyeItem['activitytype'] = value('strong/text()')
            if u'开始时间' in label_text:
                hdtime = value('text()')
            if u'活动地点' in label_text:
                lvyeItem['destplace'] = value('text()')
            if u'已报名人数' in label_text:
                lvyeItem['applynumber'] = value('em/text()')
            if u'剩余名额' in label_text:
                # Optional field: skip it when the text holds no number
                # rather than aborting the whole item.
                remaining = self._NUMBER_RE.search(value('text()'))
                if remaining:
                    lvyeItem['remainnumber'] = remaining.group(0)
            if u'报名截止' in label_text:
                # Optional field: a malformed deadline is logged and dropped.
                try:
                    lvyeItem['expiration'] = time.mktime(time.strptime(
                        value('text()'), self._DATETIME_FORMAT))
                except (ValueError, OverflowError):
                    log.msg('The Activity expiration formate is error :'
                            + response.url, log.WARNING)
            if u'每人花销' in label_text:
                price = self._NUMBER_RE.search(value('text()'))
                if price:
                    lvyeItem['price'] = price.group(0)

        if u'前' in hdtime:
            print('Activities have already begun :' + response.url)
            log.msg('Activities have already begun :' + response.url,
                    log.WARNING)
            return

        times = self._DATETIME_RE.findall(hdtime)
        if not times:
            return
        try:
            lvyeItem['starttimefrom'] = time.mktime(
                time.strptime(times[0], self._DATETIME_FORMAT))
            # 'starttimeto' is only set when an end timestamp is present.
            if len(times) > 1:
                lvyeItem['starttimeto'] = time.mktime(
                    time.strptime(times[1], self._DATETIME_FORMAT))
        except (ValueError, OverflowError):
            print('the date formate is error :' + response.url)
            log.msg('The Activity date formate is error :' + response.url,
                    log.ERROR)
            return

        lvyeItem['source_site'] = u'户外资料网'
        return lvyeItem

    def err_info(self, response):
        """Record the URL of *response* as 'false' in ``self.linkDB``."""
        self.linkDB.update_info_url(response.url, 'false')
class YouxiakeActivitySpider(BaseSpider):
    """Spider that turns xia.youxiake.com activity pages into ``LvyeItem``s."""

    name = 'ya'
    allowed_domains = ['xia.youxiake.com']
    start_urls = get_start_urls('xia.youxiake.com')

    # Absolute path of the detail table; every row lookup below hangs off it.
    _DETAIL_TABLE = ('/html/body/table[4]/tr/td/table[1]/tr/td/table[2]'
                     '/tr/td[1]/table[1]/tr[2]/td[2]/table')
    _DATE_FORMAT = '%Y-%m-%d'
    _NUMBER_RE = re.compile('[0-9]+')
    # These two are matched against the raw response body, so the patterns
    # are encoded to GBK first.
    _APPLIED_RE = re.compile(u'已申请报名 ([0-9]+) 人'.encode('gbk'))
    _UNCONFIRMED_RE = re.compile(u'未审核\\(([0-9]+)\\)人'.encode('gbk'))

    def parse(self, response):
        """Extract one activity from a detail page.

        The URL is first recorded in ``self.linkDB`` as 'true' for a 200
        response and 'false' otherwise; extraction then proceeds either way.

        Args:
            response: the downloaded activity page.

        Returns:
            A populated ``LvyeItem``.

        Raises:
            IndexError: if a mandatory node is missing from the page.
            ValueError: if a date does not match ``%Y-%m-%d``.
        """
        if response.status == 200:
            self.linkDB.update_info_url(response.url, 'true')
        else:
            self.linkDB.update_info_url(response.url, 'false')

        x = HtmlXPathSelector(response)

        def first(xpath):
            # First match of an XPath evaluated against the whole page.
            return x.select(xpath).extract()[0]

        def detail(relative):
            # First match of a path relative to the detail table.
            return first(self._DETAIL_TABLE + relative)

        lvyeItem = LvyeItem()
        lvyeItem['activity_link'] = response.url
        lvyeItem['imgurl'] = first("//div[@id='actpic']/img/@src")
        lvyeItem['subject'] = first("//span[@id='rtitle']/text()")
        lvyeItem['activity_status'] = first(
            "//span[@class='yellowfont']/text()").strip(u'\xa0')
        lvyeItem['activitytype'] = detail(
            '/tr[1]/td[2]/a/text()').strip(u'\xa0')

        # "start~end" or a single date. Both parts are stripped of
        # non-breaking spaces; previously only the start was, so a padded
        # end date raised ValueError.
        period = [part.strip(u'\xa0')
                  for part in detail('/tr[2]/td[2]/text()').split('~')]
        start_time = time.mktime(time.strptime(period[0], self._DATE_FORMAT))
        end_time = start_time
        if len(period) > 1:
            end_time = time.mktime(
                time.strptime(period[1], self._DATE_FORMAT))
        lvyeItem['starttimefrom'] = int(start_time)
        lvyeItem['starttimeto'] = int(end_time)

        lvyeItem['destplace'] = detail('/tr[3]/td[2]/a[last()]/text()')
        lvyeItem['depart_place'] = detail('/tr[3]/td[4]/a[last()]/text()')
        lvyeItem['views_number'] = detail(
            '/tr[5]/td[2]/text()').strip(u'\xa0')

        # The price is optional: a missing node or a text without digits
        # leaves the field unset.
        try:
            expense = detail('/tr[4]/td[2]/span[1]/text()')
            lvyeItem['price'] = int(self._NUMBER_RE.findall(expense)[0])
        except (IndexError, ValueError):
            log.msg('the activity the price is null:' + response.url,
                    log.INFO)

        applied = self._APPLIED_RE.search(response.body)
        lvyeItem['applynumber'] = int(applied.group(1)) if applied else 0

        # Stored as text (default '0'), unlike 'applynumber'.
        unconfirmed = self._UNCONFIRMED_RE.search(response.body)
        lvyeItem['remainnumber'] = (
            unconfirmed.group(1) if unconfirmed else '0')

        lvyeItem['source_site'] = u'游侠客'
        return lvyeItem