def post_spider(self):
    """Post-crawl housekeeping: purge dated rows, deduplicate, close MySQL."""
    logger.info('爬虫结束后的操作')
    operator = DataOperator()
    operator.delete_data_dated()
    operator.data_unique()
    operator.deduplication()
    operator.mysql_close()
def get_items(sid, current_date): items = [] url = 'http://cgi.yanchu.qq.com/cgi-bin/yanchu/mb_api/jsondata.fcg?g_tk=4d3754f563ad04a56fece81bbcc83302&cbk=callback&sCmd=citytype&IDS=0%2C26&page=0&_=1446602940456' data = get_data(url, str(1)) if data and data['data']['page_data']: pages = int(data['data']['page_tol']) print '共%s页'%pages for page in range(pages): print page url = 'http://cgi.yanchu.qq.com/cgi-bin/yanchu/mb_api/jsondata.fcg?g_tk=4d3754f563ad04a56fece81bbcc83302&cbk=callback&sCmd=citytype&IDS=0%2C26&page='+str(page)+'&_=1446602940456' data = get_data(url, str(page+1)) if not data or not data['data']['page_data']: print('未找到第%s页的数据'%page) continue print data for i in data['data']['page_data']: item = {'sid':sid, 'begin_date':' ', 'end_date':' ', 'id':' ' , 'title':' ', 'industry':' ', 'city':' ', 'venue':' ', 'organizer':' ', 'site':' ', 'visitor':' ', 'area':' ', 'history_info_tag':'0'} item['city'] = i['city'] item['id'] = i['show_id'] item['title'] = i['show_name'] if len(i['show_time']) > 19: begin_time = i['show_time'].split(',')[0].split(' ')[0] end_time = i['show_time'].split(',')[-1].split(' ')[0] else: begin_time = i['show_time'][:10] end_time = begin_time item['begin_date'] = begin_time item['end_date'] = end_time item['venue'] = i['hall_name'] items.append(item) opera = DataOperator() opera.item_insert(data=items) return items
def get_items(self, page=1): data = self.get_html(page=page) numbers = re.findall('<div class="newFind">.*?(\d+).*?</div>', data)[0] # pages = math.ceil(float(numbers)/20) pages = 150 # 爬前150页的数据,后边的基本已过期 for page in range(1, int(pages) + 1): print '共有%s页%s条数据,目前正在抓取第%s页的数据' % (pages, numbers, page) data = self.get_html(page=page) soup = BeautifulSoup(data) events = soup.findAll('div', {'class': 'mlm1r'}) items = [] for event in events: item = { 'sid': self.sid, 'begin_date': ' ', 'end_date': ' ', 'id': ' ', 'title': ' ', 'industry': ' ', 'city': ' ', 'venue': ' ', 'organizer': ' ', 'site': ' ', 'visitor': ' ', 'area': ' ', 'history_info_tag': '0' } href = event.a['href'] id = href.split('/')[-1].strip() title = event.select('li')[0].text.strip() venue = event.select('li')[1].select('span')[1].text.strip() date = event.select('li')[1].select('span')[0].text.strip() begin_date = date.split('-')[0].replace('.', '-') end_date = date.split('-')[1].replace('.', '-') if ''.join(begin_date.split('-')[:2]) > self.current_date: '''try: city = ''.join(jieba.analyse.extract_tags(venue, allowPOS=['ns'])) except Exception as e: print 'error:', e''' eachData = self.get_html(id=id) try: city = re.findall(u'地址.*?<span>(.*?)</span>', eachData)[0].split()[-1].rstrip(u'市') except IndexError as e: print 'error:', e city = '' item['city'] = city item['begin_date'] = begin_date item['end_date'] = end_date item['id'] = id item['title'] = title item['venue'] = venue items.append(item) print id, title, city, begin_date, end_date, venue else: print 'ID为%s的数据过期' % id print '正在写入第%s页的数据' % (page) opera = DataOperator() opera.item_insert(data=items)
def get_data(self, page, city_name):
    """Scrape one cnena search-result page for *city_name* and store the
    meetings that start at or after the current year-month."""
    items = []
    print('正在抓取%s第%s页的数据' % (city_name, page))
    url = 'http://www.cnena.com/showroom/search.php?mid=1&fid=0&keyword=%s&action=search&type=title&page=%s' % (city_name, str(page))
    url = url.decode('utf-8').encode('GBK')  # the site expects a GBK query
    print(url)
    html = self.get_html(url)
    if html:
        row_pattern = re.compile(
            '<tr>.*?<td.*?>(\d+)</td>.*?<a.*?href="(.*?)".*?>(.*?)</a>.*?<a.*?>(.*?)</a>.*?</tr>',
            re.S)
        meetings = re.findall(row_pattern, html)
        if meetings:
            for meeting in meetings:
                item = {
                    'sid': self.sid, 'begin_date': ' ', 'end_date': ' ',
                    'id': ' ', 'title': ' ', 'industry': ' ',
                    'city': city_name, 'venue': ' ', 'organizer': ' ',
                    'site': ' ', 'visitor': ' ', 'area': ' ',
                    'history_info_tag': '0',
                }
                # The id is composed of the two numbers in the detail href.
                fid_parts = re.findall('\d+', meeting[1])
                item['id'] = str(fid_parts[0]) + '-' + str(fid_parts[1])
                item['title'] = meeting[2]
                item['industry'] = meeting[3]
                detail_url = 'http://www.cnena.com/showroom/' + meeting[1]
                print(detail_url)
                detail_html = self.get_html(detail_url)
                if detail_html:
                    detail_pattern = re.compile(
                        u'展会概况.*?开幕日期:(.*?)<br>.*?结束日期:(.*?)<br>.*?展会地点.*?<a.*?>(.*?)</a>',
                        re.S)
                    found = re.search(detail_pattern, detail_html)
                    if found:
                        print(found.group(1))
                        begin_date = re.sub('\D+', '-', found.group(1)).strip('-')
                        end_date = re.sub('\D+', '-', found.group(2)).strip('-')
                        year = int(begin_date.split('-')[0])
                        month = int(begin_date.split('-')[1])
                        # Keep only future meetings (>= current year-month).
                        if year >= int(self.current_date[:4]) + 1 or year == int(self.current_date[:4]) and month >= int(self.current_date[-2:]):
                            item['begin_date'] = begin_date
                            item['end_date'] = end_date
                            item['venue'] = found.group(3)
                            print(item)
                            items.append(item)
            print('准备写入第%s页的数据' % page)
            opera = DataOperator()
            opera.item_insert(data=items)
        else:
            print('%s页的数据为空' % page)
    return
def get_items(self, city, page='1'): url = 'http://www.zhankoo.com/Search/SearchExhibitionList?city=%s&classifyId=0&ratingOverAll=0&rankType=5&isExhibitionEnd=0&_=1452759283208&pagenumber=%s'%(city, page) print '正在抓取%s第%s页的数据'%(city, page) data = self.get_data(url) soup = BeautifulSoup(data) meetings = soup.findAll('h3', {'class':'deal-tile__title'}) items = [] if meetings: for meeting in meetings: item = {'sid':self.sid, 'begin_date':' ', 'end_date':' ', 'id':' ' , 'title':' ', 'industry':' ', 'city':' ', 'venue':' ', 'organizer':' ', 'site':' ', 'visitor':' ', 'area':' ', 'history_info_tag':'0'} title = meeting.select('span[class="xtitle"]')[0].a['title'] href = meeting.select('span[class="xtitle"]')[0].a['href'] itemid = href.split('_')[-1].split('.')[0] date_venue_tmp = meeting.select('span[class="short-title"]')[0].text date_venue = ''.join(date_venue_tmp.split()[:-1]) venue = date_venue.split(':')[-1] date_tmp = date_venue.split(':')[1] date = re.split(u"[\u4e00-\u9fa5]+",date_tmp) begin_date = date[0] end_date = date[1] print city, title, itemid, begin_date, end_date, venue item['id'] = itemid item['title'] = title item['city'] = city item['venue'] = venue item['begin_date'] = begin_date item['end_date'] = end_date items.append(item) opera = DataOperator() opera.item_insert(data=items) pattern_next_page = re.compile(u'<a\s+class="next-page".*?href=".*?pagenumber=(\d+)">下一页</a>') try: next_page = re.findall(pattern_next_page, data)[0] except IndexError: print '%s的数据全部抓取完毕'%city return else: print '找到%s的下一页,准备抓取下一页的数据'%city self.get_items(city, next_page) else: print '%s没有数据'%city
def get_items(self, data, month): print('正在解析%s月份的数据' % month) soup = BeautifulSoup(data) meetings = soup.findAll('table', {'id': 'tbl_%s' % month})[0].findAll( 'tr', {'class': 'blue_bg'}) items = [] if meetings: for meeting in meetings: item = { 'sid': self.sid, 'begin_date': ' ', 'end_date': ' ', 'id': 'm190003', 'title': ' ', 'industry': '财经', 'city': ' ', 'venue': ' ', 'organizer': ' ', 'site': ' ', 'visitor': ' ', 'area': ' ', 'history_info_tag': '0' } url = meeting.select('td')[0].a['href'] if url: item['site'] = url title = meeting.select('td')[0].a.string if title: item['title'] = title organizer = meeting.select('td')[1].string if organizer: item['organizer'] = organizer date = meeting.select('td')[2].string if len(date) > 12: begin_date_tmp = date.split('-')[0] begin_date = re.sub('[^\d]', '-', begin_date_tmp).rstrip('-') end_date_tmp = date.split('-')[1] end_date_tmp2 = re.sub('[^\d]', '-', end_date_tmp).rstrip('-') if len(end_date_tmp ) <= 3: #如果len(end_date_temp)=3,说明结束日期只有日期,没有年份和月份 end_date = begin_date.replace( begin_date.split('-')[-1], end_date_tmp2) #把开始日期的日期数值换成结束日期的值 elif len(end_date_tmp ) <= 6: #如果len(end_date_temp)=6,说明结束日期有月份和日期,没有年份 end_date = begin_date.split( '-')[0] + '-' + end_date_tmp2 #把开始日期的年份数值与结束日期连接起来 else: end_date = end_date_tmp2 else: begin_date = end_date = re.sub('[^\d]', '-', date).rstrip('-') item['begin_date'] = begin_date item['end_date'] = end_date city = meeting.select('td')[3].string if city: item['city'] = city item['venue'] = city print(item) items.append(item) opera = DataOperator() opera.item_insert(data=items) else: print '%s年%s月没有数据!' % (self.current_date[:4], month) return items
def get_data(self, data): pattern_items = re.compile( '<div.*?class="sslist">.*?<a.*?href="(.*?)".*?>(.*?)</a>.*?<p>(.*?)</p>.*?<p.*?class="cg">(.*?)</p>', re.S) meetings = re.findall(pattern_items, data) items = [] items_history = [] for meeting in meetings: item = { 'sid': self.sid, 'begin_date': ' ', 'end_date': ' ', 'id': ' ', 'title': ' ', 'industry': ' ', 'city': ' ', 'venue': ' ', 'organizer': ' ', 'site': ' ', 'visitor': ' ', 'area': ' ', 'history_info_tag': '0' } print meeting[0] id = meeting[0].split('/')[-1].split('_')[0] print id item['id'] = id title = meeting[1].strip() item['title'] = title city = meeting[2].strip() item['city'] = city try: venue = meeting[3].split('>')[1].split('<')[0] except IndexError: venue = meeting[3].strip() item['venue'] = venue h = 'http://www.eshow365.com' + meeting[0] print h html2 = self.get_html(h) pattern_time = re.compile('举办时间:(.*?)---(.*?)</p>', re.S) pattern_organizer = re.compile('主办单位:(.*?)</p>') pattern_industry = re.compile('所属行业:(.*?)</a>') pattern_area = re.compile('展会面积:(\d+).*?</p>') time_tmp = re.findall(pattern_time, html2) begin_time_tmp = time_tmp[0][0].replace('/', '-') begin_date = datetime.datetime.strptime( begin_time_tmp, '%Y-%m-%d').strftime('%Y-%m-%d') end_time_tmp = time_tmp[0][1].replace('/', '-') end_date = datetime.datetime.strptime( end_time_tmp, '%Y-%m-%d').strftime('%Y-%m-%d') item['begin_date'] = begin_date item['end_date'] = end_date try: org = re.findall(pattern_organizer, html2)[0].split(' ')[0] except IndexError: org = ' ' item['organizer'] = org industry_tmp = re.findall(pattern_industry, html2) if industry_tmp: try: indus = industry_tmp[0].split('>')[1].split('<')[0] except IndexError: indus = industry_tmp[0].strip() else: indus = ' ' print indus item['industry'] = indus try: area = re.findall(pattern_area, html2)[0] except IndexError: area = ' ' item['area'] = area soup = BeautifulSoup(html2) try: history_exhibitions = soup.findAll( 'div', {'class': 'ljzh'})[0].select('tr')[1:] 
except IndexError: print '没有找到历届展会信息' history_info_tag = '0' else: print '找到历届展会信息' history_info_tag = '1' for history_exhibition in history_exhibitions: item_history = {} history_exhibition_info = history_exhibition.select('td') history_exhibition_title = history_exhibition_info[0].a[ 'title'].strip() print history_exhibition_title history_exhibition_url = history_exhibition_info[0].a[ 'href'] history_exhibition_id = history_exhibition_url.split( '/')[-1].split('_')[0] print history_exhibition_id try: history_exhibition_venue = history_exhibition_info[ 1].stripped_strings.next() except StopIteration: history_exhibition_venue = ' ' print history_exhibition_venue history_exhibition_date = history_exhibition_info[ 2].string.strip().replace('/', '-') print history_exhibition_date history_exhibition_area_tmp = history_exhibition_info[ 3].span.string.strip() history_exhibition_area = filter( lambda x: x.isdigit(), history_exhibition_area_tmp) print history_exhibition_area item_history['sid'] = self.sid item_history['itemid'] = id item_history['history_itemid'] = history_exhibition_id item_history['title'] = history_exhibition_title item_history['venue'] = history_exhibition_venue date_tmp = history_exhibition_date date = datetime.datetime.strptime( date_tmp, '%Y-%m-%d').strftime('%Y-%m-%d') item_history['date'] = date item_history['area'] = history_exhibition_area items_history.append(item_history) item['history_info_tag'] = history_info_tag items.append(item) opera = DataOperator() opera.item_insert(data=items, data_history=items_history) return
def get_items(self, start_date, page='1'): url = 'http://www.foodmate.net/exhibit/search.php?kw=&fields=0&fromdate=%s&todate=&catid=0&process=0&order=0&x=59&y=12&page=%s' % ( start_date, page) data = self.get_data(url) soup = BeautifulSoup(data) meetings = soup.findAll('div', {'class': 'list'}) print meetings items = [] if meetings: for meeting in meetings: item = { 'sid': self.sid, 'begin_date': ' ', 'end_date': ' ', 'id': ' ', 'title': ' ', 'industry': ' ', 'city': ' ', 'venue': ' ', 'organizer': ' ', 'site': ' ', 'visitor': ' ', 'area': ' ', 'history_info_tag': '0' } meeting = meeting.ul href = meeting.select('li')[0].a['href'] itemid = href.split('-')[-1].split('.')[0] title = meeting.select('li')[0].a.string venue_tmp = meeting.select('li')[1].string venue = venue_tmp.split(':')[-1] organizer_tmp = meeting.select('li')[2].string organizer = organizer_tmp.split(':')[-1] date_tmp = meeting.select('li')[3].string begin_date = date_tmp.split('~')[0].strip() end_date = date_tmp.split('~')[1].strip() each_meeting_url = href each_meeting_data = self.get_data(each_meeting_url) pattern_city = re.compile(u'展出城市.*?<a.*?>(.*?)</a>', re.S) pattern_site = re.compile( u'<div.*?id="content">.*?网址.*?<a.*?>(.*?)</a>.*?<br>', re.S) try: city = re.findall(pattern_city, each_meeting_data)[0] except IndexError: city = ' ' try: site = re.findall(pattern_site, each_meeting_data)[0] except IndexError: site = ' ' print itemid, title, city, begin_date, end_date, organizer, venue, site item['id'] = itemid item['title'] = title item['city'] = city item['venue'] = venue item['begin_date'] = begin_date item['end_date'] = end_date item['organizer'] = organizer item['site'] = site items.append(item) opera = DataOperator() opera.item_insert(data=items) pattern_next_page = re.compile( u'<a.*?href=".*?page=(\d+)"\s+title="下一页">') try: next_page = re.findall(pattern_next_page, data)[0] except IndexError: print '全部抓取完毕' return else: print '找到下一页,准备抓取下一页的数据' self.get_items(start_date, next_page)
def get_items(self, year, month): url = 'http://www.chemsoc.org.cn/Meeting/Home/search.asp?mingcheng=&province=&y=%s&m=%s' % ( year, month) print url try: data = self.get_data(url) pattern_pages = re.compile(u'第\d+页.*?共(\d+)页') pages = re.findall(pattern_pages, data)[0] except Exception as e: print('未找打%s年%s月的数据,error:%s' % (year, month, e)) return if not int(pages): print '%s年%s月没有数据' % (year, month) else: print '%s年%s月共有%s页的数据' % (year, month, pages) for page in range(1, int(pages) + 1): items = [] url = 'http://www.chemsoc.org.cn/Meeting/Home/search.asp?page=%s&mingcheng=&province=&y=%s&m=%s' % ( page, year, month) print url try: data = self.get_data(url) soup = BeautifulSoup(data) meetings = soup.findAll( 'table', {'class': 'meetings'})[0].findAll('tr')[1:] except Exception as e: print('未找打%s年%s月%s页的数据,error:%s' % (year, month, page, e)) continue for meeting in meetings: item = { 'sid': self.sid, 'begin_date': ' ', 'end_date': ' ', 'id': ' ', 'title': ' ', 'industry': ' ', 'city': ' ', 'venue': ' ', 'organizer': ' ', 'site': ' ', 'visitor': ' ', 'area': ' ', 'history_info_tag': '0' } href = meeting.select('td')[0].a['href'] itemid = href.split('=')[-1] title = meeting.select('td')[0].a['title'] city = meeting.select('td')[1].input['value'] meeting_time = meeting.select('td')[2].input['value'] begin_time_tmp = re.split(u'-|至', meeting_time)[0] meeting_begin_time = re.sub(u'[年月日]', '-', begin_time_tmp).rstrip('-') try: end_time_tmp1 = re.split(u'-|至', meeting_time)[1] except IndexError: meeting_end_time = meeting_begin_time else: end_time_tmp2 = re.sub(u'[年月日]', '-', end_time_tmp1).rstrip('-') if len(end_time_tmp2) <= 2: if int(end_time_tmp2) < int( meeting_begin_time.split('-')[-1]): meeting_end_time = meeting_begin_time.split( '-')[0] + '-' + str( int(meeting_begin_time.split('-')[1]) + 1) + end_time_tmp2 else: meeting_end_time = meeting_begin_time.replace( meeting_begin_time.split('-')[-1], end_time_tmp2) elif len(end_time_tmp2) <= 4: meeting_end_time = 
meeting_begin_time.split( '-')[0] + '-' + end_time_tmp2 print itemid, city, title, meeting_begin_time, meeting_end_time item['id'] = itemid item['title'] = title item['city'] = city try: begin_date = datetime.datetime.strptime( meeting_begin_time, '%Y-%m-%d').strftime('%Y-%m-%d') except ValueError: begin_date = meeting_begin_time item['begin_date'] = begin_date try: end_date = datetime.datetime.strptime( meeting_end_time, '%Y-%m-%d').strftime('%Y-%m-%d') except ValueError: end_date = meeting_end_time item['end_date'] = end_date each_meeting_url = 'http://www.chemsoc.org.cn/Meeting/Home/' + href print each_meeting_url each_meeting_data = self.get_data(each_meeting_url) if each_meeting_data: pattern_organizer = re.compile(u'<p>主办单位:(.*?)</p>') pattern_visitor = re.compile(u'<p>预计人数:(.*?)</p>') pattern_venue = re.compile(u'<p>地.*?址:(.*?)</p>') try: organizer = re.findall(pattern_organizer, each_meeting_data)[0] except IndexError: organizer = ' ' try: visitor = re.findall(pattern_visitor, each_meeting_data)[0] except IndexError: visitor = ' ' try: venue = re.findall(pattern_venue, each_meeting_data)[0] except IndexError: venue = city item['organizer'] = organizer item['visitor'] = visitor item['venue'] = venue items.append(item) opera = DataOperator() opera.item_insert(data=items) return
def get_items(self, page): print '正在抓取第%s页的数据' % page data = self.get_data(page).split('</html>')[1] if data: soup = BeautifulSoup(data) items = [] meetings = soup.select('table[class="block1"] tr td tr')[1:] for meeting in meetings: item = { 'sid': self.sid, 'begin_date': ' ', 'end_date': ' ', 'id': ' ', 'title': ' ', 'industry': ' ', 'city': ' ', 'venue': ' ', 'organizer': ' ', 'site': ' ', 'visitor': ' ', 'area': ' ', 'history_info_tag': '0' } date_tmp1 = meeting.select('td')[3].string date_tmp2 = re.sub(u'[年月日]', '-', date_tmp1).rstrip('-') print date_tmp2 try: date = datetime.datetime.strptime( date_tmp2, '%Y-%m-%d').strftime('%Y-%m-%d') except ValueError: date = date_tmp2 year = int(date.split('-')[0]) month = int(date.split('-')[1]) if year >= int(self.current_date[:4]) + 1 or year == int( self.current_date[:4]) and month >= int( self.current_date[-2:]): id = meeting.select('td')[0].string item['id'] = id title = meeting.select('td')[1].a['title'] item['title'] = title url = meeting.select('td')[1].a['href'] item['site'] = url city = meeting.select('td')[2].string item['city'] = city data2 = urllib2.urlopen(url).read().decode('gbk') pattern_date_loc = re.compile( u'召开时间.*?</span>(.*?)<br>.*?结束时间.*?</span>(.*?)<br>.*?地点.*?</span>(.*?)<br>', re.S) date_loc = re.search(pattern_date_loc, data2) if date_loc: begin_date_tmp = date_loc.group(1).replace('.', '-') try: begin_date = datetime.datetime.strptime( begin_date_tmp, '%Y-%m-%d').strftime('%Y-%m-%d') except ValueError: begin_date = begin_date_tmp item['begin_date'] = begin_date end_date_tmp = date_loc.group(2).replace('.', '-') try: end_date = datetime.datetime.strptime( end_date_tmp, '%Y-%m-%d').strftime('%Y-%m-%d') except ValueError: end_date = end_date_tmp item['end_date'] = end_date loc = ''.join(date_loc.group(3).split()) #地点中有空格 item['venue'] = loc else: item['begin_date'] = item['end_date'] = date item['venue'] = city items.append(item) opera = DataOperator() opera.item_insert(data=items) else: 
print('未找到第%s页的数据' % page)
current_date_format = self.current_date[:4]+'-'+self.current_date[4:] if begin_date >= current_date_format: end_date_temp = date[1] end_date = datetime.datetime.strptime(end_date_temp,'%Y-%m-%d').strftime('%Y-%m-%d') venue = eachSoup.select('span[class="dico2"]')[1].text item['city'] = city item['begin_date'] = begin_date item['end_date'] = end_date item['id'] = id item['title'] = title item['venue'] = venue items.append(item) print id,title,city,begin_date,end_date,venue else: print 'id为%s的数据已过期'%id opera = DataOperator() opera.item_insert(data=items) pattern_nextPage = re.compile(u'<a href="/zhanhui/class_\d+_(\d+).html">下一页') try: next_page = re.findall(pattern_nextPage, data)[0] except IndexError,e: print '查找完毕' else: print '找到第%s页'%next_page self.get_items(cate=cate, page=next_page) if __name__ == '__main__': current_date = '201608' sid = '36' expowindow = ExpoWindow(sid, current_date)
def getItems(self, page): page = page * 20 data = self.getData(page) if data: print u'成功获取第%s页的数据' % page soup = BeautifulSoup(data, 'lxml') meetings = soup.findAll('ul', {'class': 'mod-meet-lt'})[0].findAll( 'li', recursive=False) items = [] for meeting in meetings: item = { 'sid': self.sid, 'begin_date': ' ', 'end_date': ' ', 'id': ' ', 'title': ' ', 'industry': ' ', 'city': ' ', 'venue': ' ', 'organizer': ' ', 'site': ' ', 'visitor': ' ', 'area': ' ', 'history_info_tag': '0' } meeting_url = meeting.a['href'] meeting_id = meeting_url.split('/')[-1].split('.')[0] meeting_title = meeting.find('div', { 'class': 'mt-title' }).string.strip() meeting_city = meeting.find('span', { 'class': 'info-city' }).string.strip() meeting_time = meeting.find('span', { 'class': 'info-time' }).string.split()[0] print meeting_title, meeting_url, meeting_id, meeting_city each_meeting_data = self.getData(id=meeting_id) if each_meeting_data: each_meeting_soup = BeautifulSoup(each_meeting_data, 'lxml') try: meeting_date = each_meeting_soup.find( 'li', { 'title': u'活动时间' }).text.strip() except AttributeError: meeting_begin_date = meeting_end_date = meeting_time #continue else: print meeting_date meeting_begin_date = meeting_date.split('~')[0].split( ' ')[0] meeting_end_date = meeting_date.split( '~')[-1].strip().split(' ')[0] year = int(meeting_begin_date.split('-')[0]) month = int(meeting_begin_date.split('-')[1]) if year >= int( self.current_date[:4]) + 1 or year == int( self.current_date[:4]) and month >= int( self.current_date[-2:]): try: meeting_venue = each_meeting_soup.find( 'li', { 'title': u'活动地点' }).text.strip().split()[0] except AttributeError: meeting_venue = '' try: meeting_visitors = each_meeting_soup.find( 'li', { 'title': u'活动人数' }).text.strip().rstrip(u'人') except AttributeError: meeting_visitors = '' try: meeting_organizer = each_meeting_soup.find( 'li', { 'title': u'主办单位' }).text.strip() except AttributeError: meeting_organizer = '' print meeting_id, meeting_title, 
meeting_city, meeting_begin_date, meeting_end_date, meeting_venue, meeting_visitors, meeting_organizer item['id'] = meeting_id item['title'] = meeting_title item['city'] = meeting_city item['begin_date'] = meeting_begin_date item['end_date'] = meeting_end_date item['venue'] = meeting_venue item['organizer'] = meeting_organizer item['visitor'] = meeting_visitors items.append(item) else: print u'id为%s的数据过期' % meeting_id else: print '未找到id为%s的展会的其他数据' % meeting_id opera = DataOperator() opera.item_insert(data=items) else: print '未找到第%s页的数据' % page
def get_items(self, city_name, city_id, begin_date, end_date): print u'准备爬取%s的数据' % (city_name) try: html = self.get_html(city_id, '1', begin_date, end_date) pages = re.findall(u'共(\d+)页', html)[0] except Exception as e: print('未找到%s的数据,error:%s' % (city_name, e)) return print u'%s共%s页' % (city_name, pages) for page in range(1, int(pages) + 1): items = [] print u'正在爬取%s第%s页' % (city_name, page) try: html = self.get_html(city_id, str(page), begin_date, end_date) pattern = re.compile( u'div.*?class=\"info.*?<strong>.*?<a.*?href="(.*?)".*?>(.*?)</a>.*?<em.*?class="cgree1">.*?展会时间:(.*?)展馆:(.*?)</a>', re.S) meetings = re.findall(pattern, html)[0:] except Exception as e: print('未找到%s第%s页的数据,error:%s' % (city_name, page, e)) continue for meeting in meetings: item = { 'sid': self.sid, 'begin_date': ' ', 'end_date': ' ', 'id': ' ', 'title': ' ', 'industry': ' ', 'city': city_name, 'venue': ' ', 'organizer': ' ', 'site': ' ', 'visitor': ' ', 'area': ' ', 'history_info_tag': '0' } id = meeting[0].split('_')[-1].split('.')[0] item['id'] = id title = meeting[1] item['title'] = title time_tmp = meeting[2] begin_time_tmp1 = time_tmp.split('---')[0] begin_time_tmp2 = re.sub(u'[年月日]', '-', begin_time_tmp1).rstrip('-') try: begin_time = datetime.datetime.strptime( begin_time_tmp2, '%Y-%m-%d').strftime('%Y-%m-%d') except TypeError: begin_time = begin_time_tmp2 item['begin_date'] = begin_time end_time_tmp1 = time_tmp.split('---')[1].split('日')[0] end_time_tmp2 = re.sub(u'[年月日]', '-', end_time_tmp1) if len(end_time_tmp2 ) < 3: #如果len(end_date_temp)=3,说明结束日期只有日期,没有年份和月份 end_time_tmp3 = begin_time.replace( begin_time.split('-')[-1], end_time_tmp2) #把开始日期的日期数值换成结束日期的值 elif len(end_time_tmp2 ) < 6: #如果len(end_date_temp)=6,说明结束日期有月份和日期,没有年份 end_time_tmp3 = begin_time.split( '-')[0] + '-' + end_time_tmp2 #把开始日期的年份数值与结束日期连接起来 else: end_time_tmp3 = end_time_tmp2 try: end_time = datetime.datetime.strptime( end_time_tmp3, '%Y-%m-%d').strftime('%Y-%m-%d') except TypeError: end_time = 
end_time_tmp3 item['end_date'] = end_time try: venue = meeting[3].split('>')[-1] except IndexError: venue = meeting[3] item['venue'] = venue time.sleep(2) try: each_meeting_html = urllib2.urlopen( 'http://www.onezh.com' + meeting[0]).read().decode('utf-8') pattern_area = re.compile( u'<div.*?class="title-detail">.*?<b>面积</b>.*?(\d+).*?</div>', re.S) area = re.findall(pattern_area, each_meeting_html)[0] except Exception: print('未找到面积数据') area = ' ' item['area'] = area pattern_industry = re.compile( u'<div.*?class="title-detail">.*?所属行业(.*?)</div>', re.S) try: industry = re.findall(pattern_industry, each_meeting_html)[0].split('>')[-1] except IndexError: industry = ' ' item['industry'] = industry pattern_organizer = re.compile( u'<div.*?class="title-detail">.*?主办单位(.*?)</div>', re.S) try: organizer = re.findall(pattern_organizer, each_meeting_html)[0].split('>')[-1] except IndexError: organizer = ' ' item['organizer'] = organizer pattern_site = re.compile(u'<li>.*?<b>网址(.*?)</li>', re.S) try: site = re.findall(pattern_site, each_meeting_html)[0].split('>')[-1] except IndexError: site = ' ' item['site'] = site print id, title, begin_time, end_time, venue, industry, organizer items.append(item) opera = DataOperator() opera.item_insert(data=items) return
def get_items(self, data): soup = BeautifulSoup(data) meetings_tmp = soup.findAll('ul', {'class': 'trade-news haiwai'})[:2] for tmp in meetings_tmp: items = [] items_history = [] meetings = tmp.findAll('li') for meeting in meetings: item = { 'sid': self.sid, 'begin_date': ' ', 'end_date': ' ', 'id': ' ', 'title': ' ', 'industry': ' ', 'city': ' ', 'venue': ' ', 'organizer': ' ', 'site': ' ', 'visitor': ' ', 'area': ' ', 'history_info_tag': ' ' } base_info = meeting.text.split() url = meeting.a['href'] item['url'] = url id = url.split('_')[-1].split('.')[0] item['id'] = id begin_date = base_info[0] item['begin_date'] = begin_date item['end_date'] = begin_date industry = base_info[1].strip('】').strip('【') item['industry'] = industry city = base_info[2].strip('】').strip('【') item['city'] = city title = meeting.a['title'] item['title'] = title print url, begin_date, industry, city, title each_meeting_url = url each_meeting_data = self.get_data(each_meeting_url) if each_meeting_data: pattern_venue = re.compile( u'<ul>.*?展会场馆.*?<a.*?>(.*?)</a>', re.S) pattern_organizer = re.compile( u'<ul>.*?组织单位.*?<a.*?>(.*?)</a>', re.S) pattern_site = re.compile( u'<ul>.*?官方网站.*?</strong>(.*?)</li>', re.S) pattern_area = re.compile(u'<ul>.*?约.*?(\d+).*?平米.*?</li>', re.S) try: venue = re.findall(pattern_venue, each_meeting_data)[0] except IndexError: venue = ' ' item['venue'] = venue try: organizer = re.findall(pattern_organizer, each_meeting_data)[0] except IndexError: organizer = ' ' item['organizer'] = organizer try: site = re.findall(pattern_site, each_meeting_data)[0] except IndexError: site = ' ' item['site'] = site try: area = re.findall(pattern_area, each_meeting_data)[0] except IndexError: area = ' ' item['area'] = area print venue, organizer, site, area soup = BeautifulSoup(each_meeting_data) try: history_exhibitions = soup.findAll( 'table', {'class': 'tbsty exhtbl'})[0].select('tr')[1:] except IndexError: print '没有找到历届展会信息' history_info_tag = '0' else: print '找到历届展会信息' 
history_info_tag = '1' for history_exhibition in history_exhibitions: item_history = {} history_exhibition_info = history_exhibition.select( 'td')[1:4] history_exhibition_title = history_exhibition_info[ 0].a['title'].strip() history_exhibition_url = history_exhibition_info[ 0].a['href'] history_exhibition_id = history_exhibition_url.split( '_')[-1].split('.')[0] history_exhibition_date = history_exhibition_info[ 0].a.string print history_exhibition_title, history_exhibition_date, history_exhibition_url history_exhibition_venue = history_exhibition_info[ 1].a['title'].strip() print history_exhibition_venue history_exhibition_area_tmp = history_exhibition_info[ 2].string.strip() history_exhibition_area = filter( lambda x: x.isdigit(), history_exhibition_area_tmp) print history_exhibition_area item_history['sid'] = self.sid item_history['itemid'] = id item_history[ 'history_itemid'] = history_exhibition_id item_history['title'] = history_exhibition_title item_history['venue'] = history_exhibition_venue item_history['date'] = history_exhibition_date item_history['area'] = history_exhibition_area items_history.append(item_history) item['history_info_tag'] = history_info_tag items.append(item) opera = DataOperator() opera.item_insert(data=items, data_history=items_history)
def get_items(self, city_name, city_code, page='1' ): url = 'http://www.expo-china.com/web/exhi/exhi_search.aspx?City=%s&Industry=-1&Start=%sT%s&page=%s'%(city_code, self.start_date, self.end_date, page) print '正在抓取%s第%s页的数据'%(city_name, page), url data = self.get_data(url) if data: soup = BeautifulSoup(data) items = [] try: meetings=soup.findAll('div',{'class':'Resueltlist'})[0].findAll('li') except Exception as e: print('未找到展会数据,error:', e) return for meeting in meetings: item = {'sid':self.sid, 'begin_date':' ', 'end_date':' ', 'id':' ' , 'title':' ', 'industry':' ', 'city':city_name, 'venue':' ', 'organizer':' ', 'site':' ', 'visitor':' ', 'area':' ', 'history_info_tag':'0'} title = meeting.select('div')[0].h3.a.string.strip() href = meeting.select('div')[0].h3.a['href'] id = href.split('-')[-1].split('.')[0] begin_date = meeting.select('div')[1].span.string.strip() print title, href, id, begin_date item['title'] = title item['id'] = id item['href'] = href item['begin_date'] = begin_date each_meeting_url = href each_meeting_data = self.get_data(each_meeting_url) if each_meeting_data: each_meeting_soup = BeautifulSoup(each_meeting_data) try: each_meeting_info = each_meeting_soup.findAll('div', {'div','zhanhuijieshao_c'})[0] except IndexError: print '未找到%s的具体信息'%title item['end_date'] = begin_date else: print '找到%s的具体信息'%title try: end_date = each_meeting_info.select('ul')[0].select('li')[0].text.split(u'至')[-1] except IndexError: end_date = begin_date item['end_date'] = end_date try: venue = each_meeting_info.select('ul')[0].select('li')[1].text.split(u':')[-1] except IndexError: venue = ' ' item['venue'] = venue try: organizer = each_meeting_info.select('div[class*="zhuban_danwei_big"]')[0].div.text.split(u':')[-1].strip() except IndexError: organizer = ' ' item['organizer'] = organizer print end_date, venue, organizer else: print '未找到%s的详细数据'%title item['end_date'] = begin_date #item['venue'] = ' ' #item['organizer'] = ' ' items.append(item) print 
'%s第%s页抓取完毕,准备写入'%(city_name, page) opera = DataOperator() opera.item_insert(data=items) try: next_url = soup.select('div[id="ctl00_MainPageHolder_webPage"]')[0].select('a')[-2]['href'] except KeyError: print '全部抓取完毕!' return else: next_page = next_url.split('=')[-1] print '找到%s第%s页的数据'%(city_name, next_page) self.get_items(city_name, city_code, next_page) else: print('未找到第{}页的数据'.format(page))
def get_items(self, begin_time):
    """Crawl www.77huiyi.com event listings for every category.

    For each category the first page is fetched only to read the total
    result count; pages are then crawled one by one and each page's items
    are written via DataOperator.

    :param begin_time: value of the site's ``msi`` query parameter
        (presumably an earliest-start-date filter — TODO confirm)
    """
    # Chinese category label -> URL path segment.
    category = {
        '行业交流': 'conferlist',
        '商业展会': 'exhibition',
        '文艺赛事': 'literature',
        '活动聚会': 'event'
    }
    for key, value in category.iteritems():
        print '正在爬取%s的数据' % key
        first_url = 'http://www.77huiyi.com/meet/%s/?mc=&msi=%s&msa=&page=%s' % (
            value, begin_time, '1')
        print first_url
        try:
            data = self.get_data(first_url)
            # "共N条" on the page gives the total result count.
            pattern_numbers = re.compile(u'共(\d+)条')
            numbers = int(re.findall(pattern_numbers, data)[0])
            pages = int(math.ceil(numbers / 20.0))  # 20 results per page
        except Exception as e:
            print('error:', e)
            continue
        if pages == 0:
            print('在%s下没找到数据' % key)
            continue
        else:
            print('在%s下有%s条数据,共%s页' % (key, numbers, pages))
        for page in range(1, int(pages) + 1):
            items = []
            url = 'http://www.77huiyi.com/meet/%s/?mc=&msi=%s&msa=&page=%s' % (
                value, begin_time, page)
            print '正在爬取第%s页的数据' % page
            print url
            data = self.get_data(url)
            if not data:
                print('未找到第%s页的数据' % page)
                continue
            # NOTE: the raw data prints fine but the soup comes out garbled
            # (mojibake); this encoding issue is still unresolved.
            soup = BeautifulSoup(data)
            meetings = soup.findAll('ul', {'class': 'clearfix'})[1].findAll('li')
            for meeting in meetings:
                # Template row; missing fields stay as single spaces.
                item = {
                    'sid': self.sid, 'begin_date': ' ', 'end_date': ' ',
                    'id': ' ', 'title': ' ', 'industry': ' ', 'city': ' ',
                    'venue': ' ', 'organizer': ' ', 'site': ' ',
                    'visitor': ' ', 'area': ' ', 'history_info_tag': '0'
                }
                meeting_info = meeting.p
                url = meeting_info.a['href']
                id = url.split('/')[-2]
                item['id'] = id
                item['url'] = url  # extra key beyond the template
                title = meeting_info.a.string
                item['title'] = title
                begin_date = meeting_info.select('span')[1].select('i')[0].string
                item['begin_date'] = begin_date
                city_tmp = meeting_info.select('span')[1].select('i')[1].string
                try:
                    # Keep the last whitespace-separated token (the city).
                    city = city_tmp.split()[-1]
                except IndexError:
                    city = city_tmp
                item['city'] = city
                print url, title, begin_date, city
                each_meeting_url = url
                # each_meeting_data = urllib2.urlopen(each_meeting_url).read().decode('utf-8', 'ignore')
                try:
                    each_meeting_data = requests.get(each_meeting_url).text
                    each_meeting_soup = BeautifulSoup(each_meeting_data)
                    each_meeting_info = each_meeting_soup.select(
                        'div[class*="conference-info"]')[0]
                except Exception as e:
                    print('未找到%s的具体信息:%s' % (id, e))
                    # Detail page unavailable: assume a one-day event.
                    item['end_date'] = begin_date
                else:
                    # Second span reads "begin~end"; keep the end date.
                    end_date = each_meeting_info.select(
                        'span')[1].text.split('~')[-1].split()[0]
                    item['end_date'] = end_date
                    loc = each_meeting_info.select(
                        'span')[2].text.split()[-1]
                    item['venue'] = loc
                    print end_date, loc
                items.append(item)
            # Persist each page as soon as it is complete.
            opera = DataOperator()
            opera.item_insert(data=items)
    return
def get_items(self):
    """Crawl the meeting.edu.cn XML listing and write items per page.

    Page 1 is fetched first to read ``pagecount``; every page's
    ``<meeting>`` entries are then parsed, each detail page is scraped
    with a regex for end date / venue / organizer / site, and the page's
    items are written via DataOperator.

    :returns: None (also returns None early if page 1 cannot be parsed)
    """
    try:
        data = self.get_data(1)
        soup = BeautifulSoup(data)
        pages = int(soup.pagecount.string)
    except Exception as e:
        print('error:', e)
        return None
    print '共%s页' % pages
    for page in range(1, pages + 1):
        items = []
        print '正在抓取第%s页的数据' % page
        data = self.get_data(page)
        if data:
            soup = BeautifulSoup(data)
            meetings = soup.findAll('meeting')
            print '当前为第%s页' % soup.pageno
            for meeting in meetings:
                # Template row; missing fields stay as single spaces.
                item = {
                    'sid': self.sid, 'begin_date': ' ', 'end_date': ' ',
                    'id': ' ', 'title': ' ', 'industry': ' ', 'city': ' ',
                    'venue': ' ', 'organizer': ' ', 'site': ' ',
                    'visitor': ' ', 'area': ' ', 'history_info_tag': '0'
                }
                id = meeting.meetingid.string
                item['id'] = id
                item['title'] = meeting.meetingtitle.string
                begin_date = meeting.meetingtime.string
                item['begin_date'] = begin_date
                city = meeting.meetingaddress.string
                item['city'] = city
                if meeting.subject.string:
                    item['industry'] = meeting.subject.string
                each_meeting_url = 'http://www.meeting.edu.cn/meeting/meeting/notice/meetingAction-%s!detail.action' % id
                print each_meeting_url
                try:
                    each_meeting_data = urllib2.urlopen(
                        each_meeting_url).read()
                except Exception, e:
                    print 'error:', e
                else:
                    # Pull (begin, end, venue, organizer, site) out of the
                    # detail page's table in one regex pass.
                    pattern = re.compile(
                        '开始日期.*?<td.*?>(.*?)</td>.*?结束日期.*?<td.*?>(.*?)</td>.*?具体地点.*?<td.*?>(.*?)</td>.*?主办单位.*?<td.*?>(.*?)</td>.*?会议网站.*?<td.*?>.*?<a.*?>(.*?)</a>',
                        re.S)
                    # NOTE(review): [0] raises an uncaught IndexError when the
                    # regex finds nothing, aborting the whole run — confirm
                    # whether every detail page matches this layout.
                    each_meeting_item = re.findall(pattern, each_meeting_data)[0]
                    print each_meeting_item
                    end_date = each_meeting_item[1].strip()
                    print end_date
                    if not end_date:
                        # Missing end date: assume a one-day event.
                        end_date = begin_date
                    item['end_date'] = end_date
                    location = each_meeting_item[2].strip()
                    print location
                    if not location:
                        # Fall back to the listing's city field.
                        location = city
                    item['venue'] = location
                    organizer = each_meeting_item[3].strip()
                    print organizer
                    item['organizer'] = organizer
                    site = each_meeting_item[4].strip()
                    item['site'] = site
                    print site
                items.append(item)
                print item
            print '第%s页抓取完毕' % page
            print '准备写入第%s页的数据' % page
            opera = DataOperator()
            opera.item_insert(data=items)
        else:
            print '未找到第%s页的数据' % page
def pre_spider(self):
    """Prepare the MySQL tables before a crawl run.

    Snapshots the official table into its temp copy, rotates the current
    table into the "last" table, then empties the tables the new crawl
    will fill. The DataOperator connection is closed afterwards.
    """
    logger.info('爬虫开始前的操作')
    db = DataOperator()
    # Snapshot official -> official_temp.
    db.truncate_table('eventlist_official_temp')
    db.from_official_to_temp_official()
    # Rotate current -> last.
    db.truncate_table('eventlist_last')
    db.from_current_to_last()
    # Clear the tables the upcoming crawl writes into.
    db.truncate_table('eventlist_current')
    db.truncate_table('eventlist_unique')
    db.mysql_close()
def get_data(self, page, city_name):
    """Scrape one listing page of zhanhui.3158.cn and persist fresh items.

    Exhibitions whose start month is before the crawler's current month
    are skipped; the rest are enriched with the organizer from the detail
    page and written via DataOperator.

    :param page: 1-based page number
    :param city_name: city path segment used in the listing URL
    """
    # One regex pass captures (id, title, industry, dates, city, venue)
    # for every <dd> entry on the listing page.
    pattern = re.compile(
        '<dd>.*?<a.*?(\d+).html".*?>(.*?)</a>.*?<p>.*?<a.*?>(.*?)</a>.*?<i.*?>(.*?)</i>.*?<p>.*?<a.*?>(.*?)</a>.*?<a.*?>(.*?)</a>.*?</dd>',
        re.S)
    # Hoisted out of the loop: the organizer pattern is loop-invariant.
    pattern_organizer = re.compile('主办单位:(.*?)</span>', re.S)
    items = []
    url = 'http://zhanhui.3158.cn/zhxx/all/trade/%s/%s/' % (city_name, str(page))
    print('正在抓取第%s页的数据' % page)
    print(url)
    html = self.get_html(url)
    if html:
        data = re.findall(pattern, html)
        for i in data:
            # Template row; missing fields stay as single spaces.
            item = {
                'begin_date': ' ', 'end_date': ' ', 'id': ' ', 'title': ' ',
                'industry': ' ', 'city': ' ', 'venue': ' ', 'organizer': ' ',
                'site': ' ', 'visitor': ' ', 'area': ' ',
                'history_info_tag': '0'
            }
            # The date field reads "YYYY-MM-DD ... YYYY-MM-DD".
            begin_date = i[3].split(' ')[0]
            end_date = i[3].split(' ')[-1]
            year = int(begin_date.split('-')[0])
            month = int(begin_date.split('-')[1])
            # Keep events starting this month or later
            # (self.current_date is assumed to be 'YYYYMM' — TODO confirm).
            if year >= int(self.current_date[:4]) + 1 or year == int(
                    self.current_date[:4]) and month >= int(
                    self.current_date[-2:]):
                item['sid'] = self.sid
                item['begin_date'] = begin_date
                item['end_date'] = end_date
                id = i[0]
                item['id'] = id
                title = i[1]
                print(title)
                item['title'] = title
                industry = i[2]
                item['industry'] = industry
                city = i[4]
                item['city'] = city
                venue = i[5]
                item['venue'] = venue
                h2 = 'http://zhanhui.3158.cn/zhxx/n%s.html' % i[0]
                print(h2)
                html2 = self.get_html(h2)
                # BUG FIX: the original passed html2 straight to re.findall;
                # get_html can return a falsy value on failure (the listing
                # fetch above is guarded the same way), which raised TypeError.
                organizer_tmp1 = re.findall(pattern_organizer, html2) if html2 else []
                if organizer_tmp1:
                    try:
                        # Organizer may be wrapped in an extra tag.
                        organizer_tmp2 = organizer_tmp1[0].split(
                            '>')[1].split('<')[0]
                    except IndexError:
                        # Plain text: take the first 、- or space-separated name.
                        org = re.split('、|\s', organizer_tmp1[0].strip())[0]
                    else:
                        org = re.split('、|\s', organizer_tmp2)[0]
                else:
                    org = ' '
                print(org)
                item['organizer'] = org
                items.append(item)
        if items:
            print('准备写入第%s页的数据' % page)
            opera = DataOperator()
            opera.item_insert(data=items)
        else:
            print('第%s页的数据全部过期,不会写入!' % page)
    else:
        print('未找到%s第%s页的数据' % (city_name, page))
def getItems(self, page):
    """Parse one JSON listing page of events and persist the fresh ones.

    Events whose begin month is before the crawler's current month are
    skipped. When the response parses to something falsy (treated as the
    site banning us), the method sleeps ~6 minutes and retries the same
    page recursively.

    :param page: page number passed through to self.get_data
    """
    data = self.get_data(page=page)
    data = json.loads(data)
    if data:
        print '找到第%s页的数据' % page
        meetings = data['events']
        items = []
        if meetings:
            for meeting in meetings:
                if not meeting:
                    # A falsy entry marks the end of the feed.
                    print '全部爬取完毕'
                    break
                # Template row; missing fields stay as single spaces.
                item = {
                    'sid': self.sid, 'begin_date': ' ', 'end_date': ' ',
                    'id': ' ', 'title': ' ', 'industry': ' ', 'city': ' ',
                    'venue': ' ', 'organizer': ' ', 'site': ' ',
                    'visitor': ' ', 'area': ' ', 'history_info_tag': '0'
                }
                itemid = meeting['event_id']
                title = meeting['event_name']
                begin_date = meeting['event_begin_time'][:10]  # keep YYYY-MM-DD
                print itemid, title
                # Compare 'YYYYMM' against self.current_date to drop stale events.
                if ''.join(begin_date.split('-')[:2]) >= self.current_date:
                    end_date = meeting['event_end_time'][:10]
                    try:
                        city = meeting['event_city_info'][0][
                            'district_name']
                    except IndexError:
                        city = ''
                    try:
                        venue = meeting['event_venue_info'][0].get('title')
                    except IndexError:
                        venue = city
                    visitor = meeting.get('event_scale', ' ')
                    if not visitor:
                        visitor = ''
                    try:
                        organizer = meeting['event_sponsor'][0].get(
                            'ns_name', '')
                    except IndexError:
                        organizer = ''
                    print itemid, title, city, venue, begin_date, end_date, visitor, organizer
                    item['id'] = itemid
                    item['title'] = title
                    item['city'] = city
                    item['venue'] = venue
                    item['begin_date'] = begin_date
                    item['end_date'] = end_date
                    item['visitor'] = visitor
                    # NOTE(review): organizer is parsed but never stored in item.
                    items.append(item)
                else:
                    print '第%s页ID为%s的数据过期' % (page, itemid)
            opera = DataOperator()
            opera.item_insert(data=items)
            return
        else:
            print '第%s页没有展会数据' % page
            return
    else:
        # 3600 ticks of 0.1s ≈ 6 minutes, with a tqdm progress bar.
        print '访问到%s页时被禁止,暂停6分钟后继续!' % (page)
        for i in tqdm(range(3600)):
            time.sleep(.1)
        self.getItems(page)
class HuoDongShu: def __init__(self, sid, current_date): self.url = 'http://www.huodongshu.com' self.current_date = current_date self.sid = sid def get_html(self, page, month): headers = { 'Accept': '*/*', 'Accept-Encoding': 'gzip, deflate', 'Accept-Language': 'zh-CN,zh;q=0.8', 'Connection': 'keep-alive', 'Host': 'www.huodongshu.com', 'Origin': 'http://www.huodongshu.com', 'Referer': 'http://www.huodongshu.com/html/find.html', 'Upgrade-Insecure-Requests': '1', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36', 'X-Requested-With': 'XMLHttpRequest' } params = { 'count': '10', 'type': '1', 'category_one': '222', 'category_two': 'all', 'city_name': '222', 'time_can': month + 3, 'page': page } url = self.url + '/event/getComEventListPcData.do' print url response = requests.post(url, headers=headers, params=params) data = response.json() if data['msg'] == 'ok': return data['data'] def get_items(self, month, page=1): data = self.get_html(page=page, month=month) pages, numbers = data['pageCount'], data['total'] #print '%月份共有%s页%s条数据,目前正在抓取第%s页的数据'%(pages, numbers, page) for page in range(1, int(pages) + 1): print '%s月份共有%s页%s条数据,目前正在抓取第%s页的数据' % (month, pages, numbers, page) data = self.get_html(page=page, month=month) events = data['list'] items = [] for event in events: item = { 'sid': self.sid, 'begin_date': ' ', 'end_date': ' ', 'id': ' ', 'title': ' ', 'industry': ' ', 'city': ' ', 'venue': ' ', 'organizer': ' ', 'site': ' ', 'visitor': ' ', 'area': ' ', 'history_info_tag': '0' } id = event.get('id') title = event.get('name') venue = event.get('place') try: city = event.get('city_name').rstrip('市') except Exception, e: print 'error:', e address = jieba.analyse.extract_tags( venue, allowPOS=['ns']) #对展会场馆分词保留地名 if address: city = address[0] else: city = '' begin_date = time.strftime( '%Y-%m-%d', time.localtime(float(event.get('start_time')))) end_date = time.strftime( '%Y-%m-%d', 
time.localtime(float(event.get('end_time')))) eachUrl = event.get('long_url') if eachUrl: eachData = requests.get(eachUrl).content try: visitor = BeautifulSoup(eachData).find( 'span', { 'data-id': 'dimensions' }).text.rstrip('人') except Exception, e: print 'error:', e visitor = '' item['city'] = city item['begin_date'] = begin_date item['end_date'] = end_date item['id'] = id item['title'] = title item['venue'] = venue item['visitor'] = visitor items.append(item) print id, title, city, begin_date, end_date, venue, visitor print '正在写入%s月份第%s页的数据' % (month, page) opera = DataOperator() opera.item_insert(data=items)