class CrawlerDepth:
    def __init__(self, db_path):
        self.request_helper = RequestHelper()
        self.db_helper = DataBaseHelper(db_path)

    def get_detail_url(self, url):
        """
        Match route-detail URLs in the page at the given url with a regex and store them.
        :param url:
        :return:
        """
        detail_url_html = self.request_helper.send_request(url)
        # Tuniu
        tuniu_detail_url_id_regex = re.compile(r'http://www\.tuniu\.com/(?:tour|tours)/(?P<route_id>\d{9})')
        detail_url_id_ret = tuniu_detail_url_id_regex.findall(detail_url_html)
        detail_url_id_set = set(detail_url_id_ret)
        for item in detail_url_id_set:
            url = 'http://www.tuniu.com/tour/' + item
            self.db_helper.insert_into_routeurl(url)
        # Ctrip
        # xiecheng_detail_url_regex = re.compile(r'<a href="(?P<route_url>http://vacations.ctrip.com/(?:grouptravel|freetravel)/p\w+\.html[\S\s]+?)"')
        # detail_url_ret = xiecheng_detail_url_regex.findall(detail_url_html)
        # detail_url_set = set(item for item in detail_url_ret)

    def crawler_depth(self, root_url, depth=CRAWLER_DEPTH):
        """
        Crawl to a fixed depth: use BeautifulSoup to collect every link on each page into
        the new_url set, then use that set as the next round's root URLs, repeating until
        depth is reached.
        :param root_url: list of root site URLs
        :param depth: crawl depth
        :return:
        """
        for k in range(depth):
            print '-----Depth: %d' % k
            new_url = set()
            for root_url_item in root_url:
                url_html = self.request_helper.send_request(root_url_item)
                if url_html == '':
                    continue
                soup = BeautifulSoup(url_html, 'lxml')
                links = soup.find_all('a')  # all anchor tags on the page
                for link in links:
                    if 'href' in dict(link.attrs):  # the tag has an href attribute
                        url = urljoin(root_url_item, link['href'])  # resolve relative paths against the page URL
                        if url.find("'") != -1:  # skip URLs containing illegal characters
                            continue
                        url = url.split('#')[0]  # drop the fragment part
                        if url[0:4] == 'http':
                            new_url.add(url)
                            self.get_detail_url(url)  # collect any route-detail URLs found on this page
            root_url = new_url
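# Hedged usage sketch (added for illustration, not part of the original crawler): assumes
# CrawlerDepth and its RequestHelper/DataBaseHelper dependencies are importable and reuses
# the '../Data/TravelInfo.db' path seen elsewhere in this code; the seed URL and depth are
# placeholder values.
def _demo_crawler_depth():
    depth_crawler = CrawlerDepth('../Data/TravelInfo.db')
    depth_crawler.crawler_depth(['http://www.tuniu.com'], depth=2)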
class CrawlerLastUrl:
    def __init__(self, db_path):
        self.db_helper = DataBaseHelper(db_path)

    def get_last_url(self):
        """
        Return the route URLs collected in routeurl that have not yet been crawled into
        routeinfo, using the symmetric difference of the two id sets.
        :return: list of URLs still to crawl
        """
        url_nums = self.db_helper.get_table_count('routeurl')
        total_url_id = [x + 1 for x in range(url_nums)]
        has_crawled_url_id = [x[0] for x in self.db_helper.select_all_data('routeinfo', 'urlid')]
        last_url_id = list(set(total_url_id) ^ set(has_crawled_url_id))
        last_url_list = []
        for item in last_url_id:
            last_url_list.extend([x[0] for x in self.db_helper.select_one_data('url', 'routeurl', 'rowid', item)])
        return last_url_list
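# Hedged usage sketch (illustration only): get_last_url() returns the route URLs whose ids
# exist in routeurl but not yet in routeinfo, i.e. what is left to crawl after an
# interrupted run. The db path is the one hard-coded elsewhere in this code.
def _demo_crawler_last_url():
    remaining_urls = CrawlerLastUrl('../Data/TravelInfo.db').get_last_url()
    print 'URLs left to crawl: %d' % len(remaining_urls)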
def send_request(self, url):
    """
    Send a request and fetch the HTML of the page at url.
    :param url:
    :return: response HTML
    """
    try:
        request = urllib2.Request(url, headers=self.header)
        response = urllib2.urlopen(request, timeout=self.timeout)  # timeout setting
        result = response.read()
        time.sleep(3)
        return result
    except Exception:
        print '---Request %s Error!---' % url
        if len(url) > 36:  # a comment request failed; extract the route id and record the plain route URL
            route_id_regex = re.compile(r'^http\S*?productId=(?P<route_id>\d{9})')
            route_id_ret = route_id_regex.search(url)
            if route_id_ret is not None:
                url = 'http://www.tuniu.com/tour/' + route_id_ret.group('route_id')
        db_helper = DataBaseHelper('../Data/TravelInfo.db')  # database path is hard-coded
        db_helper.insert_into_routeerrorurl(url)
        return ''
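# Hedged sketch of the enclosing RequestHelper class (the original only shows the
# send_request method above, so this __init__ is an assumption): the User-Agent and
# 5-second timeout mirror the values hard-coded in CrawlerBreadth.get_html below.
class RequestHelper:
    def __init__(self, timeout=5):
        self.timeout = timeout  # passed to urllib2.urlopen in send_request
        self.header = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/47.0.2526.80 Safari/537.36'
        }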
class CrawlerData:
    def __init__(self, db_path):
        self.request_helper = RequestHelper()
        self.db_helper = DataBaseHelper(db_path)

    def get_route_info(self, url_list):
        """
        Fetch detailed route information and insert it into the database.
        :param url_list: list of route URLs
        :return:
        """
        for url in url_list:
            url_id = self.db_helper.is_exist('routeurl', 'url', url)  # look up the id of this url in routeurl
            print 'Information: %d ' % url_id
            detail_page_html = self.request_helper.send_request(url)
            if detail_page_html == '':
                continue
            soup = BeautifulSoup(detail_page_html, 'html5lib')

            # Route details
            route_info_dict = {'title': '', 'satisfaction': '0', 'summary': '', 'text': ''}

            # Route title
            html_parser = HTMLParser.HTMLParser()
            route_name = soup.title.get_text(strip=True) if soup.title is not None else '_'
            route_info_dict['title'] = html_parser.unescape(route_name).split('_')[0]

            # Overall satisfaction score
            tag_grade = soup.find('a', attrs={'class': 'resource-statisfaction-number'})
            route_info_dict['satisfaction'] = tag_grade.get_text(strip=True)[:-1] if tag_grade is not None else '0'

            # Itinerary summary
            tag_summary = soup.find('div', attrs={'class': 'resource-section-content-inner'})
            route_info_dict['summary'] = re.sub(r'\n+|\t+|\s+|\r', '', tag_summary.get_text(strip=True)) if tag_summary is not None else ''

            # Full route description
            tag_detail = soup.find('div', attrs={'class': 'detail-sections'})
            route_info_dict['text'] = re.sub(r'\n+|\t+|\s+|\r', '', tag_detail.get_text(strip=True)) if tag_detail is not None else ''

            # Insert a new row into the routeinfo table
            route_info_dict = self.normalize_sql(route_info_dict)  # normalize for SQL: replace single quotes in the data
            self.db_helper.insert_into_routeinfo(url_id,
                                                 route_info_dict['title'],
                                                 int(route_info_dict['satisfaction']),
                                                 route_info_dict['summary'],
                                                 route_info_dict['text'])
            # Departure cities and prices for this route
            self.get_route_departure(url_id, detail_page_html)
            # Comment data for this route
            self.get_route_comment(url_id, url)

    def normalize_sql(self, route_info):
        """
        Normalize values before building the SQL statement: single quotes in the data
        would break the statement, so replace them with double quotes.
        :param route_info: dict
        :return:
        """
        for i in route_info:
            route_info[i] = route_info[i].replace('\'', '\"')
        return route_info

    def get_route_departure(self, urlid, page_html):
        """
        Extract the departure cities and prices for a route and insert them into the routedep table.
        :param urlid:
        :param page_html:
        :return:
        """
        # Departure city and price info; the price differs per departure city
        script_regex = re.compile(r'window\.pageData[\S\s]+?departCityInfo":(?P<route_departure>.*?),"backCityInfo', re.M)
        script_ret = script_regex.search(page_html)
        if script_ret is not None:
            route_departure_list = json.loads(script_ret.group('route_departure'))
            if route_departure_list is not None:
                for item in route_departure_list:
                    route_departure = item['name'] if item['name'] is not None else 'UNKNOWN'
                    route_price = int(item['price']) if item['price'] is not None else 0
                    self.db_helper.insert_into_routedep(urlid, route_departure, route_price)

    def get_route_comment(self, url_id, url):
        """
        Request the comment JSON data for a route.
        :param url_id:
        :param url:
        :return:
        """
        route_id = url[-9:]
        route_comment = {'outline_comment': '', 'detail_comment': ''}
        outline_comment_url = 'http://www.tuniu.com/papi/product/remarkStatus?refresh=1&productId=' + route_id + '&productType=1'
        route_comment['outline_comment'] = self.request_helper.send_request(outline_comment_url)

        detail_1_comment_url = 'http://www.tuniu.com/papi/product/remarkList?refresh=1&productId=' + route_id + '&productType=1&page=1'
        page_1_comment = self.request_helper.send_request(detail_1_comment_url)
        if page_1_comment == '':
            self.db_helper.insert_into_routecom(url_id, route_comment['outline_comment'], '')
        else:
            comment_json = json.loads(page_1_comment)
            try:
                total_pages = comment_json['data']['totalPages']
            except (KeyError, TypeError):
                return
            for i in range(total_pages):
                detail_comment_url = 'http://www.tuniu.com/papi/product/remarkList?refresh=1&productId=' + route_id + '&productType=1&page=' + str(i + 1)
                route_comment['detail_comment'] = self.request_helper.send_request(detail_comment_url)
                # comment_json = json.load(comment_page)
                route_comment = self.normalize_sql(route_comment)
                self.db_helper.insert_into_routecom(url_id, route_comment['outline_comment'], route_comment['detail_comment'])

    # def get_text_only(self, soup):
    #     """
    #     Get the text (without tags) from an HTML page by walking the tree recursively,
    #     preserving the order in which the text appears.
    #     :param soup: the tagged page
    #     :return: the page text
    #     """
    #     text = soup.string  # set only when the node has a single child; otherwise None
    #     if text is None:
    #         next_contents = soup.contents  # list of this node's children
    #         result_text = ''
    #         for content_item in next_contents:
    #             sub_text = self.get_text_only(content_item)
    #             result_text += sub_text + '\n'
    #         return result_text
    #     else:
    #         return text.strip()  # strip leading/trailing whitespace

    # def separte_words(self, text):
    #     """
    #     Split a string into individual words on any non-word character.
    #     :param text: the string to split
    #     :return: list of words
    #     """
    #     result_list = []
    #     splitter = re.compile(ur'[^a-zA-Z0-9_\u4e00-\u9fa5]')  # in Python 2.7 use 'ur' to match anything that is not a letter, digit, underscore, or Chinese character
    #     for s in splitter.split(text):  # use jieba for Chinese word segmentation
    #         if s != '':
    #             result_list.extend(jieba.lcut(s.lower()))
    #     return result_list

    # def process_start(self, tasks):
    #     """
    #     Start a process and run its tasks as coroutines.
    #     :param tasks:
    #     :return:
    #     """
    #     gevent_task_list = []  # coroutine tasks
    #     for item in tasks:
    #         gevent_task_list.append(gevent.spawn(self.get_route_info, item))
    #     gevent.joinall(gevent_task_list)

    def crawl_data(self, route_url_list, process_url_num=PROCESS_URL_NUM):
        """
        Walk the route URL list in chunks and scrape each page's data with BeautifulSoup,
        spawning one gevent coroutine for every process_url_num URLs.
        :param route_url_list: list of route URLs
        :param process_url_num: spawn one coroutine per process_url_num URLs
        :return:
        """
        url_count = 0  # counts URLs added to the current chunk
        task_list = []
        gevent_list = []
        for route_url_item in route_url_list:
            url_count += 1
            task_list.append(route_url_item)
            if url_count == process_url_num:
                # p = Process(target=self.process_start, args=(task_list,))
                # p.start()
                gevent_list.append(gevent.spawn(self.get_route_info, task_list))
                task_list = []  # reset the task chunk
                url_count = 0  # reset the counter
        if len(task_list) != 0:  # URLs left over after the loop
            # p = Process(target=self.process_start, args=(task_list,))  # run the remaining URLs in one last process
            # p.start()
            gevent_list.append(gevent.spawn(self.get_route_info, task_list))
        gevent.joinall(gevent_list)
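# Hedged end-to-end sketch (illustration only, not part of the original project): resume
# from the URLs that have not been scraped yet and feed them to CrawlerData. Assumes the
# classes above share the '../Data/TravelInfo.db' SQLite file used elsewhere in this code.
def _demo_crawl_data():
    db_path = '../Data/TravelInfo.db'
    remaining_urls = CrawlerLastUrl(db_path).get_last_url()
    CrawlerData(db_path).crawl_data(remaining_urls)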
class CrawlerBreadth:
    def __init__(self, db_path):
        self.db_helper = DataBaseHelper(db_path)

    def get_html(self, url):
        """
        Send a request and fetch the HTML of the page at url.
        :param url:
        :return: response HTML
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36'
        }
        try:
            request = urllib2.Request(url, headers=headers)
            response = urllib2.urlopen(request, timeout=5)  # timeout setting
            result = response.read()
            time.sleep(3)
            return result
        except Exception:
            print '----------------error------------------'
            return ''

    def get_subsite_url(self, root_url):
        """
        Scrape the regional sub-site URLs from the Tuniu home page,
        e.g. the Shanghai site: http://sh.tuniu.com
        :return: dict of sub-site URLs keyed by station name, e.g. {Shanghai: http://sh.tuniu.com}
        """
        stations_url_dict = {}  # departure station name -> sub-site URL
        soup = BeautifulSoup(self.get_html(root_url), 'lxml')
        tag_div_list = soup.find_all('div', attrs={"class": "tagBox"})  # the links live inside this div
        if not tag_div_list:
            return stations_url_dict

        # Regex matching
        subsite_regex = re.compile(r'<a[\S\s]*?href="(?P<subsite_url>\S+?)"[\S\s]*?>(?P<subsite_name>\S+?)</a>')
        subsite_ret = subsite_regex.findall(str(tag_div_list[0]))
        subsite_set = set(subsite_ret)
        for item in subsite_set:
            stations_url_dict[item[1]] = item[0]

        soup = BeautifulSoup(str(tag_div_list[0]), 'lxml')
        tag_a_list = soup.find_all('a', attrs={'href': True})
        tag_a_set = set(filter(lambda x: x['href'].startswith('http://') and x['href'].endswith('.tuniu.com'), tag_a_list))
        for item in tag_a_set:
            # self.db_helper.insert_into_subsite(item['href'], item.get_text(strip=True))
            stations_url_dict[item.get_text(strip=True)] = item['href']
        return stations_url_dict

    def get_catalog_url(self, stations_url):
        """
        Scrape the travel catalog from one sub-site home page, e.g. the destinations under
        Shanghai weekend trips, such as Putuoshan: http://www.tuniu.com/guide/v-pts-8501/?pcat=5882
        :param stations_url: departure sub-site URL, e.g. http://sh.tuniu.com
        :return: dict of destination names to URLs, e.g. {Putuoshan: http://www.tuniu.com/guide/v-pts-8501/?pcat=5882}
        """
        soup = BeautifulSoup(self.get_html(stations_url), 'html5lib')
        tag_div_list = soup.find_all('div', attrs={"class": "catalog_third"})
        destinations_dict = {}  # destination name -> URL, e.g. Putuoshan: http://www.tuniu.com/guide/v-pts-8501/?pcat=5882
        if not tag_div_list:
            return destinations_dict

        # bs4 matching
        soup = BeautifulSoup(str(tag_div_list[0]), 'html5lib')
        tag_a_list = soup.find_all('a', attrs={'href': True})
        destinations_url_set = set(filter(lambda x: x['href'].startswith('http://') and x.get_text(strip=True) != '', tag_a_list))
        for item in destinations_url_set:
            destinations_dict[item.get_text(strip=True)] = item['href']
        return destinations_dict

    def get_details_url(self, route_list_url):
        """
        Scrape route-detail URLs from a listing page and store them in the database,
        e.g. http://www.tuniu.com/tour/210052165
        :param route_list_url: listing page URL, e.g. http://www.tuniu.com/guide/v-pts-8501/?pcat=5882
        :return:
        """
        route_id_regex = re.compile(r'http://www\.tuniu\.com/(?:tour|tours)/(?P<route_id>\d{9})')
        route_id_ret = route_id_regex.findall(self.get_html(route_list_url))
        route_id_set = set(route_id_ret)
        for item in route_id_set:
            url = 'http://www.tuniu.com/tour/' + item
            self.db_helper.insert_into_routeurl(url)
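# Hedged breadth-first driver sketch (illustration only): go from the Tuniu home page to
# the regional sub-sites, from each sub-site to its destination catalog, and from each
# catalog page to the route-detail URLs that get_details_url stores in routeurl.
# The home-page URL is an assumption.
def _demo_crawler_breadth():
    breadth_crawler = CrawlerBreadth('../Data/TravelInfo.db')
    for station_name, station_url in breadth_crawler.get_subsite_url('http://www.tuniu.com').items():
        for destination_name, catalog_url in breadth_crawler.get_catalog_url(station_url).items():
            breadth_crawler.get_details_url(catalog_url)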