Code example #1
class CrawlerDepth:
    def __init__(self, db_path):
        self.request_helper = RequestHelper()
        self.db_helper = DataBaseHelper(db_path)

    def get_detail_url(self, url):
        """
        Extract route-detail URLs from the page at `url` via regex matching.
        :param url:
        :return:
        """
        detail_url_html = self.request_helper.send_request(url)

        # Tuniu (tuniu.com)
        tuniu_detail_url_id_regex = re.compile(r'http://www\.tuniu\.com/(?:tour|tours)/(?P<route_id>\d{9})')
        detail_url_id_ret = tuniu_detail_url_id_regex.findall(detail_url_html)
        detail_url_id_set = set(detail_url_id_ret)  # de-duplicate the route ids
        for item in detail_url_id_set:
            url = 'http://www.tuniu.com/tour/' + item
            self.db_helper.insert_into_routeurl(url)

        # Ctrip (ctrip.com), kept for reference
        # xiecheng_detail_url_regex = re.compile(r'<a href="(?P<route_url>http://vacations.ctrip.com/(?:grouptravel|freetravel)/p\w+\.html[\S\s]+?)"')
        # detail_url_ret = xiecheng_detail_url_regex.findall(detail_url_html)
        # detail_url_set = set(item for item in detail_url_ret)

    def crawler_depth(self, root_url, depth=CRAWLER_DEPTH):
        """
        Crawl function: use BeautifulSoup to collect every link on each page and add it to the
        new_url set; at the end of each level, new_url replaces root_url for the next iteration,
        until depth is exhausted.
        :param root_url: list of root site URLs
        :param depth: crawl depth
        :return:
        """
        for k in range(depth):
            print '-----Depth: %d' % k
            new_url = set()
            for root_url_item in root_url:
                url_html = self.request_helper.send_request(root_url_item)
                if url_html == '':
                    continue
                soup = BeautifulSoup(url_html, 'lxml')
                links = soup.find_all('a')  # all anchor tags on the page
                for link in links:
                    if 'href' in dict(link.attrs):  # the tag actually carries an href attribute
                        url = urljoin(root_url_item, link['href'])  # resolve relative paths against the current page
                        if url.find("'") != -1:  # skip URLs containing illegal characters
                            continue
                        url = url.split('#')[0]  # drop the fragment part
                        if url[0:4] == 'http':
                            new_url.add(url)
                        self.get_detail_url(url)  # collect route-detail URLs matched on the current page
            root_url = new_url
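
The route extraction in get_detail_url hinges on one regex: any link of the form http://www.tuniu.com/tour/<9-digit id> (or /tours/) is captured, de-duplicated, and rebuilt as a canonical /tour/ URL. A minimal, self-contained sketch of that step; the sample HTML and the extract_route_urls helper are illustrative, not part of the project:

# -*- coding: utf-8 -*-
import re

TUNIU_ROUTE_REGEX = re.compile(r'http://www\.tuniu\.com/(?:tour|tours)/(?P<route_id>\d{9})')

def extract_route_urls(html):
    """Return a canonical /tour/ URL for every 9-digit route id found in html."""
    return set('http://www.tuniu.com/tour/' + route_id
               for route_id in TUNIU_ROUTE_REGEX.findall(html))

sample_html = '''
<a href="http://www.tuniu.com/tour/210052165">Route A</a>
<a href="http://www.tuniu.com/tours/210052165?from=list">Route A again</a>
<a href="http://www.tuniu.com/guide/v-pts-8501/">Not a route page</a>
'''

print extract_route_urls(sample_html)
# set(['http://www.tuniu.com/tour/210052165'])  # duplicates collapse to one canonical URL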
Code example #2
class CrawlerLastUrl:
    def __init__(self, db_path):
        self.db_helper = DataBaseHelper(db_path)

    def get_last_url(self):
        """
        Return the route URLs that have not been crawled yet: compare all rowids in the
        routeurl table against the urlids already stored in routeinfo, then look up the URL
        for each remaining rowid.
        :return: list of URLs still to be crawled
        """
        url_nums = self.db_helper.get_table_count('routeurl')
        total_url_id = [x + 1 for x in range(url_nums)]  # rowids are 1-based
        has_crawled_url_id = [
            x[0] for x in self.db_helper.select_all_data('routeinfo', 'urlid')
        ]
        # symmetric difference: ids present in routeurl but not yet in routeinfo
        last_url_id = list(set(total_url_id) ^ set(has_crawled_url_id))
        last_url_list = []
        for item in last_url_id:
            last_url_list.extend([
                x[0] for x in self.db_helper.select_one_data(
                    'url', 'routeurl', 'rowid', item)
            ])
        return last_url_list
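
The core of get_last_url is plain set arithmetic: assuming every urlid in routeinfo also exists as a rowid in routeurl, the symmetric difference of the two id sets is exactly the set of ids still to crawl. A standalone illustration with literal lists (the numbers are made up):

total_url_id = [1, 2, 3, 4, 5]     # rowids 1..N in routeurl
has_crawled_url_id = [1, 3, 5]     # urlids already stored in routeinfo

# Because has_crawled_url_id is a subset of total_url_id, the symmetric
# difference equals the plain difference: the ids not crawled yet.
print sorted(set(total_url_id) ^ set(has_crawled_url_id))   # [2, 4]
print sorted(set(total_url_id) - set(has_crawled_url_id))   # [2, 4]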
Code example #3
    def send_request(self, url):
        """
        Send a request and return the HTML of the page at `url`.
        :param url:
        :return: response HTML, or '' on failure
        """
        try:
            request = urllib2.Request(url, headers=self.header)
            response = urllib2.urlopen(request, timeout=self.timeout)  # per-request timeout
            result = response.read()
            time.sleep(3)
            return result
        except Exception:
            print '---Request %s Error!---' % url
            if len(url) > 36:  # a comment-API request failed: recover the route id and log the tour URL instead
                route_id_regex = re.compile(
                    r'^http\S*?productId=(?P<route_id>\d{9})')
                route_id_ret = route_id_regex.search(url)
                if route_id_ret is not None:
                    url = 'http://www.tuniu.com/tour/' + route_id_ret.group(
                        'route_id')
            db_helper = DataBaseHelper('../Data/TravelInfo.db')  # NOTE: the database path is hard-coded
            db_helper.insert_into_routeerrorurl(url)
            return ''
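
When a comment-API request fails, the URL that gets logged is not the long remarkList/remarkStatus URL but the canonical tour page, recovered from its productId parameter. A small standalone check of that fallback; the sample URL below is illustrative:

import re

route_id_regex = re.compile(r'^http\S*?productId=(?P<route_id>\d{9})')

failed_url = ('http://www.tuniu.com/papi/product/remarkList'
              '?refresh=1&productId=210052165&productType=1&page=1')
match = route_id_regex.search(failed_url)
if match is not None:
    print 'http://www.tuniu.com/tour/' + match.group('route_id')
    # http://www.tuniu.com/tour/210052165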
Code example #4
    def __init__(self, db_path):
        self.request_helper = RequestHelper()
        self.db_helper = DataBaseHelper(db_path)
Code example #5
class CrawlerData:
    def __init__(self, db_path):
        self.request_helper = RequestHelper()
        self.db_helper = DataBaseHelper(db_path)

    def get_route_info(self, url_list):
        """
        Fetch the detail page for each route URL and insert the parsed data into the database.
        :param url_list: list of route URLs
        :return:
        """
        for url in url_list:
            url_id = self.db_helper.is_exist('routeurl', 'url', url)  # look up the id of this url in the routeurl table
            print 'Information: %d ' % url_id
            detail_page_html = self.request_helper.send_request(url)
            if detail_page_html == '':
                continue

            soup = BeautifulSoup(detail_page_html, 'html5lib')
            # route detail fields
            route_info_dict = {'title': '', 'satisfaction': '0', 'summary': '', 'text': ''}
            # route title
            html_parser = HTMLParser.HTMLParser()
            route_name = soup.title.get_text(strip=True) if soup.title is not None else '_'
            route_info_dict['title'] = html_parser.unescape(route_name).split('_')[0]
            # overall satisfaction score
            tag_grade = soup.find('a', attrs={'class': 'resource-statisfaction-number'})
            route_info_dict['satisfaction'] = tag_grade.get_text(strip=True)[:-1] if tag_grade is not None else '0'
            # itinerary summary
            tag_summary = soup.find('div', attrs={'class': 'resource-section-content-inner'})
            route_info_dict['summary'] = re.sub(r'\n+|\t+|\s+|\r', '', tag_summary.get_text(strip=True)) if tag_summary is not None else ''
            # full route description
            tag_detail = soup.find('div', attrs={'class': 'detail-sections'})
            route_info_dict['text'] = re.sub(r'\n+|\t+|\s+|\r', '', tag_detail.get_text(strip=True)) if tag_detail is not None else ''
            # insert a new row into the routeinfo table
            route_info_dict = self.normalize_sql(route_info_dict)  # strip single quotes that would break the SQL statement
            self.db_helper.insert_into_routeinfo(url_id, route_info_dict['title'], int(route_info_dict['satisfaction']), route_info_dict['summary'], route_info_dict['text'])

            # departure cities and prices for this route
            self.get_route_departure(url_id, detail_page_html)

            # comment data for this route
            self.get_route_comment(url_id, url)

    def normalize_sql(self, route_info):
        """
        Escape values before building the SQL statement: replace single quotes in the data
        with double quotes so they cannot break the SQL syntax.
        :param route_info: dict of field name to value
        :return: the same dict with quotes replaced
        """
        for i in route_info:
            route_info[i] = route_info[i].replace('\'', '\"')
        return route_info

    def get_route_departure(self, urlid, page_html):
        """
        Extract the departure cities and prices for a route and insert them into the routedep table.
        :param urlid:
        :param page_html:
        :return:
        """
        # departure city and price: the same route is priced differently per departure city
        script_regex = re.compile(r'window\.pageData[\S\s]+?departCityInfo":(?P<route_departure>.*?),"backCityInfo',
                                  re.M)
        script_ret = script_regex.search(page_html)
        if script_ret is not None:
            route_departure_list = json.loads(script_ret.group('route_departure'))
            if route_departure_list is not None:
                for item in route_departure_list:
                    route_departure = item['name'] if item['name'] is not None else 'UNKNOWN'
                    route_price = int(item['price']) if item['price'] is not None else 0
                    self.db_helper.insert_into_routedep(urlid, route_departure, route_price)

    def get_route_comment(self, url_id, url):
        """
        Request the comment JSON for a route and insert it into the routecom table.
        :param url_id:
        :param url:
        :return:
        """
        route_id = url[-9:]
        route_comment = {'outline_comment': '', 'detail_comment': ''}
        outline_comment_url = 'http://www.tuniu.com/papi/product/remarkStatus?refresh=1&productId=' + route_id + '&productType=1'
        route_comment['outline_comment'] = self.request_helper.send_request(outline_comment_url)

        detail_1_comment_url = 'http://www.tuniu.com/papi/product/remarkList?refresh=1&productId=' + route_id + '&productType=1&page=1'
        page_1_comment = self.request_helper.send_request(detail_1_comment_url)
        if page_1_comment == '':
            self.db_helper.insert_into_routecom(url_id, route_comment['outline_comment'], '')
        else:
            comment_json = json.loads(page_1_comment)
            try:
                total_pages = comment_json['data']['totalPages']
            except (KeyError, TypeError):  # unexpected JSON structure
                return
            for i in range(total_pages):
                detail_comment_url = 'http://www.tuniu.com/papi/product/remarkList?refresh=1&productId=' + route_id + '&productType=1&page=' + str(i + 1)
                route_comment['detail_comment'] = self.request_helper.send_request(detail_comment_url)
                # comment_json = json.load(comment_page)
                route_comment = self.normalize_sql(route_comment)
                self.db_helper.insert_into_routecom(url_id, route_comment['outline_comment'], route_comment['detail_comment'])

    # def get_text_only(self, soup):
    #     """
    #     Recursively extract the plain text (no tags) from an HTML page, preserving the
    #     order in which the text appears.
    #     :param soup: the tagged page
    #     :return: the text of the page
    #     """
    #     text = soup.string  # set only when the node has a single child, otherwise None
    #     if text is None:
    #         next_contents = soup.contents  # list of the node's children
    #         result_text = ''
    #         for content_item in next_contents:
    #             sub_text = self.get_text_only(content_item)
    #             result_text += sub_text + '\n'
    #         return result_text
    #     else:
    #         return text.strip()  # strip leading/trailing whitespace

    # def separte_words(self, text):
    #     """
    #     Split the string on any character that is not a letter, digit, underscore or CJK
    #     character, then segment each piece into words.
    #     :param text: string to split
    #     :return: list of words
    #     """
    #     result_list = []
    #     splitter = re.compile(ur'[^a-zA-Z0-9_\u4e00-\u9fa5]')  # the 'ur' literal is required in Python 2.7
    #     for s in splitter.split(text):  # use jieba for Chinese word segmentation
    #         if s != '':
    #             result_list.extend(jieba.lcut(s.lower()))
    #     return result_list

    # def process_start(self, tasks):
    #     """
    #     Entry point for a worker process: run the tasks with coroutines.
    #     :param tasks:
    #     :return:
    #     """
    #     gevent_task_list = []  # holds the greenlet tasks
    #     for item in tasks:
    #         gevent_task_list.append(gevent.spawn(self.get_route_info, item))
    #     gevent.joinall(gevent_task_list)

    def crawl_data(self, route_url_list, process_url_num=PROCESS_URL_NUM):
        """
        Walk the route URL list and crawl every detail page with get_route_info.
        The URLs are grouped into batches of process_url_num, each batch is handed to one
        gevent greenlet, and all greenlets are joined at the end.
        :param route_url_list: list of route URLs
        :param process_url_num: number of URLs handled by one greenlet
        :return:
        """
        url_count = 0  # number of URLs collected into the current batch
        task_list = []
        gevent_list = []
        for route_url_item in route_url_list:
            url_count += 1
            task_list.append(route_url_item)
            if url_count == process_url_num:
                # p = Process(target=self.process_start, args=(task_list,))
                # p.start()
                gevent_list.append(gevent.spawn(self.get_route_info, task_list))
                task_list = []  # reset the batch
                url_count = 0  # reset the counter
        if len(task_list) != 0:  # URLs left over after the loop
            # p = Process(target=self.process_start, args=(task_list,))  # run all remaining URLs in one last worker
            # p.start()
            gevent_list.append(gevent.spawn(self.get_route_info, task_list))
        gevent.joinall(gevent_list)
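
The batching pattern in crawl_data (chunk the URL list, spawn one greenlet per chunk, join them all) can be shown in isolation. Below is a minimal sketch with a toy worker standing in for get_route_info; note that the greenlets only overlap real network waits if gevent's monkey patching (gevent.monkey.patch_all()) is applied before urllib2 is used, which this sketch assumes would happen at program start:

# -*- coding: utf-8 -*-
import gevent

PROCESS_URL_NUM = 3  # illustrative batch size

def crawl_batch(url_batch):
    # stand-in for CrawlerData.get_route_info: just report the batch it received
    print 'crawling batch: %s' % url_batch
    gevent.sleep(0)  # yield to other greenlets

urls = ['http://www.tuniu.com/tour/%09d' % i for i in range(1, 8)]

greenlets = []
for start in range(0, len(urls), PROCESS_URL_NUM):
    batch = urls[start:start + PROCESS_URL_NUM]  # same chunking effect as the counter in crawl_data
    greenlets.append(gevent.spawn(crawl_batch, batch))
gevent.joinall(greenlets)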
Code example #6
    def __init__(self, db_path):
        self.db_helper = DataBaseHelper(db_path)
Code example #7
class CrawlerBreadth:
    def __init__(self, db_path):
        self.db_helper = DataBaseHelper(db_path)

    def get_html(self, url):
        """
        Send a request and return the HTML of the page at `url`.
        :param url:
        :return: response HTML, or '' on failure
        """

        headers = {
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36'
        }
        try:
            request = urllib2.Request(url, headers=headers)
            response = urllib2.urlopen(request, timeout=5)  # per-request timeout
            result = response.read()
            time.sleep(3)
            return result
        except Exception:
            print '----------------error------------------'
            return ''

    def get_subsite_url(self, root_url):
        """
        Scrape the Tuniu home page for the regional sub-site URLs, e.g. Shanghai: http://sh.tuniu.com
        :return: dict of sub-site name to URL, e.g. {Shanghai: http://sh.tuniu.com}
        """
        stations_url_dict = {}  # departure-station name -> sub-site URL
        soup = BeautifulSoup(self.get_html(root_url), 'lxml')
        tag_div_list = soup.find_all('div', attrs={"class":
                                                   "tagBox"})  # the sub-site links live inside this div
        if not tag_div_list:  # find_all returns an empty list, never None
            return stations_url_dict

        # regex matching
        subsite_regex = re.compile(
            r'<a[\S\s]*?href="(?P<subsite_url>\S+?)"[\S\s]*?>(?P<subsite_name>\S+?)</a>'
        )
        subsite_ret = subsite_regex.findall(str(tag_div_list[0]))
        subsite_set = set(subsite_ret)
        for item in subsite_set:
            stations_url_dict[item[1]] = item[0]

        # BeautifulSoup matching over the same div
        soup = BeautifulSoup(str(tag_div_list[0]), 'lxml')
        tag_a_list = soup.find_all('a', attrs={'href': True})
        tag_a_set = set(
            filter(
                lambda x: x['href'].startswith('http://') and x['href'].
                endswith('.tuniu.com'), tag_a_list))
        for item in tag_a_set:
            # self.db_helper.insert_into_subsite(item['href'], item.get_text(strip=True))
            stations_url_dict[item.get_text(strip=True)] = item['href']

        return stations_url_dict

    def get_catalog_url(self, stations_url):
        """
        Scrape the front page of one sub-site for its travel catalog, e.g. the destinations
        reachable from Shanghai, such as Putuoshan: http://www.tuniu.com/guide/v-pts-8501/?pcat=5882
        :param stations_url: departure sub-site URL, e.g. http://sh.tuniu.com
        :return: dict of destination name to URL, e.g. {Putuoshan: http://www.tuniu.com/guide/v-pts-8501/?pcat=5882}
        """
        soup = BeautifulSoup(self.get_html(stations_url), 'html5lib')
        tag_div_list = soup.find_all('div', attrs={"class": "catalog_third"})
        destinations_dict = {}  # destination name -> listing URL
        if not tag_div_list:  # find_all returns an empty list, never None
            return destinations_dict
        # BeautifulSoup matching
        soup = BeautifulSoup(str(tag_div_list[0]), 'html5lib')
        tag_a_list = soup.find_all('a', attrs={'href': True})
        destinations_url_set = set(
            filter(
                lambda x: x['href'].startswith('http://') and x.get_text(
                    strip=True) != '', tag_a_list))
        for item in destinations_url_set:
            destinations_dict[item.get_text(strip=True)] = item['href']

        return destinations_dict

    def get_details_url(self, route_list_url):
        """
        Scrape the route-detail URLs from a listing page and store them in the database,
        e.g. http://www.tuniu.com/tour/210052165
        :param route_list_url: route listing page URL, e.g. http://www.tuniu.com/guide/v-pts-8501/?pcat=5882
        :return:
        """
        route_id_regex = re.compile(
            r'http://www\.tuniu\.com/(?:tour|tours)/(?P<route_id>\d{9})')
        route_id_ret = route_id_regex.findall(self.get_html(route_list_url))
        route_id_set = set(route_id_ret)

        for item in route_id_set:
            url = 'http://www.tuniu.com/tour/' + item
            self.db_helper.insert_into_routeurl(url)
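
Taken together, CrawlerBreadth forms a three-step breadth pipeline: home page to sub-sites, sub-site to destination catalog, catalog listing to route-detail URLs. A minimal driver sketch, assuming CrawlerBreadth and DataBaseHelper from this project are importable; the module name and database path below are illustrative placeholders:

# -*- coding: utf-8 -*-
# Hypothetical import path and db path: adjust to the project's actual layout.
from CrawlerBreadth import CrawlerBreadth

crawler = CrawlerBreadth('../Data/TravelInfo.db')

subsites = crawler.get_subsite_url('http://www.tuniu.com')   # e.g. {u'Shanghai': 'http://sh.tuniu.com', ...}
for subsite_name, subsite_url in subsites.items():
    catalog = crawler.get_catalog_url(subsite_url)           # destination name -> listing URL
    for destination_name, listing_url in catalog.items():
        crawler.get_details_url(listing_url)                 # /tour/<id> URLs are written to the routeurl table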