Example #1
 def crawler(self):
     hasUrl = True
     while hasUrl:
         target = self.getTarget()
         # Check whether all targets have been consumed
         if target != "":
             # Pre-login handling before crawling this target
             self.crawler_run(target)
         else:
             hasUrl = False
     baseUtil.log("Crawler is over!!!!!!!!!!!!!")
Example #2
    def analysisPageInfo(self, target, url):
        info_rule = target.get("data_rule").get("info_rule")
        baseUtil.log("Load url")
        try:
            baseUtil.driver.get(url)
        except Exception:
            baseUtil.log("Load Time out,will return!")
            return None
        baseUtil.log("Loaded url")
        time.sleep(6)
        # Comment collection - load paginated comment data  START
        comment_rule = target.get("comment_rule")
        if int(comment_rule.get("need_comment")) == 1:
            if int(comment_rule.get("comment_page_type")) == 2:
                for i in xrange(int(comment_rule.get("max_comment_page"))):
                    # If the "load more" button exists, click it; otherwise scroll
                    try:
                        if comment_rule.get('comment_page_key') and len(
                                baseUtil.driver.find_elements_by_css_selector(
                                    comment_rule.get(
                                        "comment_page_key"))) == 0:
                            js = "var p = document.body.scrollTop=100000"
                            baseUtil.driver.execute_script(js)
                            baseUtil.log("Auth Comment scrollTop")
                        else:
                            baseUtil.driver.find_element_by_css_selector(
                                comment_rule.get("comment_page_key")).click()
                            baseUtil.log("Click Comment scrollTop")
                    except Exception:
                        baseUtil.log("No click target found for " +
                                     str(comment_rule.get('comment_page_key')))
                    time.sleep(2)
        else:
            baseUtil.log("Skip Commont!")
        # Comment collection - load paginated comment data  END
        html = baseUtil.driver.page_source
        doc = pq(html)
        save_data = {}
        save_data['href'] = url
        save_data['crawler_time'] = int(time.time())
        save_data['site_name'] = target.get('target_name')
        save_data['site_id'] = target.get('id')
        save_data['source_type'] = target.get('source_type')
        # Data extraction
        for k, v in info_rule.items():
            k_v = None
            k_v_find = doc.find(v.get("find"))
            if v.get("getType") == 'text':
                k_v = k_v_find.text()
            elif v.get("getType") == 'attr':
                if v.get("attr_key"):
                    k_v = k_v_find.attr(v.get("attr_key"))
                else:
                    baseUtil.log("[ERROR]:data rule is error  target is :" +
                                 target.get("target_url"))
            elif v.get("getType") == 'html':
                k_v = k_v_find.html()

            if v.get("formart") == 'time_0' and k_v != '':
                k_v = time.mktime(time.strptime(k_v, "%Y-%m-%d %H:%M"))
            elif v.get("formart") == 'time_1' and k_v != '':
                k_v = time.mktime(time.strptime(k_v, "%Y-%m-%d %H:%M:%S"))
            elif v.get("formart") == 'time_2' and k_v != '':
                k_v = time.mktime(time.strptime(k_v, "%Y-%m-%d"))
            elif v.get("formart") == 'time_3' and k_v != '':
                k_v = int(k_v) / 1000
            elif v.get("formart") == 'time_4' and k_v != '':
                k_v = k_v.replace('年', '-')
                k_v = k_v.replace('月', '-')
                k_v = k_v.replace('日', '')
                k_v = time.mktime(time.strptime(k_v, "%Y-%m-%d %H:%M:%S"))
            elif v.get("formart") == 'time_5' and k_v != '':
                k_v = time.mktime(time.strptime(k_v, "%Y/%m/%d %H:%M"))
            elif v.get("formart") == 'time_6' and k_v != '':
                k_v = k_v.replace('年', '-')
                k_v = k_v.replace('月', '-')
                k_v = k_v.replace('日', '')
                k_v = time.mktime(time.strptime(k_v, "%Y-%m-%d%H:%M"))
            elif v.get("formart") == 'str_int' and k_v != '':
                k_v = filter(lambda ch: ch in '0123456789.', k_v)
            elif v.get("formart") == 'formart_url' and k_v != '':
                k_v = baseUtil.formartURL(target.get("target_url"), k_v)
            save_data[k] = k_v
        # Comment data extraction
        save_data['comments'] = []
        if int(comment_rule.get("need_comment")) == 1:
            comment_lines = doc.find(comment_rule.get('comment_line'))
            for comment_line in comment_lines.items():
                comment_line_data = {}
                for k, comment in comment_rule.get("info_rule").items():
                    if k and comment.get("find"):
                        comment_find = comment_line.find(comment.get("find"))
                        c_v = None
                        if comment.get("getType") == 'text':
                            c_v = comment_find.text()
                        elif comment.get("getType") == 'attr':
                            if comment.get("attr_key"):
                                c_v = comment_find.attr(
                                    comment.get("attr_key"))
                            else:
                                baseUtil.log(
                                    "[ERROR]: comment rule is invalid, target is: "
                                    + target.get("target_url"))
                        elif comment.get("getType") == 'html':
                            c_v = comment_find.html()

                        if comment.get(
                                "formart") == 'formart_url' and c_v != '':
                            c_v = baseUtil.formartURL(target.get("target_url"),
                                                      c_v)
                        comment_line_data[k] = c_v
                comment_line_data['crawler_time'] = time.time()
                save_data['comments'].append(comment_line_data)

        if save_data.get('content'):
            save_data['create_time'] = save_data.get('create_time') and int(
                save_data.get('create_time')) or int(time.time())
            baseUtil.saveData(target, save_data)
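analysisPageInfo() is driven entirely by the target dict. From the keys it reads, the expected structure looks roughly like the sketch below; the selectors and values are illustrative assumptions, only the key names come from the code above.

    # Illustrative target config for analysisPageInfo(); all selector strings
    # and values are made-up examples, only the key names appear in Example #2.
    example_target = {
        "id": 1,
        "target_name": "Example News",
        "target_url": "http://news.example.com/list",
        "source_type": "news",
        "data_rule": {
            "info_rule": {
                # per field: CSS selector, extraction type, optional format rule
                "title": {"find": "h1.title", "getType": "text"},
                "content": {"find": "div.article", "getType": "html"},
                "create_time": {"find": "span.time", "getType": "text",
                                "formart": "time_1"},
                "cover": {"find": "img.cover", "getType": "attr",
                          "attr_key": "src", "formart": "formart_url"},
            },
        },
        "comment_rule": {
            "need_comment": 1,            # 1 = collect comments, anything else skips
            "comment_page_type": 2,       # 2 = load more comments by click/scroll
            "max_comment_page": 3,
            "comment_page_key": "a.load-more-comments",
            "comment_line": "div.comment-item",
            "info_rule": {
                "author": {"find": "span.author", "getType": "text"},
                "text": {"find": "p.comment-text", "getType": "text"},
            },
        },
    }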
Example #3
 def getAnalysisHtml(self, target):
     baseUtil.log("Analysis begin…………")
     info_urls = []
     # Pagination types 1 and 2 load data in-page (scroll or "load more"); extract the page source after that data has loaded
     if int(target.get("page_type")) == 1 or int(
             target.get("page_type")) == 2:
         url_params_str = baseUtil.formartParamsToUrl(
             target.get('url_params'))
         baseUtil.log(target.get("target_url"))
         baseUtil.driver.get(target.get("target_url") + url_params_str)
         time.sleep(16)
         baseUtil.log("Get html…………")
         try:
             for i in xrange(target.get('max_crawler_page')):
                 # Pagination type 1 has no "load more" button, so just scroll
                 if int(target.get("page_type")) == 1:
                     js = "var p = document.body.scrollTop=100000"
                     baseUtil.driver.execute_script(js)
                     baseUtil.log("Auth scrollTop")
                 else:
                     # If the "load more" button exists, click it; otherwise scroll
                     if len(
                             baseUtil.driver.find_elements_by_css_selector(
                                 target.get("page_params_key"))) == 0:
                         js = "var p = document.body.scrollTop=100000"
                         baseUtil.driver.execute_script(js)
                         baseUtil.log("Auth scrollTop")
                     else:
                         baseUtil.driver.find_element_by_css_selector(
                             target.get("page_params_key")).click()
                         baseUtil.log("Click scrollTop")
                 time.sleep(6)
         except Exception:
             baseUtil.log('[ERROR] scroll/click selector not found')
         # Get the page source after the paginated data has loaded
         html = baseUtil.driver.page_source
         doc = pq(html)
         if target.get('data_rule').get(
                 'get_info_address_type') == 'tag_href':
             datas_href_items = doc.find(
                 target.get('data_rule').get('get_info_address_value'))
             for data_href in datas_href_items.items():
                 info_urls.append(data_href.attr("href"))
     # Pagination type 0 loads each page by URL parameter and extracts the source of every page separately
     elif int(target.get("page_type")) == 0:
         url_params = target.get('url_params')
         for i in xrange(target.get('max_crawler_page')):
             url_params[target.get('page_params_key')] = i
             url_params_str = baseUtil.formartParamsToUrl(url_params)
             baseUtil.driver.get(target.get("target_url") + url_params_str)
             time.sleep(6)
             baseUtil.log("Get html…………")
             # Get the page source for this page
             html = baseUtil.driver.page_source
             doc = pq(html)
             if target.get('data_rule').get(
                     'get_info_address_type') == 'tag_href':
                 datas_href_items = doc.find(
                     target.get('data_rule').get('get_info_address_value'))
                 for data_href in datas_href_items.items():
                     info_urls.append(data_href.attr("href"))
     for info_url in info_urls:
         host = baseUtil.getHostInURL(target.get("target_url"))
         if "http" in info_url:
             info_url_host = baseUtil.getHostInURL(info_url)
             if info_url_host.find(info_url_host) == -1:
                 continue
         elif info_url[:2] == '//':
             info_url = "http:" + "" + info_url
         elif info_url[:1] == '/':
             info_url = "http://" + host + "" + info_url
         else:
             info_url = "http://" + "" + info_url
         baseUtil.log("target_url:" + info_url)
         if baseUtil.checkRepeat(info_url):
             baseUtil.log("Pass checkRepeat")
             self.analysisPageInfo(target, info_url)
             time.sleep(2)
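getAnalysisHtml() additionally reads the listing-page and pagination fields of the same target dict. Below is a sketch of those fields with illustrative values, extending the hypothetical example_target from the sketch after Example #2; only the key names are taken from the code above.

    # Illustrative listing/pagination fields for getAnalysisHtml(); the values
    # are assumptions for demonstration, only the key names appear in Example #3.
    example_target.update({
        "page_type": 0,             # 0 = paginate via URL parameter,
                                    # 1 = scroll to load, 2 = click "load more" or scroll
        "page_params_key": "page",  # URL parameter name (page_type 0) or the
                                    # CSS selector of the "load more" button (page_type 2)
        "max_crawler_page": 5,
        "url_params": {"category": "tech"},
    })
    example_target["data_rule"].update({
        "get_info_address_type": "tag_href",         # follow <a href> detail links
        "get_info_address_value": "ul.news-list a",  # selector for those links
    })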