Example #1
 def start_one_site_tasks(self, site):
     site_id = get_md5_i64(site)
     self.log.info("start on site:%s, site_id:%s" % (site, site_id))
     site_scheduler = self.site_schedulers.get(site_id, None)
     if not site_scheduler:
         self.log.info("当前需要加载站点信息: site = {}".format(site))
         sites = self.loader.load_sites(site)
         self.log.info("加载站点信息完成: site = {}".format(site))
         if sites and len(sites) > 0:
             site_info = sites[site]
             site_id = int(site_info['site_id'])
             site_scheduler = SiteScheduler(site_info,
                                            self.conf['redis_tasks'],
                                            self.conf['log'],
                                            self.site_statistic,
                                            self.seed_statistic)
             self.site_schedulers[site_id] = site_scheduler
     else:
         try:
             self.log.info("当前需要加载站点信息: site = {}".format(site))
             sites = self.loader.load_sites(site)
             self.log.info("加载站点信息完成: site = {}".format(site))
             if sites and len(sites) > 0 and sites.has_key(site):
                 site_info = sites[site]
                 site_scheduler.reload_site(site_info)
         except Exception:
             self.log.error("load sites info fail:%s" %
                            traceback.format_exc())
             return False
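get_md5_i64 appears in almost every example here, but its definition is not part of the listing. A minimal sketch of what such a helper commonly looks like, assuming it folds an MD5 digest of the string into a signed 64-bit integer (the project's real implementation may differ):

import hashlib
import struct

def get_md5_i64(text):
    # hypothetical stand-in: hash the string and take the first 8 digest bytes
    # as a big-endian signed 64-bit integer
    if isinstance(text, unicode):  # Python 2 text handling, as in the examples
        text = text.encode('utf-8')
    digest = hashlib.md5(text).digest()
    return struct.unpack('>q', digest[:8])[0]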
Example #2
    def _extract_links(self, selector, base_url, response_encoding='utf-8', links=None):
        # index any previously collected links by URL so duplicates collapse
        links_set = {}
        for link in links or []:
            links_set[link.url] = link
        for el, attr, attr_val in self._iter_links(selector):
            # pseudo lxml.html.HtmlElement.make_links_absolute(base_url)
            try:
                attr_val = urljoin("", attr_val)
            except ValueError:
                continue # skipping bogus links
            else:
                url = self.process_attr(attr_val)
                if url is None:
                    continue

            if url.startswith("javascript"):continue #skip javascript
            # to fix relative links after process_value
            url = urljoin(base_url, url)
            url = url.split('#')[0]
            anchor = _collect_string_content(el) or ''
            if isinstance(anchor, unicode):
                anchor = anchor.encode('utf-8')
            anchor = anchor.replace('\r', '').replace('\n', '').replace('\t', " ").strip()
            link = Link(url=url, url_id=get_md5_i64(url), anchor=anchor)
            links_set[link.url] = link
            #links.append(link)
        return links_set.values()
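Every candidate link above goes through the same normalization: resolve it against the page's base URL, then drop the fragment before hashing it into url_id. A standalone illustration of that step (urlparse is the Python 2 module this code base uses; on Python 3 it is urllib.parse):

from urlparse import urljoin

base_url = "http://example.com/news/index.html"
href = "../article.html#comments"
url = urljoin(base_url, href).split('#')[0]
print(url)  # http://example.com/article.html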
Example #3
 def load_sites(self, site=''):
     sites = {}
     if site:
         sql_str = 'SELECT site, name, avg_interval, encoding FROM `site` WHERE site="' \
                   + str(site) + '" ORDER BY `id`'
     else:
         sql_str = 'SELECT site, name, avg_interval, encoding FROM `site`  ORDER BY `id`'
     datas = mysql_fetch(self.mysql_conn, sql_str)
     for data in datas:
         task = {}
         site = data[0]
         task['site'] = site
         task['name'] = data[1]
         task['avg_interval'] = data[2]
         task['encoding'] = data[3]
         task['site_id'] = get_md5_i64(site)
         task['cookies'] = []
         cookie_sql = 'SELECT site, user_id, cookie FROM `cookie` where site ="%s" ORDER BY `id`' % site
         # cookies = self.fetch_mysql(cookie_sql)
         cookies = mysql_fetch(self.mysql_conn, cookie_sql)
         for cookie in cookies:
             cookie_dict = {}
             cookie_dict['user_id'] = cookie[1]
             cookie_dict['cookie'] = cookie[2].encode("utf8")
             task['cookies'].append(cookie_dict)
         sites[site] = task
     self.log.info("sites:" + str(len(sites)))
     return sites
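load_sites splices the site value directly into the SQL text, which breaks on embedded quotes and is open to SQL injection. A hedged sketch of the same lookup with driver-side parameters, assuming a DB-API cursor (MySQLdb/pymysql style); mysql_fetch's real signature is not shown in this listing, so the helper name here is hypothetical:

def fetch_site_rows(cursor, site=''):
    # same site lookup as load_sites, but the driver escapes the value
    if site:
        cursor.execute(
            'SELECT site, name, avg_interval, encoding FROM `site` '
            'WHERE site = %s ORDER BY `id`', (site,))
    else:
        cursor.execute(
            'SELECT site, name, avg_interval, encoding FROM `site` ORDER BY `id`')
    return cursor.fetchall()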
Example #4
 def pack_extract_info(self, extract_info, page_data, download_rsp,
                       parser_config):
     """
     :param ExtractData:
     :param DownloadRsp:
     :return ExtractInfo:
     """
     extract_data = page_data.get('data', {})
     common_data = page_data.get('common_data', {})
     links = page_data.get('links')
     extract_info.ex_status = ExStatus.kEsNotExtract  # default not extract
     extract_info.redirect_url = download_rsp.redirect_url
     extract_info.content_time = common_data.get('public_time')
     extract_info.topic_id = parser_config.topic_id
     extract_info.html_tag_title = common_data.get('title')
     extract_info.page_text = common_data.get('content')
     extract_info.content_language = common_data.get('lang')
     extract_info.content_finger = common_data.get('content_finger')
     if not extract_info.content_finger and extract_info.page_text:
         extract_info.content_finger = tools.get_md5_i64(
             extract_info.page_text)
     extract_info.links = links
     extract_info.extract_data = json.dumps(extract_data)
     self.log.info("url:%s\textract_data_length:%d" %
                   (str(download_rsp.url), len(extract_info.extract_data)))
     return extract_info
Example #5
    def _get_crawl_history(self, extract_info, crawl_info):
        crawl_historys = []
        crawl_history = CrawlHistory()
        try:
            crawl_history.download_time = crawl_info.download_time
            crawl_history.content_finger = extract_info.content_finger
            crawl_history.link_finger = extract_info.link_finger
            crawl_history.real_title = extract_info.html_tag_title
            crawl_history.analyse_title = extract_info.analyse_title
            crawl_history.content_time = extract_info.content_time
            crawl_history.page_type = extract_info.content_type

            crawl_history.status_code = crawl_info.status_code
            crawl_history.http_code = crawl_info.http_code
            crawl_history.page_size = crawl_info.page_size

            self.inner_links.sort(
                key=lambda inner_link: get_md5_i64(inner_link.url),
                reverse=True)
            inner_links_str = ','.join(
                map(lambda link: link.url, self.inner_links))
            crawl_history.inner_link_finger = get_md5_i64(inner_links_str)
            crawl_history.inner_links_num = len(self.inner_links)
            crawl_history.outer_links_num = len(self.outer_links)

            # TODO: the fields below still need to be filled in
            crawl_history.new_links_num_for_self = 0
            crawl_history.good_links_num_for_for_self = 0
            crawl_history.new_links_num_for_all = 0
            crawl_history.good_links_num_for_all = 0
            crawl_history.dead_page_type = 0
            crawl_history.dead_page_time = 0
        except Exception as e:
            self.logger.warning(str(e))
        crawl_historys.append(crawl_history)
        try:
            old_history_link = self.history_links_cache.get(
                self.url_info.get('url'))
            if old_history_link and old_history_link.get('link_attr'):
                crawl_historys.extend(
                    old_history_link['link_attr'].normal_crawl_his)
                crawl_historys = crawl_historys[:7]
        except Exception:
            # an exception here most likely means there is no crawl history yet
            pass
        return crawl_historys
Example #6
    def restart_seed(self, seed_id, site):
        self.log.info('restart seed, seed_id:%s, site:%s' % (seed_id, site))
        site_id = get_md5_i64(site)
        seeds = self.loader.load_seed_by_id(seed_id, True)
        site_scheduler = self.site_schedulers.get(site_id, None)
        if not site_scheduler:
            return True

        site_scheduler.add_seed_json(seeds)
        return True
Example #7
    def clear_one_site_cache(self, site):
        site_id = get_md5_i64(site)
        site_scheduler = self.site_schedulers.get(site_id, None)
        if not site_scheduler:
            self.log.warning(
                "clear one site cache fail, site:%s, site_id:%s, site_scheduler_keys:%s"
                % (site, site_id, self.site_schedulers.keys()))
            return True

        try:
            site_scheduler.clear_one_site_cache()
        except Exception:
            self.log.error("clear site:%s cache error: %s" %
                           (site, traceback.format_exc()))
            return False
        return True
Example #8
    def stop_one_site_tasks(self, site):
        self.log.info('stop site:%s' % site)
        site_id = get_md5_i64(site)
        site_scheduler = self.site_schedulers.get(site_id, None)
        if not site_scheduler:
            self.log.info("stop_one_site_tasks\tsite:%s\t not_exist" % site)
            return False

        try:
            self.running_site_schedulers.pop(site_id)
        except KeyError:
            self.log.warning(
                "site_id:%s not in self.running_site_schedulers:%s" %
                (site_id, self.running_site_schedulers))
Example #9
 def process_request(self, request):
     try:
         # the host part of the URL identifies the site
         site_id = request.url.split('/')[2]
     except Exception:
         site_id = request.get('url')
     site_id = get_md5_i64(site_id)
     try:
         site_id = int(site_id)
     except Exception:
         site_id = None
     cookies = get_cookie(site_id)
     if not cookies:
         return
     cookie = cookies[random.randint(0, sys.maxint) % len(cookies)]
     self.logger.info(" using cookie site_id: %s user_id: %s" %
                      (site_id, cookie['user_id']))
     request.info.setdefault('headers', {})['Cookie'] = cookie['cookie']
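cookies[random.randint(0, sys.maxint) % len(cookies)] is just an indirect way of picking one cookie at random. If nothing depends on the exact RNG sequence, the idiomatic equivalent is random.choice:

import random

cookies = [{'user_id': 1, 'cookie': 'a=1'}, {'user_id': 2, 'cookie': 'b=2'}]
cookie = random.choice(cookies)  # uniform pick over the loaded cookies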
Example #10
 def _extract_common_data(self, xdoc):
     result = {}
     try:
         result = SFExtractor().extract(etree2string(xdoc, pretty_print=True).decode(encoding='utf-8'))
     except Exception:
         pass
     lang = xdoc.xpath('string(//html/@lang)')
     if isinstance(lang, unicode):
         lang = lang.encode('utf-8')
     if lang == '':  # default zh-ch
         lang = 'zh-ch'
     result['lang'] = lang
     result['content'] = result.get('content', '').encode('utf-8')
     result['title'] = result.get('title', '').encode('utf-8')
     result['content_finger'] = get_md5_i64(result.get('content', ''))
     try:
         result['public_time'] = parser_tool.string2timestamp(result.get('public_time', ''))
         # drop timestamps outside the signed 32-bit range
         if result['public_time'] > 2147483648 or result['public_time'] < -2147483648:
             result['public_time'] = None
     except Exception:
         result['public_time'] = None
     return result
Example #11
    def _extract_single(self, x, r):
        result = ""
        value_type = r.get("$value_type")
        parse_method = r.get("$parse_method")
        parse_rule = r.get("$parse_rule")
        if not etree.iselement(x):
            raise Exception("Invalid etree element.")
        if value_type == ValueType.recursion:
            result = []
            if r.get("$each"):
                if parse_method == ParseType.path:
                    for ex in x.xpath(parse_rule):
                        result.append(self._extract_multi(ex, r.get("$each")))
                elif parse_method == ParseType.regex:
                    for ex in ReExtends(etree2string(x)).parse(parse_rule.encode('utf-8')):
                        result.append(self._extract_multi(etree.HTML(ex), r.get("$each")))
                    #raise Exception("RegEx can not be allowed to recursive.")
        elif value_type == ValueType.array:
            if parse_method == ParseType.path:
                result = [o.strip() if isinstance(o, basestring) else _collect_string_content(o) for o in x.xpath(parse_rule)]
            elif parse_method == ParseType.regex:
                result = [o.strip() for o in ReExtends(etree2string(x)).parse(parse_rule.encode('utf-8'))]
        elif value_type == ValueType.plain_text:
            if parse_method == ParseType.path:
                f = x.xpath(parse_rule)
                if isinstance(f, basestring):
                    result = f.strip()
                elif isinstance(f, list):
                    result = "\t".join([o.strip()
                                        if isinstance(o, basestring)
                                        else o.xpath('string(.)')
                                        for o in f
                                        ])
                    result = result.strip()
                else:
                    raise Exception("type {} can't convert to plain_text.".format(type(f)))
            elif parse_method == ParseType.regex:
                result = "\t".join([o.strip() for o in ReExtends(etree2string(x)).parse(parse_rule.encode('utf-8'))])
        elif value_type == ValueType.html:
            if parse_method == ParseType.path:
                result = "\t".join((etree2string(o) if o != None else "") for o in x.xpath(parse_rule))
            elif parse_method == ParseType.regex:
                result = "\t".join(ReExtends(etree2string(x)).parse(parse_rule.encode('utf-8')))
        elif value_type == ValueType.link:
            result = []
            links = []
            if parse_method == ParseType.path:
                f = x.xpath(parse_rule)
                if isinstance(f, basestring):
                    links.append(f.strip().split('#')[0])
                else:
                    links.extend([o.strip().split('#')[0]
                                  if isinstance(o, basestring)
                                  else o.xpath('string(.)')
                                  for o in f])
            elif parse_method == ParseType.regex:
                links.extend(o.strip().split('#')[0] for o in ReExtends(etree2string(x)).parse(parse_rule.encode('utf-8')))
            for l in links:
                if l.startswith('javascript:'):  # skip javascript: pseudo-links
                    continue
                l = urljoin(self.base_url, l)
                result.append(Link(url=l, url_id=get_md5_i64(l), anchor=""))
        return result
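Each rule handed to _extract_single is a dict whose $value_type, $parse_method and $parse_rule keys select the branch above. A small illustrative call for the array/XPath branch; the rule contents and the extractor instance are made up for this example (ValueType and ParseType are the project's own enums):

from lxml import etree

title_rule = {
    "$value_type": ValueType.array,
    "$parse_method": ParseType.path,
    "$parse_rule": "//div[@class='news']//a/text()",
}
doc = etree.HTML("<div class='news'><a href='/a'>First</a><a href='/b'>Second</a></div>")
titles = extractor._extract_single(doc, title_rule)  # ['First', 'Second']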
Example #12
    def _extract_single(self, x, r):
        result = ""
        value_type = r.get("$value_type")
        parse_method = r.get("$parse_method")
        parse_rule = r.get("$parse_rule")

        if not isinstance(x, list) and not isinstance(x, dict):
            raise Exception("{} type is {}, not list or dict".format(
                str(x), type(x)))
        if value_type == ValueType.recursion:
            result = []
            if r.get("$each"):
                if parse_method == ParseType.jsonpath:
                    tmp = parse(parse_rule).find(x)
                    if len(tmp) and isinstance(tmp[0].value, list):
                        for ex in tmp[0].value:
                            result.append(
                                self._extract_multi(ex, r.get("$each")))
                    elif len(tmp) and isinstance(tmp[0].value, dict):
                        result.append(
                            self._extract_multi(tmp[0].value, r.get("$each")))
                elif parse_method == ParseType.regex:
                    raise Exception("RegEx can not be allowed to recursive.")
        elif value_type == ValueType.array:
            result = []
            if parse_method == ParseType.jsonpath:
                tmp = parse(parse_rule).find(x)
                if len(tmp) > 0:
                    if isinstance(tmp[0].value, list):
                        for each in tmp:
                            result.extend(each.value)
                    else:
                        for each in tmp:
                            result.append(each.value)
            elif parse_method == ParseType.regex:
                ss = json.dumps(x)
                tmp = ReExtends(ss).parse(parse_rule)
                if not isinstance(tmp, list):
                    tmp = [tmp]
                result.extend(tmp)
        elif value_type == ValueType.plain_text:
            result = []
            if parse_method == ParseType.jsonpath:
                tmp = parse(parse_rule).find(x)
                if len(tmp) > 0:
                    if not isinstance(tmp[0].value, basestring):
                        result = "\t".join(
                            [json.dumps(each.value) for each in tmp])
                    else:
                        result = "\t".join([each.value for each in tmp])
            elif parse_method == ParseType.regex:
                ss = json.dumps(x)
                tmp = ReExtends(ss.decode('utf-8')).parse(parse_rule)
                if not isinstance(tmp, list):
                    tmp = [tmp]
                result = "\t".join(tmp)
        elif value_type == ValueType.link:
            result = []
            links = []
            if parse_method == ParseType.jsonpath:
                tmp = parse(parse_rule).find(x)
                if len(tmp) > 0:
                    if isinstance(tmp[0].value, basestring):
                        links = [each.value for each in tmp]
            elif parse_method == ParseType.regex:
                ss = json.dumps(x)
                links = ReExtends(ss).parse(parse_rule)
                if not isinstance(links, list):
                    links = [links]
            for l in links:
                if l.startswith('javascript:'):  # skip javascript: pseudo-links
                    continue
                l = urljoin(self.base_url, l)
                result.append(Link(url=l, url_id=get_md5_i64(l), anchor=u''))
        return result
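The parse(parse_rule).find(x) calls follow the jsonpath_rw / jsonpath_ng API, where find() returns match objects whose .value carries the extracted data. A minimal standalone illustration of that pattern, assuming jsonpath_ng:

from jsonpath_ng import parse

data = {"items": [{"name": "foo"}, {"name": "bar"}]}
matches = parse("$.items[*].name").find(data)
print([m.value for m in matches])  # ['foo', 'bar']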
Example #13
    def modify_seeds(self, datas):
        seeds = {}
        count = 0
        site = None
        site_id = None
        for data in datas:
            task = {}
            seed_id = -1
            count += 1
            task['variable_param_list'] = []
            task['page_turning_rule'] = {}
            task['session_commit'] = {}
            task['data'] = {}
            for i in range(len(data)):
                key = self.seed_keys[i][0]
                if key in ['mode', 'site_name']:
                    continue
                val = data[i]
                if not val and val != 0:
                    continue
                if key == 'id':
                    key = 'seed_id'
                    seed_id = val
                elif key.find('check_body') > -1:
                    val = val.replace('\'', '"')
                key = str_obj(key)
                val = str_obj(val)
                task[key] = self.macro_function(val)

                if key == "site":
                    site = val
                    site_id = get_md5_i64(val)
                if key == 'variable_params':
                    var_conf = json.loads(val)
                    # use var_type (not the builtin name `type`) and default it
                    # so an empty var_conf cannot leave it undefined
                    var_type = var_conf.get('type', "") if var_conf else ""
                    if var_type == 'mongo':
                        task['variable_param_list'] = []
                        mongo_conf = var_conf.get('mongo_data', {})
                        if mongo_conf:
                            task['variable_param_list'] = self.load_var_params(
                                mongo_conf)
                    elif var_type == 'map':
                        task['variable_param_list'] = var_conf.get(
                            'map_data', [])
                    elif var_type == 'json':
                        task['variable_param_list'] = var_conf.get(
                            'json_data', [])

                    task.pop('variable_params')
                elif key == 'page_turning_rule':
                    task['page_turning_rule'] = json.loads(val)
                elif key == 'session_commit':
                    task['session_commit'] = json.loads(val)
                elif key == 'data':
                    task['data'] = json.loads(self.macro_function(val))
                elif key == 'http_header':
                    task['http_header'] = json.loads(val)
                elif key == 'config_init_period':
                    task['config_init_period'] = json.loads(val)

            # needed when resetting the page-turning parameters
            #task['origin_url'] = task['url']
            #task['origin_data'] = task['data']
            #task['current_variable_param'] = json.dumps([])
            task['site_id'] = site_id
            seeds[seed_id] = task

        mysql_execute(self.mysql_conn,
                      'UPDATE seeds SET mode="off" WHERE is_once="true"')
        self.log.info("config_loader\tsite:%s\tseeds:%d" % (site, len(seeds)))
        return seeds
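The variable_params column is itself JSON; its type field chooses where the parameter list comes from (mongo, map or json), matching the branches above. A purely illustrative value for the 'map' form (the parameter names inside map_data are invented):

import json

variable_params = json.dumps({
    "type": "map",
    "map_data": [
        {"page": "1"},
        {"page": "2"},
    ],
})
# modify_seeds would copy map_data into task['variable_param_list']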
Example #14
def test_parser_config():
    if not request.json:
        return jsonify({'status': 'failed', 'data': 'request error'})
    req_datas = request.json
    download_req = DownLoadReq()
    download_req.method = req_datas.get('request_method', 'get')
    download_req.url = req_datas.get('request_url')
    download_req.download_type = req_datas.get('download_type')
    download_req.post_data = {}
    download_req.http_header = {}
    try:
        download_req.http_header = json.loads(req_datas.get('headers'))
    except Exception as e:
        download_req.http_header = None
    post_data = None
    try:
        post_data = json.loads(req_datas.get('request_params'))
    except Exception as e:
        pass
    parser_id = req_datas.get('parser_id', "-1")
    page_source = req_datas.get('page_source', '').strip()

    if page_source not in ['cache', 'downloader', 'pagedb', 'input']:
        page_source = 'cache'
    hz_url = download_req.url
    if post_data and download_req.method == "post":
        hz_url = build_hzpost_url(download_req.url, post_data)
        download_req.url = hz_url
    spend_time = {}
    try:
        page_id = get_md5_i64(hz_url)
        download_rsp = None
        stime = time.time()

        if page_source == 'pagedb':
            download_rsp = current_app.config['crawler_merge'].select_one(
                hz_url)
            if download_rsp.status == 1:
                download_rsp = None
        elif page_source == 'cache':
            download_rsp = get_page_cache(page_id)
        elif page_source == 'input':
            download_rsp = DownLoadRsp()
            download_rsp.url = hz_url
            download_rsp.status = 0
            download_rsp.content_type = "text"
            download_rsp.http_code = 200
            download_rsp.download_time = 0
            download_rsp.content = req_datas.get('input_page',
                                                 "").encode('utf8')
            download_rsp.src_type = "input"
            download_rsp.elapsed = 50
        if not download_rsp:
            downloader = current_app.config['downloader']
            download_rsp = downloader.download(hz_url, download_req)
            download_rsp.url = hz_url
        spend_time['download_spend'] = (time.time() - stime) * 1000
        set_page_cache(page_id, download_rsp)
        is_save = req_datas.get('is_save', 'false')
        if is_save == "true":
            download_rsp.parse_extends = json.dumps({'parser_id': parser_id})
            download_rsp_tube = current_app.config[
                'put_beanstald_server'].get_tube_by_name('download_rsp_tube')
            if download_rsp_tube:
                current_app.config['put_beanstald_server'].save_record({
                    'tube_name': download_rsp_tube,
                    'obj': download_rsp
                })
        # deep-copy download_rsp to keep other threads from modifying it
        download_rsp = deepcopy(download_rsp)
        download_rsp.parse_extends = json.dumps({
            "parser_id": parser_id,
            "debug": True
        })
        extractor = current_app.config['extractor']
        stime = time.time()
        extract_rsp = extractor.extract(download_rsp)
        spend_time['extract_spend'] = (time.time() - stime) * 1000
        # list of entity-extraction results
        entity_datas = None
        # schema check result
        schema_check_result = None
        entity_rsps = None
        cur_datetime = str(datetime.datetime.now())
        try:
            stime = time.time()
            extract_data = extract_rsp.extract_info.extract_data
            if extract_data:
                extract_data_dict = json.loads(extract_data)
                _src = {
                    "url": extract_rsp.base_info.url,
                    "site_id": extract_rsp.base_info.site_id,
                    "site": extract_rsp.base_info.site
                }
                if "datas" in extract_data_dict:
                    datas = extract_data_dict['datas']
                    tmp_datas = []
                    for d in datas:
                        d['_src'] = [_src]
                        tmp_datas.append(d)
                    extract_data_dict['datas'] = tmp_datas
                else:
                    extract_data_dict['_src'] = [_src]
                extract_rsp.extract_info.extract_data = json.dumps(
                    extract_data_dict)
                entity_rsps = current_app.config[
                    'entity_extractor'].entity_extract(extract_rsp)
                spend_time['entity_spend'] = (time.time() - stime) * 1000
                entity_datas = []
                for data in entity_rsps.entity_data_list:
                    if data:
                        entity_datas.append(json.loads(data.entity_data))
                    else:
                        entity_datas.append(None)
        except Exception as e:
            if entity_rsps:
                entity_datas = {
                    'sys_error': str(e),
                    'error_message': entity_rsps.msg
                }
            else:
                entity_datas = {'sys_error': str(e)}
        final_data = {}
        try:
            if entity_rsps.entity_data_list:
                entity_json = {
                    "topic_id": entity_rsps.entity_data_list[0].topic_id,
                    "data":
                    json.loads(entity_rsps.entity_data_list[0].entity_data)
                }
                datasaver_resp = current_app.config['data_saver'].check_data(
                    json.dumps(entity_json))
                final_data = json.loads(datasaver_resp.data)
        except Exception as e:
            final_data = {'sys_error': str(e)}
        return jsonify({
            'status': True,
            'data': build_test_parser_config_rsp(extract_rsp, entity_datas,
                                                 final_data, spend_time)
        })
    except Exception as e:
        current_app.config['logger'].error(hz_url)
        current_app.config['logger'].info(traceback.format_exc())
        return jsonify({'status': False, 'data': str(e)})
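test_parser_config reads its input from the JSON body: request_method, request_url, download_type, headers, request_params, parser_id, page_source, is_save and input_page. A hedged example call; the route path and host are not shown in this listing, so both are placeholders:

import json
import requests

payload = {
    "request_method": "get",
    "request_url": "http://example.com/news/list",
    "headers": json.dumps({"User-Agent": "Mozilla/5.0"}),
    "request_params": json.dumps({}),
    "parser_id": "123",
    "page_source": "downloader",  # one of: cache, downloader, pagedb, input
    "is_save": "false",
}
resp = requests.post("http://localhost:5000/test_parser_config", json=payload)
print(resp.json().get("status"))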