def start_one_site_tasks(self, site):
    site_id = get_md5_i64(site)
    self.log.info("start on site:%s, site_id:%s" % (site, site_id))
    site_scheduler = self.site_schedulers.get(site_id, None)
    if not site_scheduler:
        self.log.info("loading site info: site = {}".format(site))
        sites = self.loader.load_sites(site)
        self.log.info("finished loading site info: site = {}".format(site))
        if sites and len(sites) > 0:
            site_info = sites[site]
            site_id = int(site_info['site_id'])
            site_scheduler = SiteScheduler(site_info,
                                           self.conf['redis_tasks'],
                                           self.conf['log'],
                                           self.site_statistic,
                                           self.seed_statistic)
            self.site_schedulers[site_id] = site_scheduler
    else:
        try:
            self.log.info("loading site info: site = {}".format(site))
            sites = self.loader.load_sites(site)
            self.log.info("finished loading site info: site = {}".format(site))
            if sites and len(sites) > 0 and site in sites:
                site_info = sites[site]
                site_scheduler.reload_site(site_info)
        except Exception:
            self.log.error("load sites info fail:%s" % traceback.format_exc())
            return False
def _extract_links(self, selector, base_url, response_encoding='utf-8', links=None):
    # hacky way to get the underlying lxml parsed document
    links = links or []  # avoid a mutable default argument
    links_set = {}
    for link in links:
        links_set[link.url] = link
    for el, attr, attr_val in self._iter_links(selector):
        # pseudo lxml.html.HtmlElement.make_links_absolute(base_url)
        try:
            attr_val = urljoin("", attr_val)
        except ValueError:
            continue  # skipping bogus links
        else:
            url = self.process_attr(attr_val)
            if url is None:
                continue
        if url.startswith("javascript"):
            continue  # skip javascript links
        # to fix relative links after process_value
        url = urljoin(base_url, url)
        url = url.split('#')[0]
        anchor = _collect_string_content(el) or ''
        if isinstance(anchor, unicode):
            anchor = anchor.encode('utf-8')
        anchor = anchor.replace('\r', '').replace('\n', '').replace('\t', ' ').strip()
        link = Link(url=url, url_id=get_md5_i64(url), anchor=anchor)
        links_set[link.url] = link
    return links_set.values()
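# Minimal sketch of the de-duplication used above: links are keyed by URL in a
# dict, so a URL seen again replaces the earlier entry and each URL appears
# exactly once in the returned list. Plain strings stand in for the Link
# objects used in the real code.
deduped = {}
for url, anchor in [("http://example.com/a", "first"), ("http://example.com/a", "second")]:
    deduped[url] = anchor
assert deduped.values() == ["second"]  # one entry per URL, latest anchor wins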
def load_sites(self, site=''):
    sites = {}
    if site:
        sql_str = 'SELECT site, name, avg_interval, encoding FROM `site` WHERE site="' \
                  + str(site) + '" ORDER BY `id`'
    else:
        sql_str = 'SELECT site, name, avg_interval, encoding FROM `site` ORDER BY `id`'
    datas = mysql_fetch(self.mysql_conn, sql_str)
    for data in datas:
        task = {}
        site = data[0]
        task['site'] = site
        task['name'] = data[1]
        task['avg_interval'] = data[2]
        task['encoding'] = data[3]
        task['site_id'] = get_md5_i64(site)
        task['cookies'] = []
        cookie_sql = 'SELECT site, user_id, cookie FROM `cookie` WHERE site="%s" ORDER BY `id`' % site
        cookies = mysql_fetch(self.mysql_conn, cookie_sql)
        for cookie in cookies:
            cookie_dict = {}
            cookie_dict['user_id'] = cookie[1]
            cookie_dict['cookie'] = cookie[2].encode("utf8")
            task['cookies'].append(cookie_dict)
        sites[site] = task
    self.log.info("sites:" + str(len(sites)))
    return sites
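# Illustrative only: the shape of the dict returned by load_sites() for one
# site, based on the keys filled in above (all values are made up).
example_sites = {
    "example.com": {
        "site": "example.com",
        "name": "Example",
        "avg_interval": 60,
        "encoding": "utf-8",
        "site_id": 1234567890123456789,  # get_md5_i64("example.com")
        "cookies": [
            {"user_id": "u001", "cookie": "SESSIONID=abc123"},
        ],
    },
}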
def pack_extract_info(self, extract_info, page_data, download_rsp, parser_config):
    """
    :param extract_info: ExtractInfo object to fill in
    :param page_data: dict with 'data', 'common_data' and 'links'
    :param download_rsp: DownLoadRsp
    :param parser_config: parser config carrying topic_id
    :return: the filled-in ExtractInfo
    """
    extract_data = page_data.get('data', {})
    common_data = page_data.get('common_data')
    links = page_data.get('links')
    extract_info.ex_status = ExStatus.kEsNotExtract  # default: not extracted
    extract_info.redirect_url = download_rsp.redirect_url
    extract_info.content_time = common_data.get('public_time')
    extract_info.topic_id = parser_config.topic_id
    extract_info.html_tag_title = common_data.get('title')
    extract_info.page_text = common_data.get('content')
    extract_info.content_language = common_data.get('lang')
    extract_info.content_finger = common_data.get('content_finger')
    if not extract_info.content_finger and extract_info.page_text:
        extract_info.content_finger = tools.get_md5_i64(extract_info.page_text)
    extract_info.links = links
    extract_info.extract_data = json.dumps(extract_data)
    self.log.info("url:%s\textract_data_length:%d" %
                  (str(download_rsp.url), len(extract_info.extract_data)))
    return extract_info
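# Illustrative only: the page_data dict expected by pack_extract_info(), based
# on the keys read above; the 'common_data' part matches what
# _extract_common_data() produces (all values are made up).
example_page_data = {
    "data": {"title": "Example article"},   # parser-specific extracted fields
    "common_data": {
        "title": "Example article",
        "content": "Body text ...",
        "lang": "zh-ch",
        "content_finger": 1234567890123456789,
        "public_time": 1500000000,
    },
    "links": [],                             # list of Link objects
}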
def _get_crawl_history(self, extract_info, crawl_info):
    crawl_historys = []
    crawl_history = CrawlHistory()
    try:
        crawl_history.download_time = crawl_info.download_time
        crawl_history.content_finger = extract_info.content_finger
        crawl_history.link_finger = extract_info.link_finger
        crawl_history.real_title = extract_info.html_tag_title
        crawl_history.analyse_title = extract_info.analyse_title
        crawl_history.content_time = extract_info.content_time
        crawl_history.page_type = extract_info.content_type
        crawl_history.status_code = crawl_info.status_code
        crawl_history.http_code = crawl_info.http_code
        crawl_history.page_size = crawl_info.page_size
        self.inner_links.sort(key=lambda inner_link: get_md5_i64(inner_link.url), reverse=True)
        inner_links_str = ','.join(map(lambda link: link.url, self.inner_links))
        crawl_history.inner_link_finger = get_md5_i64(inner_links_str)
        crawl_history.inner_links_num = len(self.inner_links)
        crawl_history.outer_links_num = len(self.outer_links)
        # TODO: the fields below still need to be filled in properly
        crawl_history.new_links_num_for_self = 0
        crawl_history.good_links_num_for_for_self = 0
        crawl_history.new_links_num_for_all = 0
        crawl_history.good_links_num_for_all = 0
        crawl_history.dead_page_type = 0
        crawl_history.dead_page_time = 0
    except Exception as e:
        self.logger.warning(e.message)
    crawl_historys.append(crawl_history)
    try:
        old_history_link = self.history_links_cache.get(self.url_info.get('url'))
        if old_history_link and old_history_link.get('link_attr'):
            crawl_historys.extend(old_history_link['link_attr'].normal_crawl_his)
            crawl_historys = crawl_historys[:7]
    except Exception:
        # an exception here most likely just means there is no history yet
        pass
    return crawl_historys
def restart_seed(self, seed_id, site):
    self.log.info('restart seed, seed_id:%s, site:%s' % (seed_id, site))
    site_id = get_md5_i64(site)
    seeds = self.loader.load_seed_by_id(seed_id, True)
    site_scheduler = self.site_schedulers.get(site_id, None)
    if not site_scheduler:
        return True
    site_scheduler.add_seed_json(seeds)
    return True
def clear_one_site_cache(self, site):
    site_id = get_md5_i64(site)
    site_scheduler = self.site_schedulers.get(site_id, None)
    if not site_scheduler:
        self.log.warning(
            "clear one site cache fail, site:%s, site_id:%s, site_scheduler_key:%s"
            % (site, site_id, self.site_schedulers.keys()))
        return True
    try:
        site_scheduler.clear_one_site_cache()
    except Exception:
        self.log.error("clear site:%s cache error: %s" % (site, traceback.format_exc()))
        return False
def stop_one_site_tasks(self, site):
    self.log.info('stop site:%s' % site)
    site_id = get_md5_i64(site)
    site_scheduler = self.site_schedulers.get(site_id, None)
    if not site_scheduler:
        self.log.info("stop_one_site_tasks\tsite:%s\t not_exist" % site)
        return False
    try:
        self.running_site_schedulers.pop(site_id)
    except KeyError:
        self.log.warning(
            "site_id:%s not in self.running_site_schedulers:%s"
            % (site_id, self.running_site_schedulers))
def process_request(self, request):
    try:
        site_id = request.url.split('/')[2]
    except Exception:
        site_id = request.get('url')
    site_id = get_md5_i64(site_id)
    try:
        site_id = int(site_id)
    except Exception:
        site_id = None
    cookies = get_cookie(site_id)
    if not cookies:
        return
    cookie = cookies[random.randint(0, sys.maxint) % len(cookies)]
    self.logger.info("using cookie site_id: %s user_id: %s" % (site_id, cookie['user_id']))
    request.info.setdefault('headers', {})['Cookie'] = cookie['cookie']
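# Illustrative only: get_cookie(site_id) is assumed here to return a list of
# dicts shaped like the ones built by load_sites() above; one is picked at
# random and its 'cookie' value is sent as the Cookie header.
example_cookies = [
    {"user_id": "u001", "cookie": "SESSIONID=abc123"},
    {"user_id": "u002", "cookie": "SESSIONID=def456"},
]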
def _extract_common_data(self, xdoc):
    result = {}
    try:
        result = SFExtractor().extract(
            etree2string(xdoc, pretty_print=True).decode(encoding='utf-8'))
    except Exception:
        pass
    lang = xdoc.xpath('string(//html/@lang)')
    if isinstance(lang, unicode):
        lang = lang.encode('utf-8')
    if lang == '':
        # default to zh-ch
        lang = 'zh-ch'
    result['lang'] = lang
    result['content'] = result.get('content', '').encode('utf-8')
    result['title'] = result.get('title', '').encode('utf-8')
    result['content_finger'] = get_md5_i64(result.get('content', ''))
    try:
        result['public_time'] = parser_tool.string2timestamp(result.get('public_time', ''))
        # discard timestamps that do not fit in a signed 32-bit integer
        if result['public_time'] > 2147483648 or result['public_time'] < -2147483648:
            result['public_time'] = None
    except Exception:
        result['public_time'] = None
    return result
def _extract_single(self, x, r):
    result = ""
    value_type = r.get("$value_type")
    parse_method = r.get("$parse_method")
    parse_rule = r.get("$parse_rule")
    if not etree.iselement(x):
        raise Exception("Invalid etree element.")
    if value_type == ValueType.recursion:
        result = []
        if r.get("$each"):
            if parse_method == ParseType.path:
                for ex in x.xpath(parse_rule):
                    result.append(self._extract_multi(ex, r.get("$each")))
            elif parse_method == ParseType.regex:
                for ex in ReExtends(etree2string(x)).parse(parse_rule.encode('utf-8')):
                    result.append(self._extract_multi(etree.HTML(ex), r.get("$each")))
    elif value_type == ValueType.array:
        if parse_method == ParseType.path:
            result = [o.strip() if isinstance(o, basestring) else _collect_string_content(o)
                      for o in x.xpath(parse_rule)]
        elif parse_method == ParseType.regex:
            result = [o.strip() for o in ReExtends(etree2string(x)).parse(parse_rule.encode('utf-8'))]
    elif value_type == ValueType.plain_text:
        if parse_method == ParseType.path:
            f = x.xpath(parse_rule)
            if isinstance(f, basestring):
                result = f.strip()
            elif isinstance(f, list):
                result = "\t".join([o.strip() if isinstance(o, basestring) else o.xpath('string(.)')
                                    for o in f])
                result = result.strip()
            else:
                raise Exception("type {} can't convert to plain_text.".format(type(f)))
        elif parse_method == ParseType.regex:
            result = "\t".join([o.strip() for o in ReExtends(etree2string(x)).parse(parse_rule.encode('utf-8'))])
    elif value_type == ValueType.html:
        if parse_method == ParseType.path:
            result = "\t".join((etree2string(o) if o is not None else "") for o in x.xpath(parse_rule))
        elif parse_method == ParseType.regex:
            result = "\t".join(ReExtends(etree2string(x)).parse(parse_rule.encode('utf-8')))
    elif value_type == ValueType.link:
        result = []
        links = []
        if parse_method == ParseType.path:
            f = x.xpath(parse_rule)
            if isinstance(f, basestring):
                links.append(f.strip().split('#')[0])
            else:
                links.extend([o.strip().split('#')[0] if isinstance(o, basestring) else o.xpath('string(.)')
                              for o in f])
        elif parse_method == ParseType.regex:
            links.extend(o.strip().split('#')[0]
                         for o in ReExtends(etree2string(x)).parse(parse_rule.encode('utf-8')))
        for l in links:
            if l.startswith('javascript:'):
                continue
            l = urljoin(self.base_url, l)
            result.append(Link(url=l, url_id=get_md5_i64(l), anchor=""))
    return result
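# Illustrative only: the shape of a rule dict consumed by _extract_single()
# above. The real values come from the ValueType / ParseType constants used in
# that code; plain strings are shown here as placeholders, and the XPath is
# made up.
example_rule = {
    "$value_type": "link",     # e.g. ValueType.link
    "$parse_method": "path",   # e.g. ParseType.path (XPath)
    "$parse_rule": "//div[@class='news-list']//a/@href",
}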
def _extract_single(self, x, r):
    result = ""
    value_type = r.get("$value_type")
    parse_method = r.get("$parse_method")
    parse_rule = r.get("$parse_rule")
    if not isinstance(x, (list, dict)):
        raise Exception("{} type is {}, not list or dict".format(str(x), type(x)))
    if value_type == ValueType.recursion:
        result = []
        if r.get("$each"):
            if parse_method == ParseType.jsonpath:
                tmp = parse(parse_rule).find(x)
                if len(tmp) and isinstance(tmp[0].value, list):
                    for ex in tmp[0].value:
                        result.append(self._extract_multi(ex, r.get("$each")))
                elif len(tmp) and isinstance(tmp[0].value, dict):
                    result.append(self._extract_multi(tmp[0].value, r.get("$each")))
            elif parse_method == ParseType.regex:
                raise Exception("RegEx cannot be used for recursion.")
    elif value_type == ValueType.array:
        result = []
        if parse_method == ParseType.jsonpath:
            tmp = parse(parse_rule).find(x)
            if len(tmp) > 0:
                if isinstance(tmp[0].value, list):
                    for each in tmp:
                        result.extend(each.value)
                else:
                    for each in tmp:
                        result.append(each.value)
        elif parse_method == ParseType.regex:
            ss = json.dumps(x)
            tmp = ReExtends(ss).parse(parse_rule)
            if not isinstance(tmp, list):
                tmp = [tmp]
            result.extend(tmp)
    elif value_type == ValueType.plain_text:
        result = []
        if parse_method == ParseType.jsonpath:
            tmp = parse(parse_rule).find(x)
            if len(tmp) > 0:
                if not isinstance(tmp[0].value, basestring):
                    result = "\t".join([json.dumps(each.value) for each in tmp])
                else:
                    result = "\t".join([each.value for each in tmp])
        elif parse_method == ParseType.regex:
            ss = json.dumps(x)
            tmp = ReExtends(ss.decode('utf-8')).parse(parse_rule)
            if not isinstance(tmp, list):
                tmp = [tmp]
            result = "\t".join(tmp)
    elif value_type == ValueType.link:
        result = []
        links = []
        if parse_method == ParseType.jsonpath:
            tmp = parse(parse_rule).find(x)
            if len(tmp) > 0:
                if isinstance(tmp[0].value, basestring):
                    links = [each.value for each in tmp]
        elif parse_method == ParseType.regex:
            ss = json.dumps(x)
            links = ReExtends(ss).parse(parse_rule)
            if not isinstance(links, list):
                links = [links]
        for l in links:
            if l.startswith('javascript:'):
                continue
            l = urljoin(self.base_url, l)
            result.append(Link(url=l, url_id=get_md5_i64(l), anchor=u''))
    return result
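# Illustrative only: a nested rule for the JSON variant above, with placeholder
# strings standing in for the ValueType / ParseType constants. The layout of
# "$each" (field name -> sub-rule) is an assumption about _extract_multi(),
# which is not shown here; the JSONPath expressions are made up.
example_json_rule = {
    "$value_type": "recursion",
    "$parse_method": "jsonpath",
    "$parse_rule": "$.data.items",
    "$each": {
        "title": {"$value_type": "plain_text", "$parse_method": "jsonpath", "$parse_rule": "$.title"},
        "url": {"$value_type": "link", "$parse_method": "jsonpath", "$parse_rule": "$.url"},
    },
}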
def modify_seeds(self, datas):
    seeds = {}
    count = 0
    site = None
    for data in datas:
        task = {}
        seed_id = -1
        count += 1
        task['variable_param_list'] = []
        task['page_turning_rule'] = {}
        task['session_commit'] = {}
        task['data'] = {}
        for i in range(len(data)):
            key = self.seed_keys[i][0]
            if key in ['mode', 'site_name']:
                continue
            val = data[i]
            if not val and val != 0:
                continue
            if key == 'id':
                key = 'seed_id'
                seed_id = val
            elif key.find('check_body') > -1:
                val = val.replace('\'', '"')
            key = str_obj(key)
            val = str_obj(val)
            task[key] = self.macro_function(val)
            if key == "site":
                site = val
                site_id = get_md5_i64(val)
            if key == 'variable_params':
                var_conf = json.loads(val)
                if var_conf:
                    var_type = var_conf.get('type', "")
                    if var_type == 'mongo':
                        task['variable_param_list'] = []
                        mongo_conf = var_conf.get('mongo_data', {})
                        if mongo_conf:
                            task['variable_param_list'] = self.load_var_params(mongo_conf)
                    elif var_type == 'map':
                        task['variable_param_list'] = var_conf.get('map_data', [])
                    elif var_type == 'json':
                        task['variable_param_list'] = var_conf.get('json_data', [])
                task.pop('variable_params')
            elif key == 'page_turning_rule':
                task['page_turning_rule'] = json.loads(val)
            elif key == 'session_commit':
                task['session_commit'] = json.loads(val)
            elif key == 'data':
                task['data'] = json.loads(self.macro_function(val))
            elif key == 'http_header':
                task['http_header'] = json.loads(val)
            elif key == 'config_init_period':
                task['config_init_period'] = json.loads(val)
        # needed when resetting the page-turning parameters
        # task['origin_url'] = task['url']
        # task['origin_data'] = task['data']
        # task['current_variable_param'] = json.dumps([])
        task['site_id'] = site_id
        seeds[seed_id] = task
    mysql_execute(self.mysql_conn, 'UPDATE seeds SET mode="off" WHERE is_once="true"')
    self.log.info("config_loader\tsite:%s\tseeds:%d" % (site, len(seeds)))
    return seeds
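# Illustrative only: the three accepted shapes of the `variable_params` column
# parsed above. The map/json payloads are made up; the contents of
# `mongo_data` depend on load_var_params(), which is not shown here.
variable_params_examples = [
    {"type": "map", "map_data": [{"page": "1"}, {"page": "2"}]},
    {"type": "json", "json_data": [{"page": "1"}]},
    {"type": "mongo", "mongo_data": {}},
]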
def test_parser_config():
    if not request.json:
        return jsonify({'status': 'failed', 'data': 'request error'})
    req_datas = request.json
    download_req = DownLoadReq()
    download_req.method = req_datas.get('request_method', 'get')
    download_req.url = req_datas.get('request_url')
    download_req.download_type = req_datas.get('download_type')
    download_req.post_data = {}
    download_req.http_header = {}
    try:
        download_req.http_header = json.loads(req_datas.get('headers'))
    except Exception:
        download_req.http_header = None
    post_data = None
    try:
        post_data = json.loads(req_datas.get('request_params'))
    except Exception:
        pass
    parser_id = req_datas.get('parser_id', "-1")
    page_source = req_datas.get('page_source').strip()
    if page_source not in ['cache', 'downloader', 'pagedb', 'input']:
        page_source = 'cache'
    hz_url = download_req.url
    if post_data and download_req.method == "post":
        hz_url = build_hzpost_url(download_req.url, post_data)
        download_req.url = hz_url
    spend_time = {}
    try:
        page_id = get_md5_i64(hz_url)
        download_rsp = None
        stime = time.time()
        if page_source == 'pagedb':
            download_rsp = current_app.config['crawler_merge'].select_one(hz_url)
            if download_rsp.status == 1:
                download_rsp = None
        elif page_source == 'cache':
            download_rsp = get_page_cache(page_id)
        elif page_source == 'input':
            download_rsp = DownLoadRsp()
            download_rsp.url = hz_url
            download_rsp.status = 0
            download_rsp.content_type = "text"
            download_rsp.http_code = 200
            download_rsp.download_time = 0
            download_rsp.content = req_datas.get('input_page', "").encode('utf8')
            download_rsp.src_type = "input"
            download_rsp.elapsed = 50
        if not download_rsp:
            downloader = current_app.config['downloader']
            download_rsp = downloader.download(hz_url, download_req)
            download_rsp.url = hz_url
            spend_time['download_spend'] = (time.time() - stime) * 1000
            set_page_cache(page_id, download_rsp)
        is_save = req_datas.get('is_save', 'false')
        if is_save == "true":
            download_rsp.parse_extends = json.dumps({'parser_id': parser_id})
            download_rsp_tube = current_app.config['put_beanstald_server'].get_tube_by_name('download_rsp_tube')
            if download_rsp_tube:
                current_app.config['put_beanstald_server'].save_record({
                    'tube_name': download_rsp_tube,
                    'obj': download_rsp
                })
        # copy download_rsp to guard against modification by other threads
        download_rsp = deepcopy(download_rsp)
        download_rsp.parse_extends = json.dumps({"parser_id": parser_id, "debug": True})
        extractor = current_app.config['extractor']
        stime = time.time()
        extract_rsp = extractor.extract(download_rsp)
        spend_time['extract_spend'] = (time.time() - stime) * 1000
        # list of entity-extraction results
        entity_datas = None
        # schema check result
        schema_check_result = None
        entity_rsps = None
        cur_datetime = str(datetime.datetime.now())
        try:
            stime = time.time()
            extract_data = extract_rsp.extract_info.extract_data
            if extract_data:
                extract_data_dict = json.loads(extract_data)
                _src = {
                    "url": extract_rsp.base_info.url,
                    "site_id": extract_rsp.base_info.site_id,
                    "site": extract_rsp.base_info.site
                }
                if "datas" in extract_data_dict:
                    datas = extract_data_dict['datas']
                    tmp_datas = []
                    for d in datas:
                        d['_src'] = [_src]
                        tmp_datas.append(d)
                    extract_data_dict['datas'] = tmp_datas
                else:
                    extract_data_dict['_src'] = [_src]
                extract_rsp.extract_info.extract_data = json.dumps(extract_data_dict)
            entity_rsps = current_app.config['entity_extractor'].entity_extract(extract_rsp)
            spend_time['entity_spend'] = (time.time() - stime) * 1000
            entity_datas = []
            for data in entity_rsps.entity_data_list:
                if data:
                    entity_datas.append(json.loads(data.entity_data))
                else:
                    entity_datas.append(None)
        except Exception as e:
            if entity_rsps:
                entity_datas = {
                    'sys_error': e.message,
                    'error_message': entity_rsps.msg
                }
            else:
                entity_datas = {'sys_error': e.message}
        final_data = {}
        try:
            if entity_rsps.entity_data_list:
                entity_json = {
                    "topic_id": entity_rsps.entity_data_list[0].topic_id,
                    "data": json.loads(entity_rsps.entity_data_list[0].entity_data)
                }
                datasaver_resp = current_app.config['data_saver'].check_data(json.dumps(entity_json))
                final_data = json.loads(datasaver_resp.data)
        except Exception as e:
            final_data = {'sys_error': e.message}
        return jsonify({
            'status': True,
            'data': build_test_parser_config_rsp(extract_rsp, entity_datas, final_data, spend_time)
        })
    except Exception as e:
        current_app.config['logger'].error(hz_url)
        current_app.config['logger'].info(traceback.format_exc())
        return jsonify({'status': False, 'data': e.message})
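# Illustrative only: a request body accepted by test_parser_config(), built
# from the keys it reads above. All values are made up; note that 'headers'
# and 'request_params' are JSON-encoded strings because the view json.loads()
# them, and the 'download_type' value is an assumption about the downloader.
example_request_json = {
    "request_method": "get",
    "request_url": "http://example.com/list",
    "download_type": "simple",
    "headers": "{\"User-Agent\": \"Mozilla/5.0\"}",
    "request_params": "{}",
    "parser_id": "42",
    "page_source": "cache",   # one of: cache, downloader, pagedb, input
    "input_page": "",
    "is_save": "false",
}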