Example #1
def find_webpage(self, link_url="http://www.baidu.com/"):
    url_struct = get_url_info(link_url)
    domain = url_struct.get("domain", "baidu.com")
    urls_info = self.webpage_db[domain].find({'url': link_url})
    for url_info in urls_info:
        # Stringify the ObjectId so the record is JSON-serializable
        url_info['_id'] = str(url_info['_id'])
        return url_info
Example #2
def find_linkbase(self, link_url="http://www.baidu.com/"):
    url_struct = get_url_info(link_url)
    domain = url_struct.get("domain", "baidu.com")
    urls_info = self.linkbase_db[domain].find({'url': link_url})
    for url_info in urls_info:
        json_dict = {}
        link_str = url_info['link_attr']
        # The link attributes are stored as a pickled object
        link_attr = pickle.loads(link_str)
        if not link_attr:
            return None
        if not link_attr.url:
            link_attr.url = link_url
        # Flatten the object into a JSON-serializable dict
        link_attr = vars(link_attr)
        for key, val in link_attr.items():
            if not val:
                json_dict[key] = None
            elif key in ("crawl_info", "parent_info", "page_info", "extract_message"):
                json_dict[key] = vars(val)
            elif key == "normal_crawl_his" and val:
                json_dict[key] = []
                for his in val:
                    json_dict[key].append(vars(his))
            else:
                json_dict[key] = str(val)
        return json_dict
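
find_linkbase assumes the link_attr field holds a pickled object whose attributes are then flattened with vars() into a JSON-serializable dict. A minimal round-trip sketch of that idea, using a made-up LinkAttr class rather than the project's real type:

import pickle


class LinkAttr(object):   # stand-in class, for illustration only
    def __init__(self, url=None, link_type=None):
        self.url = url
        self.link_type = link_type


stored = pickle.dumps(LinkAttr(url=None, link_type=3))  # roughly what the DB would hold
link_attr = pickle.loads(stored)
if not link_attr.url:
    link_attr.url = 'http://www.baidu.com/'
print(vars(link_attr))  # {'url': 'http://www.baidu.com/', 'link_type': 3} (key order may vary)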
Example #3
def nor_to_hz(self, request):
    if request.post_data and request.method == "post":
        url_info = get_url_info(request.url)
        query_info = url_query_decode(url_info.get('query'))
        # Fold the POST body into the query string as base64-encoded JSON
        query_info['HZPOST'] = base64_encode_json(request.post_data)
        hz_url = request.url.split("?")[0] + "?" + urllib.urlencode(
            query_info)
        request.url = hz_url
Example #4
def hz_to_nor(self, request):
    url_info = get_url_info(request.url)
    query_info = url_query_decode(url_info.get('query'))
    # Inverse of nor_to_hz: restore the POST body from the HZPOST parameter
    request.post_data = base64_decode_json(query_info.get('HZPOST'))
    query_info.pop('HZPOST')
    nor_url = request.url.split("?")[0] + "?" + urllib.urlencode(
        query_info)
    request.url = nor_url
    request.method = 'post'
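
Examples #3 and #4 are inverses of each other: nor_to_hz folds a POST body into the URL query string as base64-encoded JSON under the HZPOST key, and hz_to_nor restores it. Below is a minimal, self-contained round-trip sketch (Python 2, matching the snippets); the Request class and the two base64 helpers are stand-ins whose exact behaviour is assumed, not the project's real implementations.

import base64
import json
import urllib
import urlparse


def base64_encode_json(obj):   # assumed helper: JSON-serialize then base64-encode
    return base64.b64encode(json.dumps(obj))


def base64_decode_json(s):     # assumed helper: base64-decode then JSON-parse
    return json.loads(base64.b64decode(s))


class Request(object):         # bare request holder, for illustration only
    def __init__(self, url, method='get', post_data=None):
        self.url, self.method, self.post_data = url, method, post_data


post_data = {'page': '1', 'keyword': 'test'}
req = Request('http://example.com/search?foo=bar', 'post', post_data)

# nor_to_hz direction: move the POST body into the query string under HZPOST
query = dict(urlparse.parse_qsl(urlparse.urlparse(req.url).query))
query['HZPOST'] = base64_encode_json(req.post_data)
req.url = req.url.split('?')[0] + '?' + urllib.urlencode(query)

# hz_to_nor direction: recover the POST body from the HZPOST parameter
query = dict(urlparse.parse_qsl(urlparse.urlparse(req.url).query))
restored = base64_decode_json(query.pop('HZPOST'))
assert restored == post_data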
Example #5
def _get_next_page_by_get_count(self):
    next_page = self.base_url
    url_info = get_url_info(self.base_url)
    query_info = url_query_decode(url_info.get('query'))
    # Advance the count parameter in the GET query to build the next-page URL
    new_query = self._build_count_query(query_info)
    if not new_query:
        return None
    next_page = next_page.split('?')[0] + "?" + urllib.urlencode(new_query)
    return next_page
Example #6
def is_matched(url, rule):
    site_prefix = rule.get('site_prefix')
    url_format = rule.get('url_format')
    url_type = rule.get('url_type')
    site = get_url_info(url).get('site')
    if isinstance(site_prefix, basestring):
        m_site = re.match(site_prefix, site)
        if m_site and isinstance(url_format, basestring):
            m_url = re.match(url_format, url)
            if m_url and isinstance(url_type, int):
                return True
    return False
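
is_matched expects a rule dict with site_prefix and url_format regexes plus an integer url_type, and relies on get_url_info(url)['site']. A hypothetical call (Python 2), with get_url_info stubbed purely for illustration and is_matched from Example #6 in scope:

from urlparse import urlparse


def get_url_info(url):   # stub for illustration; the real helper returns more fields
    return {'site': urlparse(url).netloc}


rule = {
    'site_prefix': r'news\.',                               # regex matched against the site
    'url_format': r'http://news\.example\.com/\d+\.html',   # regex matched against the full URL
    'url_type': 1,                                          # must be an int for the rule to apply
}

print(is_matched('http://news.example.com/123.html', rule))  # True
print(is_matched('http://blog.example.com/123.html', rule))  # False: site_prefix does not match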
Example #7
def pack_base_info(self, download_rsp):
    """
    :param download_rsp: DownloadRsp
    :return: BaseInfo
    """
    base_info = BaseInfo()
    base_info.url = download_rsp.url
    url_info = get_url_info(base_info.url)
    base_info.domain_id = url_info.get('domain_id')
    base_info.domain = url_info.get('domain')
    base_info.site = url_info.get('site')
    base_info.site_id = url_info.get('site_id')
    base_info.url_id = url_info.get('url_id')
    base_info.src_type = download_rsp.src_type
    return base_info
Example #8
def _get_next_page_by_post_count(self):
    next_page = self.base_url
    url_info = get_url_info(self.base_url)
    query_info = url_query_decode(url_info.get('query'))
    post_param_str = query_info.get('HZPOST', None)
    if not post_param_str:
        return None
    # The POST parameters are carried base64-encoded in the HZPOST query
    # parameter; advance their count parameter and re-encode them
    post_param = base64_decode_json(post_param_str)
    next_post_param = self._build_count_query(post_param)
    if not next_post_param:
        return None
    query_info['HZPOST'] = base64_encode_json(next_post_param)
    next_page = next_page.split("?")[0] + "?" + urllib.urlencode(query_info)
    return next_page
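
Both pagination helpers (Examples #5 and #8) delegate to a _build_count_query method that is not shown in these examples. Purely as a hypothetical illustration, assuming the query carries a numeric paging parameter such as 'pn' and that returning None signals the last page, it could look roughly like this:

def _build_count_query(self, query_info, page_key='pn', step=1, max_page=100):
    # Hypothetical sketch only; the real parameter name and limits are unknown.
    if not query_info or page_key not in query_info:
        return None
    try:
        current = int(query_info[page_key])
    except (TypeError, ValueError):
        return None
    if current + step > max_page:
        return None  # treat anything past max_page as exhausted
    new_query = dict(query_info)
    new_query[page_key] = str(current + step)
    return new_query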
Example #9
    def start_convert(self, page_parseinfo):
        link_attr = None
        try:
            if not page_parseinfo: return None

            extractor_crawl_info = page_parseinfo.crawl_info
            base_info = page_parseinfo.base_info
            extract_info = page_parseinfo.extract_info
            url = base_info.url
            url_info = get_url_info(url)
            data_extends = page_parseinfo.data_extends
            self.log.info('merge_start\turl:{}'.format(url))
            # Process the webpage info
            webpage_merge = WebpageMerge(self.webpage_connection)
            webpage_obj = webpage_merge.merge_webpage(
                base_info, extractor_crawl_info, page_parseinfo.data_extends,
                page_parseinfo.parse_extends)
            # Save the processed webpage back to the database
            webpage_merge.save_webepage(base_info.domain, webpage_obj)

            # scheduler should be a JSON-formatted string
            scheduler_obj = {}
            try:
                scheduler_obj = json.loads(page_parseinfo.scheduler)
            except Exception as e:
                pass
            base_crawl_info = dict(extractor_crawl_info.__dict__)
            base_crawl_info.update(scheduler_obj)

            if base_crawl_info.get("status_code") != 0:
                return None

            link_merge = LinkMerge(self.link_connection, url_info,
                                   extract_info.links, self.log)
            link_attr = link_merge.merge_link_attr(base_info, extract_info,
                                                   scheduler_obj,
                                                   base_crawl_info,
                                                   data_extends)
            # Save the processed link attributes back to the database
            link_merge.save_link_attrs()

            self.log.info('merge url:{}'.format(base_info.url))
            if base_info.src_type == "webpage":
                return None
        except Exception as e:
            self.log.error("url:{}".format(traceback.format_exc()))
        return link_attr
Example #10
def select_webpage_by_url(self, url):
    self.log.info("select_webpage_by_url start\turl:{}".format(url))
    url = url_encode(url)
    download_result = DownLoadRsp(url=url,
                                  download_time=int(time.time()),
                                  status=1,
                                  content_type='text/html',
                                  page_size=0,
                                  elapsed=100,
                                  content=None,
                                  redirect_url=url,
                                  src_type='webpage',
                                  http_code=0)
    try:
        query_item = {'url': url}
        domain = get_url_info(url).get('domain')
        result = self.mongo_client_web.find_first(domain, query_item)
        if result and result.get('content'):
            download_result = self.get_download_rsp(result)
    except Exception:
        self.log.error("select_webpage_by_url\turl\t{0}\terror:{1}".format(
            url, traceback.format_exc()))
    self.log.info("select_webpage_by_url finish\turl:{}".format(url))
    return download_result
Example #11
    def schedule_task(self, task):
        # dispatched_task_queue is the special queue, having the highest priority
        if not task:
            return None
        site_info = get_url_info(task.url)
        site_scheduler = self.get_site_scheduler(site_info)

        if site_scheduler:
            self.site_empty[site_scheduler.site_id] = False
        else:
            site_info = self.sites.get(site_info['site_id'], {})
            if site_info == {}:
                return None
            site_info['avg_interval'] = 10
            site_scheduler = SiteScheduler(site_info, self.conf['redis_tasks'],
                                           self.conf['log'],
                                           self.site_statistic,
                                           self.seed_statistic)
            self.site_schedulers[site_info['site_id']] = site_scheduler
            self.start_one_site_tasks(site_info['site'])
            self.log.info('schedule_task\turl:%s\tsite:%s\tnot_exist' %
                          (task.url, site_info['site']))

        return site_scheduler.schedule(task)
Example #12
class Extractor(object):
    CHARSET_PATTERN = re.compile(
        '<meta[^>]*?(?:charset|CHARSET)=["\']?([a-zA-Z0-9\\-]+)["\']?[^>]*?>',
        re.I | re.S)
    ENCODING_PATTERN = re.compile(
        '<\?xml[^>]*? (?:encoding="[a-zA-Z0-9\\-]+")[^>]*?\?>', re.I | re.S)
    BR_PATTERN = re.compile("</\s*br\s*>", re.I | re.S)
    CONTENT_THRESHOLD = 200

    def __init__(self, conf):
        self.log = conf['log']
        self.log.info('Extractor load start')
        self.conf = conf
        self.config_handler = ConfigHandler(conf, self.log)
        self.plugin_handler = PluginHandler()
        self.log.info('Extractor load finish')

    def _get_charset(self, content):
        nodes = re.findall(self.CHARSET_PATTERN, content)
        if nodes:
            return nodes[0]
        else:
            return None

    # Decode the page body (tries common encodings, then the declared charset)
    def decode_body(self, body, link, download_type='simple'):
        if not body or isinstance(body, unicode):
            return body, None
        if download_type == 'phantom':  # pages fetched by phantomjs are always utf-8 encoded
            try:
                return body.decode('utf-8'), 'utf-8'
            except:
                pass
        charset = self._get_charset(body)
        for try_charset in ['utf-8', 'gb18030', 'gbk', 'utf-16']:
            try:
                return body.decode(try_charset), try_charset
            except Exception as e:
                pass
        try:
            return body.decode(charset, errors='ignore'), charset
        except Exception as e:
            pass
        self.log.warning(
            "the page from {} could not be decoded correctly".format(link))
        return None, None

    def pack_crawl_info(self, download_rsp):
        """
        :param download_rsp: DownloadRsp
        :return: CrawlInfo
        """
        craw_info = CrawlInfo()
        craw_info.content = download_rsp.content
        craw_info.status_code = download_rsp.status
        craw_info.http_code = download_rsp.http_code
        craw_info.download_time = download_rsp.download_time
        craw_info.redirect_url = download_rsp.redirect_url
        craw_info.elapsed = download_rsp.elapsed
        craw_info.content_type = download_rsp.content_type
        craw_info.page_size = download_rsp.page_size
        return craw_info

    def pack_base_info(self, download_rsp):
        """
        :param download_rsp: DownloadRsp
        :return: BaseInfo
        """
        base_info = BaseInfo()
        base_info.url = download_rsp.url
        url_info = get_url_info(base_info.url)
        base_info.domain_id = url_info.get('domain_id')
        base_info.domain = url_info.get('domain')
        base_info.site = url_info.get('site')
        base_info.site_id = url_info.get('site_id')
        base_info.url_id = url_info.get('url_id')
        base_info.src_type = download_rsp.src_type
        return base_info

    def fix_links_info(self, links, custom_links, parser_config):
        """
        :param Links:
        :param link_extend_rule: [{'rule': '', 'parser_id': 1}]
        :return:
        """
        rets = []
        links_set = {}
        for link in links:
            links_set[link.url] = link
        for cl in custom_links:
            if cl.url in links_set:
                links_set[cl.url].parse_extends = cl.parse_extends
                links_set[cl.url].type = cl.type
            else:
                links_set[cl.url] = cl
        links = links_set.values()
        filter_rule = []
        try:
            for ll in parser_config.urls_rule:
                if ll['$parse_method'] == u"filter":
                    filter_rule.append(ll)
        except Exception as e:
            self.log.warning("filter_links failed, because {}".format(str(e)))
        for link in links:
            try:
                link.url = tools.url_encode(link.url)
                url_info = tools.get_url_info(link.url)
                link.domain = url_info.get('domain')
                link.site = url_info.get('site')
                link.site_id = url_info.get('site_id')
                link.domain_id = url_info.get('domain_id')
                link.url_id = url_info.get('url_id')
                if not link.type:
                    for r in filter_rule:
                        try:
                            if r.get('$parse_rule') and re.findall(
                                    r['$parse_rule'], link.url):
                                link_type = str(r['$link_type'])
                                if not str.isdigit(link_type):
                                    link_type = LinkType.kUnknownLink
                                else:
                                    link_type = int(link_type)
                                parser_id = str(r['$parser_id'])
                                if not str.isdigit(parser_id):
                                    parser_id = -1
                                else:
                                    parser_id = int(parser_id)
                                link.type = link_type
                                link.parse_extends = json.dumps(
                                    {'parser_id': parser_id})
                                break
                        except Exception as e:
                            pass
                    if not link.type:
                        link.type = LinkType.kUnknownLink
                rets.append(link)
            except Exception as e:
                self.log.warning(
                    "fix_links_info of {} error, because of {}".format(
                        link.url, e.message))

        return rets
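
decode_body above sniffs a charset from the meta tag but still tries a fixed list of encodings first, falling back to the declared charset with errors ignored only as a last resort. A condensed, standalone sketch of that strategy (Python 2; the regex is simplified from CHARSET_PATTERN):

import re

CHARSET_RE = re.compile(
    r'<meta[^>]*?charset=["\']?([a-zA-Z0-9\-]+)["\']?[^>]*?>', re.I | re.S)


def decode_body(body):
    # Mirrors Extractor.decode_body minus the phantomjs branch and logging.
    if not body or isinstance(body, unicode):
        return body, None
    match = CHARSET_RE.search(body)
    declared = match.group(1) if match else None
    for charset in ['utf-8', 'gb18030', 'gbk', 'utf-16']:
        try:
            return body.decode(charset), charset
        except Exception:
            pass
    if declared:
        try:
            return body.decode(declared, errors='ignore'), declared
        except Exception:
            pass
    return None, None


html = '<meta charset="gbk"><html>\xc4\xe3\xba\xc3</html>'
print(decode_body(html))  # the GBK bytes decode via the 'gb18030' fallback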