Code Example #1
File: rabbitmq.py  Project: SamChen1981/spider-1
    def sendtask2Rabbit(cls, message, func=None, exchange_name='', **kwargs):
        if not isinstance(message, list):
            return
        for msg in message:
            if "route_key" not in msg:
                raise exceptions.RoutingKEYNotExisted
            rabbitmsg = ''
            # The incoming value is a dict that carries meta information
            if isinstance(msg, dict):
                msg.update(kwargs)
                rabbitmsg = json.dumps(msg, ensure_ascii=False)
            if isinstance(msg, (str, unicode)):
                # When there is no meta information
                resultmsgs = {'url': msg}
                if func:
                    resultmsgs.update({"func": func})
                resultmsgs.update(kwargs)
                rabbitmsg = json.dumps(resultmsgs, ensure_ascii=False)
            try:
                TaskQueue.channel.basic_publish(
                    exchange=exchange_name,
                    routing_key=msg['route_key'],
                    body=rabbitmsg,
                    properties=pika.BasicProperties(
                        delivery_mode=constant.MESSAGE_PERSISTENCE,
                        # make message persistent
                    ))
            except Exception as e:
                logger.error(e)
                sys.exit()

            print " [x] Sent %r" % (rabbitmsg, )
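For reference, a minimal usage sketch of the publisher above, assuming the classmethod lives on the TaskQueue class and its channel is already connected; the message contents and the extra retry kwarg are illustrative assumptions, not part of the original excerpt:

# Hypothetical usage; the messages and extra kwargs below are assumptions.
messages = [
    {"url": "http://example.com/page/1", "route_key": "spider.fetch"},
    {"url": "http://example.com/page/2", "route_key": "spider.fetch"},
]
# Each dict is merged with the extra kwargs, serialized to JSON, and
# published with delivery_mode set to MESSAGE_PERSISTENCE.
TaskQueue.sendtask2Rabbit(messages, func="parse_detail", retry=0)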
Code Example #2
File: base.py  Project: SamChen1981/spider-1
    def go_get_it(self, url_body):
        try:
            for middleware_method in self.url_preopenermiddleware:
                middleware_method(url_body)
            for middleware_method in self.url_openermiddleware:
                content = middleware_method(url_body)
                url_body[constant.RESPONSE_SIGNATURE] = content
                logger.debug(url_body[constant.RESPONSE_SIGNATURE])
            # filter returns all next-level links under the current url; it can be a list or a dict.
            # The crawling strategy can be customized per site here. If _filter_middleware is None,
            # a NotImplement exception is raised. A filter-chain backend must implement is_filter()
            # and filter():
            # 1. is_filter: decide whether next-level links should be parsed from the current page
            # 2. filter: analyze the page and collect all next-level links
            for middleware_method in self._filter_middleware:
                middleware_method(url_body)
            for middleware_method in self.url_parserermiddleware:
                middleware_method(url_body)
            for middleware_method in self._postfilter_middleware:
                middleware_method(url_body)
            for middleware_method in self.url_savemiddleware:
                middleware_method(url_body)
            # Push the raw links collected above (the next-level links to crawl)
            # into the queue and the staging area
            if constant.RAW_LINKS in url_body:
                refined_links = fetch_util.refine_links(
                    url_body[constant.RAW_LINKS])
                next_page(refined_links)
        except requests.exceptions.InvalidSchema as e:
            logger.error(e)
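The comments above describe the contract a filter-chain backend must satisfy: is_filter() and filter(). Below is a minimal sketch of such a backend wrapped in a middleware hook; the class names, the "response"/"rawlinks" dict keys, and the regex are assumptions for illustration, since the real key names live in the project's constant module:

# Hypothetical filter middleware sketch; only the hook/method names
# (process_filter, is_filter, filter) come from the excerpt above.
import re


class SimpleFilterBackend(object):
    """Filter-chain backend implementing the two required methods."""

    LINK_RE = re.compile(r'href="(http[^"]+)"')

    def is_filter(self, url_body):
        # Decide whether next-level links should be parsed from this page.
        return bool(url_body.get("response"))

    def filter(self, url_body):
        # Analyze the page and collect all next-level links.
        return self.LINK_RE.findall(url_body["response"])


class SimpleFilterMiddleware(object):
    """Hook that go_get_it() invokes via self._filter_middleware."""

    backend = SimpleFilterBackend()

    def process_filter(self, url_body):
        if self.backend.is_filter(url_body):
            url_body["rawlinks"] = self.backend.filter(url_body)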
Code Example #3
    def simpleExtractorAsList(cls, regx, content, *args, **kwargs):
        uniques = []
        try:
            regx = re.compile(regx)
            templist = re.findall(regx, content)
            uniques.extend(templist)
            return uniques
        except Exception as e:
            logger.error(" [x] Error occurred")
            logger.exception(e)
        return uniques
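A brief usage sketch, assuming the extractor is exposed as a classmethod on a hypothetical Extractor class (the class name and the HTML snippet are assumptions):

# Hypothetical call; the Extractor class name is an assumption.
html = '<a href="http://example.com/a">a</a><a href="http://example.com/b">b</a>'
links = Extractor.simpleExtractorAsList(r'href="(http[^"]+)"', html)
# links == ['http://example.com/a', 'http://example.com/b']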
Code Example #4
def import_by_path(dotted_path, error_prefix=''):
    """
    Import a dotted module path and return the attribute/class designated by the
    last name in the path. Raise ImproperlyConfigured if something goes wrong.
    """

    try:
        attr = import_string(dotted_path)
    except ImportError as e:
        msg = '%sError importing module %s: "%s"' % (
            error_prefix, dotted_path, e)
        logger.error(e)
        six.reraise(ImproperlyConfigured, ImproperlyConfigured(msg),
                    sys.exc_info()[2])
    return attr
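A short usage sketch; the dotted path is only an example, and import_string is assumed to behave like Django's utility of the same name (split on the last dot, import the module, fetch the attribute):

# Hypothetical usage: resolve a class from its dotted path.
OrderedDict = import_by_path('collections.OrderedDict')
ordered = OrderedDict([('a', 1), ('b', 2)])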
Code Example #5
File: base.py  Project: SamChen1981/spider-1
    def load_middleware(self):

        for middleware_path in settings.MIDDLEWARE_CLASSES:
            logger.debug("middleware_path: {0}".format(middleware_path))
            try:
                mw_module, mw_classname = middleware_path.rsplit('.', 1)
            except ValueError:
                raise exceptions.ImproperlyConfigured(
                    '%s isn\'t a middleware module' % middleware_path)
            try:
                mod = importlib.import_module(mw_module)
            except ImportError as e:
                raise exceptions.ImproperlyConfigured(
                    'Error importing middleware %s: "%s"' % (mw_module, e))
            try:
                mw_class = getattr(mod, mw_classname)
            except AttributeError:
                raise exceptions.ImproperlyConfigured(
                    'Middleware module "%s" does not define a "%s" class' %
                    (mw_module, mw_classname))
            try:
                mw_instance = mw_class()
            except exceptions.MiddlewareNotUsed as e:
                logger.error(e)
                continue

            if hasattr(mw_instance, 'process_filter'):
                self._filter_middleware.append(mw_instance.process_filter)
            if hasattr(mw_instance, 'process_postfilter'):
                self._postfilter_middleware.append(
                    mw_instance.process_postfilter)
            if hasattr(mw_instance, 'process_preopen'):
                self.url_preopenermiddleware.append(
                    mw_instance.process_preopen)
            if hasattr(mw_instance, 'process_open'):
                self.url_openermiddleware.append(mw_instance.process_open)
            if hasattr(mw_instance, 'process_parser'):
                self.url_parserermiddleware.append(mw_instance.process_parser)
            if hasattr(mw_instance, 'process_save'):
                self.url_savemiddleware.append(mw_instance.process_save)
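To illustrate the contract load_middleware() expects, here is a hedged sketch of a settings entry and a matching middleware class; the module path, class name, and header handling are assumptions, while the hook names mirror the hasattr() checks above:

# Hypothetical configuration sketch, not project code.

# settings.py
MIDDLEWARE_CLASSES = (
    'myspider.middleware.UserAgentMiddleware',
)

# myspider/middleware.py
import requests


class UserAgentMiddleware(object):
    """Implements only the hooks it needs; load_middleware() registers
    every hook it finds via hasattr()."""

    def process_preopen(self, url_body):
        # Runs before the page is opened.
        url_body.setdefault('headers', {})['User-Agent'] = 'spider-1'

    def process_open(self, url_body):
        # Fetches the page; go_get_it() stores the returned content
        # under constant.RESPONSE_SIGNATURE.
        return requests.get(url_body['url'],
                            headers=url_body.get('headers', {})).text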
Code Example #6
def saveHTML(content, **kwargs):
    '''
      Save the HTML. Files are saved according to the regex provided in settings.py.
    '''
    url = kwargs['url']
    file_ext = ".html"
    # Split at the rightmost '/' and build the directory path level by level
    dirstr = url.rsplit('/', 1)[0]
    # Strip the http:// prefix
    pattern = re.compile(r'http://')
    dirstr = re.sub(pattern, '', dirstr)
    # Strip the https:// prefix
    pattern = re.compile(r'https://')
    dirstr = re.sub(pattern, '', dirstr)
    # Strip newline characters
    pattern = re.compile(r'\n')
    dirstr = re.sub(pattern, '', dirstr)
    # Build the output directory from dirstr and the current date
    current_date = datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d')
    top_dir = os.path.join(os.getcwd(), dirstr)
    top_dir = os.path.join(top_dir, current_date)
    try:
        if not os.path.exists(top_dir):
            os.makedirs(top_dir)
    except OSError as e:
        if e.errno == os.errno.EEXIST:
            message = "'%s' already exists" % top_dir
            logger.error(message)
        else:
            message = e
            logger.info(message)
        return -1

    filepath = os.path.join(top_dir, str(uuid.uuid4()) + file_ext)
    write_txt_into_file(content, filepath)
    logger.info('The file is saved into %s' % filepath)
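A short usage sketch; the URL and content are illustrative, and the output directory follows from the logic above (domain path plus the current date):

# Hypothetical call; writes <uuid>.html under ./example.com/articles/<YYYY-MM-DD>/
html = u"<html><body>hello</body></html>"
saveHTML(html, url="http://example.com/articles/index.html")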