def sendtask2Rabbit(cls, message, func=None, exchange_name='', **kwargs):
    if not isinstance(message, list):
        return
    for msg in message:
        rabbitmsg = ''
        if isinstance(msg, dict):
            # The incoming value is a dict that already carries meta
            # information; merge in any extra kwargs.
            msg.update(kwargs)
            body_dict = msg
        elif isinstance(msg, (str, unicode)):
            # No meta information: wrap the bare URL in a dict so the
            # consumer always receives a JSON object.
            body_dict = {'url': msg}
            if func:
                body_dict.update({"func": func})
            body_dict.update(kwargs)
        else:
            # Skip unsupported message types instead of publishing an
            # empty body.
            continue
        # The routing key must be present in the merged message dict; for
        # bare URL strings it can only arrive via kwargs.
        if "route_key" not in body_dict:
            raise exceptions.RoutingKEYNotExisted
        rabbitmsg = json.dumps(body_dict, ensure_ascii=False)
        try:
            TaskQueue.channel.basic_publish(
                exchange=exchange_name,
                routing_key=body_dict['route_key'],
                body=rabbitmsg,
                properties=pika.BasicProperties(
                    delivery_mode=constant.MESSAGE_PERSISTENCE,  # make message persistent
                ))
        except Exception as e:
            logger.error(e)
            sys.exit()
        print " [x] Sent %r" % (rabbitmsg, )
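# Usage sketch, assuming sendtask2Rabbit is bound to TaskQueue as a
# classmethod and TaskQueue.channel holds a live pika channel. The queue
# name and extra fields below are hypothetical.
#
#     tasks = [{'url': 'http://example.com/list', 'route_key': 'crawl_tasks'}]
#     TaskQueue.sendtask2Rabbit(tasks, func='parse_list', depth=1)
#
# The consumer would then receive the JSON body:
#     {"url": "...", "route_key": "crawl_tasks", "func": "parse_list", "depth": 1}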
def go_get_it(self, url_body):
    try:
        for middleware_method in self.url_preopenermiddleware:
            middleware_method(url_body)
        for middleware_method in self.url_openermiddleware:
            content = middleware_method(url_body)
            url_body[constant.RESPONSE_SIGNATURE] = content
            logger.debug(url_body[constant.RESPONSE_SIGNATURE])
        # The filter returns all links one level below the current URL,
        # either as a list or a dict. Extraction can be customized per site;
        # if _filter_middleware is None, a NotImplemented error is raised.
        # A filter backend must implement is_filter() and filter():
        # 1. is_filter decides whether next-level links should be parsed
        #    from the current page.
        # 2. filter parses the page and collects all next-level links.
        for middleware_method in self._filter_middleware:
            middleware_method(url_body)
        for middleware_method in self.url_parserermiddleware:
            middleware_method(url_body)
        for middleware_method in self._postfilter_middleware:
            middleware_method(url_body)
        for middleware_method in self.url_savemiddleware:
            middleware_method(url_body)
        # Push the raw links collected above (the next level to crawl)
        # onto the queue and into the staging area.
        if constant.RAW_LINKS in url_body:
            refined_links = fetch_util.refine_links(
                url_body[constant.RAW_LINKS])
            next_page(refined_links)
    except requests.exceptions.InvalidSchema as e:
        logger.error(e)
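# Flow sketch, assuming a crawler instance whose middleware lists were
# populated by load_middleware() below; 'crawler' is hypothetical. The
# caller only supplies the URL; the chain fills in the response (under
# constant.RESPONSE_SIGNATURE) and the next-level links (under
# constant.RAW_LINKS).
#
#     url_body = {'url': 'http://example.com/index.html'}
#     crawler.go_get_it(url_body)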
def simpleExtractorAsList(cls, regx, content, *args, **kwargs):
    uniques = []
    try:
        regx = re.compile(regx)
        templist = re.findall(regx, content)
        # NOTE: findall results are appended as-is; despite the name,
        # duplicates are not removed.
        uniques.extend(templist)
        return uniques
    except Exception as e:
        logger.error(" [x] Error occurred")
        logger.exception(e)
        return uniques
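# Usage sketch, assuming the method is exposed as a classmethod on a
# hypothetical Extractor class; the regex is a simplified illustration,
# not the project's own pattern.
#
#     links = Extractor.simpleExtractorAsList(
#         r'href="([^"]+)"', '<a href="/a">A</a><a href="/b">B</a>')
#     # links == ['/a', '/b']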
def import_by_path(dotted_path, error_prefix=''):
    """
    Import a dotted module path and return the attribute/class designated
    by the last name in the path. Raise ImproperlyConfigured if something
    goes wrong.
    """
    try:
        attr = import_string(dotted_path)
    except ImportError as e:
        msg = '%sError importing module %s: "%s"' % (
            error_prefix, dotted_path, e)
        logger.error(e)
        six.reraise(ImproperlyConfigured, ImproperlyConfigured(msg),
                    sys.exc_info()[2])
    return attr
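# Usage sketch; the dotted path and prefix are hypothetical. A bad path
# raises ImproperlyConfigured with the original traceback preserved.
#
#     PipelineClass = import_by_path('myproject.pipelines.HtmlPipeline',
#                                    error_prefix='Pipeline: ')
#     pipeline = PipelineClass()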
def load_middleware(self):
    for middleware_path in settings.MIDDLEWARE_CLASSES:
        logger.debug("middleware_path: {0}".format(middleware_path))
        try:
            mw_module, mw_classname = middleware_path.rsplit('.', 1)
        except ValueError:
            raise exceptions.ImproperlyConfigured(
                '%s isn\'t a middleware module' % middleware_path)
        try:
            mod = importlib.import_module(mw_module)
        except ImportError as e:
            raise exceptions.ImproperlyConfigured(
                'Error importing middleware %s: "%s"' % (mw_module, e))
        try:
            mw_class = getattr(mod, mw_classname)
        except AttributeError:
            raise exceptions.ImproperlyConfigured(
                'Middleware module "%s" does not define a "%s" class' %
                (mw_module, mw_classname))
        try:
            mw_instance = mw_class()
        except exceptions.MiddlewareNotUsed as e:
            logger.error(e)
            continue
        if hasattr(mw_instance, 'process_filter'):
            self._filter_middleware.append(mw_instance.process_filter)
        if hasattr(mw_instance, 'process_postfilter'):
            self._postfilter_middleware.append(
                mw_instance.process_postfilter)
        if hasattr(mw_instance, 'process_preopen'):
            self.url_preopenermiddleware.append(
                mw_instance.process_preopen)
        if hasattr(mw_instance, 'process_open'):
            self.url_openermiddleware.append(mw_instance.process_open)
        if hasattr(mw_instance, 'process_parser'):
            self.url_parserermiddleware.append(mw_instance.process_parser)
        if hasattr(mw_instance, 'process_save'):
            self.url_savemiddleware.append(mw_instance.process_save)
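# A minimal middleware sketch, assuming the hook discovery above: the loader
# only checks for process_* attribute names, so no base class is required.
# The class name, module path, and settings entry below are hypothetical.
#
#     class SimpleOpenerMiddleware(object):
#         def process_open(self, url_body):
#             # go_get_it() stores this return value on the url_body under
#             # constant.RESPONSE_SIGNATURE.
#             return requests.get(url_body['url']).text
#
# Registered via its dotted path in settings, e.g.:
#     MIDDLEWARE_CLASSES = ('myproject.middlewares.SimpleOpenerMiddleware',)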
def saveHTML(content, **kwargs):
    '''
    Save HTML content. The target directory is derived from the URL
    according to the patterns provided by settings.py.
    '''
    url = kwargs['url']
    file_ext = ".html"
    # Split once on '/' from the right; the leading part becomes the
    # directory path, built level by level.
    dirstr = url.rsplit('/', 1)[0]
    # Strip the http:// prefix.
    pattern = re.compile(r'http://')
    dirstr = re.sub(pattern, '', dirstr)
    # Strip the https:// prefix.
    pattern = re.compile(r'https://')
    dirstr = re.sub(pattern, '', dirstr)
    # Strip newline characters.
    pattern = re.compile(r'\n')
    dirstr = re.sub(pattern, '', dirstr)
    # Append the current date so files are grouped by crawl day.
    current_date = datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d')
    top_dir = os.path.join(os.getcwd(), dirstr)
    top_dir = os.path.join(top_dir, current_date)
    try:
        if not os.path.exists(top_dir):
            os.makedirs(top_dir)
    except OSError as e:
        if e.errno == os.errno.EEXIST:
            # The directory appeared between the exists() check and
            # makedirs(); safe to continue writing.
            message = "'%s' already exists" % top_dir
            logger.error(message)
        else:
            message = e
            logger.info(message)
            return -1
    filepath = os.path.join(top_dir, str(uuid.uuid4()) + file_ext)
    write_txt_into_file(content, filepath)
    logger.info('The file is saved into %s' % filepath)
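# Usage sketch: with the hypothetical URL below, the page is written to
# ./example.com/<YYYY-MM-DD>/<uuid>.html under the current working directory.
#
#     saveHTML('<html><body>hello</body></html>',
#              url='http://example.com/index.html')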