def process_filter(self, url_body):
    """Collect the raw links waiting to be crawled from the fetched page and store them on url_body."""
    content = url_body.get(constant.RESPONSE_SIGNATURE)
    logger.debug("parse url content: {0}".format(content))
    logger.info("process url: {0}, filter raw links".format(
        url_body["url"]))
    raw_links = content_parser.parser(content, url_body)
    url_body[constant.RAW_LINKS] = raw_links
def refine_links(rawlinklist):
    # Remove duplicate links
    links = list(set(rawlinklist))
    # Drop URLs that have already been visited
    links = list(filter(check_visited, links))
    # Tag each URL with queue and domain information
    links = tag_info_to_links(settings.RABBITMQ_QUEUE, settings.DOMAIN, links)
    logger.info("refined links: {0}".format(links))
    return links
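For reference, here is a minimal sketch of what the two helpers used above might look like. `check_visited` and `tag_info_to_links` belong to this project, so the bodies below (an in-memory visited set and a per-link metadata dict) are assumptions, not the real implementation:

# Hypothetical sketch only -- the real helpers live elsewhere in the project.
_visited = set()  # assumption: a record of URLs that were already fetched

def check_visited(url):
    # Return True for URLs we have not seen yet, so filter() keeps them.
    return url not in _visited

def tag_info_to_links(queue, domain, links):
    # Attach routing metadata to every link before it is re-queued.
    return [{"url": link, "queue": queue, "domain": domain} for link in links]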
def callback(self, ch, method, properties, body):
    """Handle one message from the queue: unpack the JSON payload and hand it to _process."""
    logger.info(" [x] Received %r" % (body, ))
    urlpack = body
    if not urlpack:
        # Nothing to do for an empty message; acknowledge it and move on.
        ch.basic_ack(delivery_tag=method.delivery_tag)
        return
    packed = json.loads(urlpack)
    self._process(packed)
    ch.basic_ack(delivery_tag=method.delivery_tag)
    logger.info(" [x] Done")
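The callback expects the message body to be a JSON-encoded url_body dict. A producer-side snippet along these lines would put compatible work onto the queue; the host and the sample url_body are assumptions for illustration:

import json
import pika

# Assumption: RabbitMQ runs locally; the queue name comes from settings as in consumer().
connection = pika.BlockingConnection(pika.ConnectionParameters(host="localhost"))
channel = connection.channel()
channel.queue_declare(queue=settings.RABBITMQ_QUEUE)

url_body = {"url": "http://example.com/page"}
channel.basic_publish(exchange="",
                      routing_key=settings.RABBITMQ_QUEUE,
                      body=json.dumps(url_body))
connection.close()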
def consumer(cls, _process):
    try:
        channel = connection.channel()
        logger.info(' [*] Waiting for messages. To exit press CTRL+C')
        # Hand at most one unacknowledged message to this worker at a time.
        channel.basic_qos(prefetch_count=1)
        callback = StartFuture(_process).callback
        channel.basic_consume(callback, queue=settings.RABBITMQ_QUEUE)
    except Exception:
        # Log the failure instead of swallowing it, and exit with a non-zero status.
        logger.exception("failed to set up the RabbitMQ consumer")
        sys.exit(1)
    channel.start_consuming()
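consumer() relies on a module-level `connection` object. A minimal way to provide it with pika (the host is an assumption; the project presumably reads it from settings) is:

import pika

# Assumption: RabbitMQ is reachable on localhost.
connection = pika.BlockingConnection(
    pika.ConnectionParameters(host="localhost"))

Note that `basic_consume(callback, queue=...)` is the pika < 1.0 call order; pika 1.0+ expects `basic_consume(queue=..., on_message_callback=...)`.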
def saveHTML(content, **kwargs):
    '''Save the HTML content to disk, under a directory derived from the page URL.'''
    url = kwargs['url']
    file_ext = ".html"
    # Split on the rightmost '/' and use the leading part to build the directory path.
    dirstr = url.rsplit('/', 1)[0]
    # Strip the http:// prefix
    pattern = re.compile(r'http://')
    dirstr = re.sub(pattern, '', dirstr)
    # Strip the https:// prefix
    pattern = re.compile(r'https://')
    dirstr = re.sub(pattern, '', dirstr)
    # Strip newlines
    pattern = re.compile(r'\n')
    dirstr = re.sub(pattern, '', dirstr)
    # Build the target directory under the current working directory, grouped by date.
    current_date = datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d')
    top_dir = os.path.join(os.getcwd(), dirstr)
    top_dir = os.path.join(top_dir, current_date)
    try:
        if not os.path.exists(top_dir):
            os.makedirs(top_dir)
    except OSError as e:
        if e.errno == errno.EEXIST:  # needs `import errno`; `os.errno` was removed in newer Python 3 releases
            message = "'%s' already exists" % top_dir
            logger.error(message)
        else:
            message = e
            logger.info(message)
        return -1
    filepath = os.path.join(top_dir, str(uuid.uuid4()) + file_ext)
    write_txt_into_file(content, filepath)
    logger.info('The file is saved into %s' % filepath)
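`write_txt_into_file` is another project helper; a minimal sketch of what it presumably does (a plain UTF-8 write, which is an assumption) is:

import io

def write_txt_into_file(content, filepath):
    # Hypothetical sketch: persist the fetched page to disk as UTF-8 text.
    with io.open(filepath, 'w', encoding='utf-8') as f:
        f.write(content)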
def process_parser(self, url_body):
    """Extract the structured data from the current page and store it on url_body."""
    logger.info("process url: {0}, parse content".format(url_body["url"]))
    content = url_body.get(constant.RESPONSE_SIGNATURE)
    data = content_parser.parser_content(content, url_body=url_body)
    url_body[constant.REFINED_DATA] = data
def getlinks(self, content):
    # Pull the article links out of every "b-slb-item" block on the list page.
    regx = r'<div class="b-slb-item">.*?<h3>.*?<a href="(.*?)">.*?</div>'
    links = Extractor.simpleExtractorAsList(regx, content)
    logger.info("get content links: {0}".format(links))
    return links
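Extractor.simpleExtractorAsList is a project helper; judging by how it is called, it is presumably a thin wrapper around re.findall with the DOTALL flag so that `.*?` can span line breaks. A sketch under that assumption:

import re

class Extractor(object):
    @staticmethod
    def simpleExtractorAsList(regx, content):
        # Hypothetical sketch: return every capture-group match as a list.
        return re.findall(regx, content, re.S)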
def process_save(self, url_body, **kwargs):
    """Call the page-save backend to store the fetched page on the storage device."""
    logger.info("process url : {0} storage".format(url_body.get("url")))
    PageSave().save(url_body[constant.RESPONSE_SIGNATURE], **url_body)
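PageSave is the storage backend used above; its save() receives the raw HTML plus the url_body fields as keyword arguments. A minimal sketch consistent with that call (the dispatch to saveHTML is an assumption, not the project's actual backend) is:

class PageSave(object):
    def save(self, content, **kwargs):
        # Hypothetical sketch: delegate to saveHTML, which expects kwargs['url'].
        return saveHTML(content, **kwargs)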