Example #1
 def process_filter(self, url_body):
     """返回待抓取链接的url列表"""
     content = url_body.get(constant.RESPONSE_SIGNATURE)
     logger.debug("parser url content: {0}".format(content))
     logger.info("process url: {0}, filter raw links".format(
         url_body["url"]))
     raw_links = content_parser.parser(content, url_body)
     url_body[constant.RAW_LINKS] = raw_links
Example #2
def refine_links(rawlinklist):
    # Remove duplicates
    links = list(set(rawlinklist))
    # Skip URLs that have already been visited
    links = list(filter(check_visited, links))  # list() keeps the result a list on Python 3
    # Tag each URL with queue and domain info
    links = tag_info_to_links(settings.RABBITMQ_QUEUE, settings.DOMAIN, links)
    logger.info("refined links: {0}".format(links))
    return links
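
refine_links relies on the project helpers check_visited and tag_info_to_links. A minimal, self-contained sketch of the same dedupe-and-filter idea (the visited set and URLs below are hypothetical):

visited = {"http://example.com/seen"}   # hypothetical set of already-crawled URLs

def check_visited(url):
    # Keep only URLs that have not been crawled yet
    return url not in visited

def refine_links_sketch(raw_links):
    links = list(set(raw_links))                 # remove duplicates
    links = list(filter(check_visited, links))   # drop visited URLs
    return links

print(refine_links_sketch([
    "http://example.com/a",
    "http://example.com/a",
    "http://example.com/seen",
]))  # -> ['http://example.com/a']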
Example #3
 def callback(self, ch, method, properties, body):
     logger.info(" [x] Received %r" % (body, ))
     urlpack = body
     if not urlpack:
         ch.basic_ack(delivery_tag=method.delivery_tag)
         return
     packed = json.loads(urlpack)
     self._process(packed)
     ch.basic_ack(delivery_tag=method.delivery_tag)
     logger.info(" [x] Done")
Example #4
    def consumer(cls, _process):
        try:
            channel = connection.channel()
            logger.info(' [*] Waiting for messages. To exit press CTRL+C')
        channel.basic_qos(prefetch_count=1)  # fair dispatch: at most one unacked message per worker
        callback = StartFuture(_process).callback
        channel.basic_consume(callback, queue=settings.RABBITMQ_QUEUE)
        except Exception:
            sys.exit(0)

        channel.start_consuming()
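
Example #3 and Example #4 together form the RabbitMQ worker loop; connection, StartFuture and settings come from elsewhere in the project. A minimal, self-contained sketch of the same consume/ack pattern using pika (1.x API) with a hypothetical queue name:

import json

import pika

QUEUE = "crawler_queue"   # hypothetical queue name

def callback(ch, method, properties, body):
    # Process one message, then acknowledge it so RabbitMQ can discard it
    if body:
        print("received:", json.loads(body))
    ch.basic_ack(delivery_tag=method.delivery_tag)

connection = pika.BlockingConnection(pika.ConnectionParameters("localhost"))
channel = connection.channel()
channel.queue_declare(queue=QUEUE, durable=True)
channel.basic_qos(prefetch_count=1)               # hand out one message at a time
channel.basic_consume(queue=QUEUE, on_message_callback=callback)
channel.start_consuming()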
Example #5
def saveHTML(content, **kwargs):
    '''
    Save the HTML. The page is stored according to the regexes provided in settings.py.
    '''
    url = kwargs['url']
    file_ext = ".html"
    # Split at the rightmost '/'; the prefix is used to build the directory path
    dirstr = url.rsplit('/', 1)[0]
    # Strip the http:// scheme
    pattern = re.compile(r'http://')
    dirstr = re.sub(pattern, '', dirstr)
    # Strip the https:// scheme
    pattern = re.compile(r'https://')
    dirstr = re.sub(pattern, '', dirstr)
    # Strip newline characters
    pattern = re.compile(r'\n')
    dirstr = re.sub(pattern, '', dirstr)
    # Build the target directory: <cwd>/<dirstr>/<current date>
    current_date = datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d')
    top_dir = os.path.join(os.getcwd(), dirstr)
    top_dir = os.path.join(top_dir, current_date)
    try:
        if not os.path.exists(top_dir):
            os.makedirs(top_dir)
    except OSError as e:
        if e.errno == errno.EEXIST:  # requires `import errno`; os.errno is unavailable on Python 3.6+
            message = "'%s' already exists" % top_dir
            logger.error(message)
        else:
            message = e
            logger.info(message)
        return -1

    filepath = os.path.join(top_dir, str(uuid.uuid4()) + file_ext)
    write_txt_into_file(content, filepath)
    logger.info('The file is saved into %s' % filepath)
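
To make the directory logic concrete, a small sketch of the path saveHTML would derive for a sample (hypothetical) URL; the file name itself is a random UUID:

import os
import time
from datetime import datetime

url = "https://example.com/news/article-1"    # hypothetical input URL
dirstr = url.rsplit('/', 1)[0]                # "https://example.com/news"
dirstr = dirstr.replace("https://", "").replace("http://", "")   # "example.com/news"
current_date = datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d')
print(os.path.join(os.getcwd(), dirstr, current_date))
# e.g. /home/user/example.com/news/2024-05-01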
Example #6
 def process_parser(self, url_body):
     """返回从当前页面提取到的数据"""
     logger.info("process url: {0}, parse content".format(url_body["url"]))
     content = url_body.get(constant.RESPONSE_SIGNATURE)
     data = content_parser.parser_content(content, url_body=url_body)
     url_body[constant.REFINED_DATA] = data
Example #7
 def getlinks(self, content):
     regx = r'<div class="b-slb-item">.*?<h3>.*?<a href="(.*?)">.*?</div>'
     links = Extractor.simpleExtractorAsList(regx, content)
     logger.info("get content links: {0}".format(links))
     return links
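
Extractor.simpleExtractorAsList is a project helper; a plausible stand-in (an assumption, not the project's actual implementation) is re.findall with the DOTALL flag so '.' also matches newlines inside the HTML:

import re

def simple_extractor_as_list(regx, content):
    # Return every capture-group match; re.S lets '.' span newlines
    return re.findall(regx, content, re.S)

html = '<div class="b-slb-item"><h3><a href="/post/1">title</a></h3></div>'
print(simple_extractor_as_list(
    r'<div class="b-slb-item">.*?<h3>.*?<a href="(.*?)">.*?</div>', html))
# -> ['/post/1']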
Example #8
 def process_save(self, url_body, **kwargs):
     """调用页面保存后端存储网页到存储设备上"""
     logger.info("process url : {0} storage".format(url_body.get("url")))
     PageSave().save(url_body[constant.RESPONSE_SIGNATURE], **url_body)