Example #2
def test_start_urls():
    skips = [
        "www.chinabidding.com",
        "cz.fjzfcg.gov.cn",
    ]
    fixes = [
        'cqjbjyzx',
        'fjjyzx'
    ]

    urls = skip_urls(page_start.start_urls, skips)
    # urls = fix_urls(urls, fixes)
    urls = filter_with(urls, {ORIGIN_REGION: u"重庆"})
    # urls = filter_without(urls, {ORIGIN_REGION: u"重庆"})

    queue_name = "test_start_urls"
    success_list, fail_list = [], []
    for i, item in enumerate(urls):
        print "%d:\t" % i, "test start url: ", show_start_url(item)
        clear_queue(queue_name)
        publish_message(queue_name, item)
        bid_crawler = BidCrawler(queue_name, skip_parse_failure=True)
        results = []
        for k in range(10):
            res, info = bid_crawler.crawl()
            if res == "NO_MESSAGE":
                break
            if res != "SUCCESS":
                results.append("%s: %s" % (k, res))
        if len(results) != 0:
            print_red("crawl start url failed: ")
            print_green(results)
            fail_list.append(item)
        else:
            success_list.append(item)

    print "\nTotal success urls are %s: " % len(success_list)
    for i, item in enumerate(success_list):
        print "%3d: %s" % (i, show_start_url(item))

    print_red("\nTotal fail urls are %s: " % len(fail_list))
    for i, item in enumerate(fail_list):
        print_red("%3d: %s" % (i, show_start_url(item)))

    import util
    hosts = set()
    for item in success_list:
        host = util.get_host_address(item.get("url"))
        hosts.add(host)
    hosts = sorted(hosts)
    print "\nAvailable Hosts:"
    for host in hosts:
        print '"%s",' % host
Example #3
def test_crawler():
    info = [{
        ORIGIN_REGION: u"重庆>>巴南区",
        URL: "http://jy.bnzw.gov.cn/LBv3/n_newslist_zz.aspx?Item=100026",
        ANNOUNCE_TYPE: u"中标公示",
        PROJECT_TYPE: u"工程建设",
        WEBSITE: u"重庆市巴南区行政服务和公共资源交易中心",
        NOTE: u"重庆市重庆綦江公共资源综合交易网巴南区行政服务和公共资源交易中心-中标公示"
    }]

    queue.publish_message("test", info)
    bid_crawler = crawler.BidCrawler("test")
    bid_crawler.run()
Example #4
    def run(self):
        is_monitor = (self.policy.CRAWLER_TYPE == 2)  # whether this is a scheduled monitoring run
        schedule_seconds = self.policy.SCHEDULE_TIME  # scheduled crawl times (seconds of the day)
        for idx, sec in enumerate(schedule_seconds):
            schedule_seconds[idx] += random.randint(-1700, 1700)
        schedule_seconds = sorted(schedule_seconds)
        schedule_seconds.append(24 * 3600 + schedule_seconds[0])

        while True:
            thread_list = []
            queue.publish_message(self.queue_name, self.msg)

            crawler_number = self.policy.CRAWLER_NUMBER
            for m in self.msg:
                max_number = m.get(MAX_CRAWLER_NUMBER)
                if max_number and max_number < crawler_number:
                    crawler_number = max_number

            try:
                for i in range(crawler_number):
                    crawler = BidCrawler(
                        msg_queue=self.queue_name,
                        is_monitor=is_monitor,
                        check_published_ts=self.policy.APPLY_TIME_INTERVAL,
                        start_ts=self.policy.TIME_INTERVAL_ST,
                        end_ts=self.policy.TIME_INTERVAL_ED,
                        skip_parse_failure=True)
                    thread = Thread(target=crawler.run)
                    thread.start()
                    thread_list.append(thread)

                for thread in thread_list:
                    thread.join()

                if is_monitor:
                    now = datetime.datetime.now()
                    seconds_of_day = (now.hour * 3600 + now.minute * 60 +
                                      now.second)
                    for second in schedule_seconds:
                        if second > seconds_of_day:
                            sleep_seconds = second - seconds_of_day
                            logger.info(
                                "The crawler will be started after %s seconds."
                                % sleep_seconds)
                            time.sleep(sleep_seconds)
                            break
                else:
                    break
            except KeyboardInterrupt as e:
                print e.message
                break
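
The scheduling step in Example #4 jitters each configured time of day by up to ±1700 seconds, sorts the result, and appends 24 * 3600 + schedule_seconds[0] so the search for the next slot wraps into the following day. Below is a minimal standalone sketch of that computation; the helper name next_sleep_seconds and the sample schedule are illustrative only, not part of the project:

import random

def next_sleep_seconds(schedule_seconds, seconds_of_day, jitter=1700):
    # Jitter each scheduled time of day by up to +/- `jitter` seconds.
    jittered = sorted(s + random.randint(-jitter, jitter) for s in schedule_seconds)
    # Append the first slot of the next day so the search wraps past midnight.
    jittered.append(24 * 3600 + jittered[0])
    # Sleep until the first slot that is still ahead of "now".
    for second in jittered:
        if second > seconds_of_day:
            return second - seconds_of_day
    return 0  # no future slot (only possible with extreme jitter); run again immediately

# e.g. runs scheduled at 08:00 and 20:00, current time 12:34:56
print(next_sleep_seconds([8 * 3600, 20 * 3600], 12 * 3600 + 34 * 60 + 56))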
Example #5
def test_crawler():
    info = [{
        URL: "http://www.hljcg.gov.cn/xwzs!index.action"
    }, {
        ORIGIN_REGION: u"黑龙江",
        URL:
        "http://www.hljcg.gov.cn/xwzs!queryXwxxqx.action?lbbh=5&xwzsPage.pageNo=1",
        ANNOUNCE_TYPE: u"成交公告",
        NOTE: u"黑龙江省政府采购网-成交公告"
    }]

    queue.publish_message("test", info)

    bid_crawler = crawler.BidCrawler("test")
    bid_crawler.run()
Example #6
class BidCrawler(BaseCrawler):
    sleep_time = 1  # sleep time (seconds)

    def __init__(self, msg_queue,
                 is_monitor=False,  # whether this is a scheduled monitoring run
                 check_published_ts=False,  # whether to check the page's publish time
                 start_ts="2000-01-01",  # earliest publish time of pages to crawl
                 end_ts="",  # latest publish time of pages to crawl
                 skip_parse_failure=False,  # whether to skip parse failures
                 flip_page_type=0):  # paging mode (0: follow next-page links one by one, mainly for monitoring; 1: generate all pages at once, mainly for full crawls)
        super(BidCrawler, self).__init__()
        self.msg_queue = msg_queue
        self.is_monitor = is_monitor
        self.check_published_ts = check_published_ts
        self.start_ts = start_ts
        self.end_ts = end_ts
        self.skip_parse_failure = skip_parse_failure
        self.flip_page_type = flip_page_type

    def crawl(self):
        # Fetch one message from the queue
        pre_information = get_message(self.msg_queue)
        if pre_information is None:
            return "NO_MESSAGE", {}

        # Get the URL of the page to crawl
        if DATA_URL in pre_information:
            url = pre_information[DATA_URL]
        else:
            url = pre_information[URL]

        # Set how the crawler pages through listing pages
        if self.flip_page_type == 1 and not pre_information.get(GENERATE_ALL_PAGE):
            pre_information[GENERATE_ALL_PAGE] = True

        # Find the parser that matches the URL
        matched_parser = get_matched_parser(url)
        if matched_parser is None:
            return "NO_PARSER", pre_information
        re_url, func = matched_parser

        # Sleep if the message asks for it
        sleep_second = pre_information.get(SLEEP_SECOND, 0)
        sleep_second += pre_information.get(SLEEP_FOR_CONTENT, 0)
        if sleep_second > 0:
            time.sleep(sleep_second)

        # Fetch the page content
        method = pre_information.get(METHOD, "GET").upper()
        if method == "POST":
            params = pre_information.get(PARAMS, {})
            html = self.post(url, data=params, timeout=40)
        else:
            html = self.get(url, timeout=60)
        if html is None:
            if NO_CONTENT_TIMES not in pre_information:
                pre_information[NO_CONTENT_TIMES] = 0
                pre_information[SLEEP_FOR_CONTENT] = 0
            if pre_information[NO_CONTENT_TIMES] < 3:
                pre_information[NO_CONTENT_TIMES] += 1
                pre_information[SLEEP_FOR_CONTENT] += 4
                publish_message(self.msg_queue, pre_information)
            return "NO_CONTENT", pre_information
        else:
            pre_information[NO_CONTENT_TIMES] = 0
            pre_information[SLEEP_FOR_CONTENT] = 0

        # Parse the page to get follow-up links and successfully parsed announcements
        try:
            links, contents = func(html, pre_information)
        except KeyboardInterrupt as e:
            raise e
        except Exception as exception:
            import traceback
            traceback.print_exc()
            logger.error("Parse html failed. %s\n%s" % (exception.message,
                                                        json.dumps(pre_information).decode("unicode-escape")))
            return "PARSER_ERROR", pre_information

        # Process the parsed results
        if self.is_monitor:
            for cont in contents:
                URL_POOL.add_url(cont[UNI_ORIGIN_ID], cont[URL])
            links = _check_links(links)

        if self.check_published_ts:
            links = _check_published_ts(links, self.start_ts, self.end_ts)
        links = _filter_links(links)

        _store_contents(contents)
        publish_message(self.msg_queue, links)
        return "SUCCESS", pre_information