Example 1
    def init_url_table(self):
        """Initialize the url_table.
        Args:
        Returns: the initialized UrlTable object
        """
        test_url_table = url_table.UrlTable(self.logger)
        return test_url_table
Example 2
    def setUp(self):
        """Prepare the test fixtures.
        Args:
        Returns:
        """
        spider_log = log.Log()
        self.logger = spider_log.get_log('log', 'test.log', 'ERROR')
        self.test_url_table = url_table.UrlTable(self.logger)
        self.url_node = {}
        self.url_node['url'] = 'www.baidu.com'
        self.url_node_list = []
        self.url_node_list.append(self.url_node)
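A note on Example 2: the fixture above pairs naturally with a test method. The sketch below is a minimal example, assuming UrlTable exposes add_url_node_list as it does in Example 6; since the table's internal state is not shown in these snippets, the sketch only demonstrates the call and leaves any assertion to the real API.

    def test_add_url_node_list(self):
        """Feed the node list prepared in setUp() into the table.

        Hypothetical sketch: add_url_node_list is taken from Example 6; an
        assertion on the table's contents would depend on the real UrlTable
        API, which is not shown in these snippets.
        """
        self.test_url_table.add_url_node_list(self.url_node_list)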
Example 3
def main():
    """
    func entrance
    """
    # 0. Parse command-line arguments
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("-v", "--version", \
            help="print current version of spider program!", action="store_true")
    arg_parser.add_argument("-c", "--spider_conf", help="add the spider conf!")
    args = arg_parser.parse_args()

    SPIDER_CONF = args.spider_conf

    if args.version:
        print "version is 1.0"

    if SPIDER_CONF is None:
        print "please input -h to see help!"
        return

    # 1. Read the configuration
    conf_parser = config_load.SpiderConfigure(SPIDER_CONF)
    thread_num = int(conf_parser.get_info("spider", "thread_count"))
    max_depth = int(conf_parser.get_info("spider", "max_depth"))
    crawl_interval = int(conf_parser.get_info("spider", "crawl_interval"))
    crawl_timeout = int(conf_parser.get_info("spider", "crawl_timeout"))
    url_seed_path = conf_parser.get_info("spider", "url_list_file")
    seed_list = get_url_list(url_seed_path)

    # 2. Web page parser and URL table
    web_parser = webpage_parse.WebPageParser(crawl_timeout)
    url_table_ins = url_table.UrlTable(seed_list, max_depth)
    threads = []

    # 3. Create crawl threads
    save_path = './download_page'
    for i in range(thread_num):
        name = "thread_" + str(i)
        thread = crawl_thread.CrawlThread(max_depth, crawl_interval,
                web_parser, url_table_ins, name, save_path)
        thread.setDaemon(True)
        thread.start()
        threads.append(thread)

    url_table_ins.spider_queue.join()

    crawl_log.ERROR_LOG("crawl main thread finished!")
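Example 3 reads its seed URLs through a get_url_list helper that is not included in the snippet. A minimal sketch of such a helper, assuming the seed file lists one URL per line, could be (the real implementation is not shown here):

def get_url_list(url_seed_path):
    """Hypothetical helper: read seed URLs from a file, one URL per line.

    Example 6 suggests the repository's helper returns -1 on failure; this
    sketch returns an empty list instead so the caller can iterate safely.
    """
    seed_list = []
    try:
        with open(url_seed_path) as seed_file:
            for line in seed_file:
                url = line.strip()
                if url:                      # skip blank lines
                    seed_list.append(url)
    except IOError:
        return seed_list                     # empty list when the file is unreadable
    return seed_list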
Example 4
def main():
    """
    爬虫入口
    """
    p = ArgumentParser()
    p.add_argument('-v', action='version', version='1.0', help='version')
    p.add_argument('-c', default='spider.conf', help='config name')
    args = p.parse_args()

    conf = config_load.SpiderConfig()
    conf.load_conf(args.c)
    hosts = copy.deepcopy(conf.urls)
    hosts = list(set(hosts))
    u_table = url_table.UrlTable(hosts)
    web_save = webpage_save.WebSave(conf.output_directory)
    web_parse = webpage_parse.WebParse(conf.target_url)

    # create a queue instance
    url_queue = queue.Queue()
    # create a thread pool
    for i in range(conf.thread_count):
        t = crawl_thread.CrawlClass(url_queue, u_table, conf, web_save,
                                    web_parse)
        # child threads exit as soon as the main thread exits
        t.setDaemon(True)
        # 启动线程
        t.start()

    # fill the queue with URLs
    cur_depth = 0
    depth = conf.max_depth
    while cur_depth <= depth:
        for host in hosts:
            url_queue.put(host)
            time.sleep(conf.crawl_interval)
        cur_depth += 1
        web_parse.cur_depth = cur_depth
        url_queue.join()
        hosts = copy.deepcopy(u_table.todo_list)
        u_table.todo_list = []
Example 5
def main():
    """
    Entry
    """
    p = ArgumentParser()
    p.add_argument('-v', action='version', version='1.0', help='version')
    p.add_argument('-c', default='spider.conf', help='config name')
    args = p.parse_args()

    conf = config_load.SpiderConfig()
    conf.load_conf(args.c)
    hosts = copy.deepcopy(conf.urls)
    hosts = list(set(hosts))
    u_table = url_table.UrlTable(hosts)
    web_save = webpage_save.WebSave(conf.output_directory)
    web_parse = webpage_parse.WebParse(conf.target_url)

    # initiate a queue
    url_queue = queue.Queue()
    # create a thread pool
    for i in range(conf.thread_count):
        t = crawl_thread.CrawlClass(url_queue, u_table, conf, web_save,
                                    web_parse)
        # quit the child thread if the main thread is dead
        t.setDaemon(True)
        # start the thread
        t.start()

    # add to queue
    cur_depth = 0
    depth = conf.max_depth
    while cur_depth <= depth:
        for host in hosts:
            url_queue.put(host)
            time.sleep(conf.crawl_interval)
        cur_depth += 1
        web_parse.cur_depth = cur_depth
        url_queue.join()
        hosts = copy.deepcopy(u_table.todo_list)
        u_table.todo_list = []
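A note on Examples 4 and 5: url_queue.join() only returns once every URL put on the queue has been matched by a task_done() call inside a worker. The real CrawlClass is not shown in these snippets; the sketch below is a minimal stand-in that illustrates this contract, and the parse/save method names on web_parse and web_save are assumptions.

import logging
import threading


class CrawlWorker(threading.Thread):
    """Minimal worker sketch; CrawlClass from Examples 4 and 5 is not shown,
    so every parse/save call below is an assumed method name."""

    def __init__(self, url_queue, u_table, conf, web_save, web_parse):
        super().__init__()
        self.url_queue = url_queue
        self.u_table = u_table
        self.conf = conf
        self.web_save = web_save
        self.web_parse = web_parse

    def run(self):
        while True:
            url = self.url_queue.get()       # blocks until a URL is queued
            try:
                html = self.web_parse.parse(url)   # assumed parse API
                self.web_save.save(url, html)      # assumed save API
            except Exception as err:
                logging.debug("crawl failed for %s: %s", url, err)
            finally:
                self.url_queue.task_done()   # lets url_queue.join() return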
Example 6
    def init_url_queue(self):
        """Initialize the url_queue.
        Args:
        Returns:    url_queue
        """
        # fetch the root URLs
        root_url_list = self.get_url_list(self.url_list_file)
        if root_url_list == -1:
            self.logger.warning("get root url fail")
            return -1
        # build the parent (father) nodes
        father_node_list = []
        for url in root_url_list:
            url_node = {}
            url_node['url'] = url
            url_node['level'] = 0
            url_node['father'] = url
            father_node_list.append(url_node)
        # initialize the url_queue
        url_queue = url_table.UrlTable(self.logger)
        url_queue.add_url_node_list(father_node_list)
        return url_queue
Example 7
    def add_url(self, ans):
        """
        如果地址不与已有的重复, 则添加到todo_list
        """
        if lock.acquire():
            if ans not in self.u_table.all_urls:
                self.u_table.all_urls[ans] = 0
                self.u_table.add_todo_list(ans)
            else:
                logging.debug("Duplicated url: %s" % ans)
            lock.release()
        else:
            logging.debug("Lock error")


if __name__ == '__main__':
    conf = config_load.SpiderConfig()
    conf.load_conf()
    url_queue = queue.Queue()
    u_table = url_table.UrlTable()

    th = CrawlClass(url_queue)
    th.u_table = u_table
    th.config = conf
    th.setDaemon(True)
    th.start()

    url_queue.put(conf.urls[0])
    url_queue.join()
    print(th.u_table.todo_list)
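A side note on Example 7: lock.acquire() called with no arguments blocks until the lock is free and then returns True, so the else branch that logs "Lock error" is effectively unreachable. The same critical section can be written with a with block, which also guarantees the lock is released if add_todo_list raises. A sketch, assuming lock is a module-level threading.Lock as the snippet implies:

import logging
import threading

lock = threading.Lock()


class CrawlClass(threading.Thread):
    # constructor and remaining methods as in Example 7 ...

    def add_url(self, ans):
        """
        Add the address to todo_list if it is not already known.
        """
        with lock:                           # released automatically, even on error
            if ans not in self.u_table.all_urls:
                self.u_table.all_urls[ans] = 0
                self.u_table.add_todo_list(ans)
            else:
                logging.debug("Duplicated url: %s", ans)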
Example 8
    def __init__(self):
        """Wire together the crawler components."""
        self.url_table = url_table.UrlTable()
        self.crawl = crawl.Crawl()
        self.webpage_parse = webpage_parse.WebPageParse()
        self.webpage_save = webpage_save.WebPageSave()
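Example 8 only shows the constructor, so how the four components cooperate is left implicit. A hypothetical driver method is sketched below; every call on url_table, crawl, webpage_parse, and webpage_save is an assumed name, not the repository's documented API.

    def run(self, seed_url):
        """Hypothetical driver loop; all component method names are assumptions."""
        self.url_table.add_url(seed_url)
        while not self.url_table.empty():
            url = self.url_table.pop()               # next URL to crawl (assumed API)
            html = self.crawl.fetch(url)             # download the page (assumed API)
            links = self.webpage_parse.parse(html)   # extract outgoing links (assumed API)
            self.webpage_save.save(url, html)        # persist the page (assumed API)
            for link in links:
                self.url_table.add_url(link)         # de-duplication assumed in UrlTable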