Example #1
# Imports inferred from the names used below; config_load, crawl_thread, url_table,
# webpage_parse and webpage_save are modules of the spider project itself.
import copy
import queue
import time
from argparse import ArgumentParser

import config_load
import crawl_thread
import url_table
import webpage_parse
import webpage_save


def main():
    """
    Crawler entry point.
    """
    p = ArgumentParser()
    p.add_argument('-v', action='version', version='1.0', help='version')
    p.add_argument('-c', default='spider.conf', help='config name')
    args = p.parse_args()

    conf = config_load.SpiderConfig()
    conf.load_conf(args.c)
    # Deduplicate the seed URLs from the config
    hosts = copy.deepcopy(conf.urls)
    hosts = list(set(hosts))
    u_table = url_table.UrlTable(hosts)
    web_save = webpage_save.WebSave(conf.output_directory)
    web_parse = webpage_parse.WebParse(conf.target_url)

    # Create the shared task queue
    url_queue = queue.Queue()
    # Spawn the worker thread pool
    for i in range(conf.thread_count):
        t = crawl_thread.CrawlClass(url_queue, u_table, conf, web_save,
                                    web_parse)
        # Daemon threads exit as soon as the main program exits
        t.daemon = True
        # Start the worker
        t.start()

    # Feed URLs into the queue, one breadth-first level per iteration
    cur_depth = 0
    depth = conf.max_depth
    while cur_depth <= depth:
        for host in hosts:
            url_queue.put(host)
            time.sleep(conf.crawl_interval)
        cur_depth += 1
        web_parse.cur_depth = cur_depth
        # Wait until the workers have drained this level
        url_queue.join()
        # Links discovered at this level become the next level's seeds
        hosts = copy.deepcopy(u_table.todo_list)
        u_table.todo_list = []
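
The example's settings all come from spider.conf through config_load.SpiderConfig, but the file itself is not shown on this page. Below is a hypothetical sketch, assuming a configparser-style INI file, of how such a config could look and be read; the key names simply mirror the attributes main() accesses (urls, thread_count, max_depth, crawl_interval, output_directory, target_url), and the real load_conf() may use a different format entirely.

# Hypothetical illustration only: spider.conf is not shown on this page, so this
# sketch assumes a standard INI file parsed with the stdlib configparser module.
import configparser

SAMPLE_CONF = """
[spider]
urls = http://www.example.com
thread_count = 8
max_depth = 2
crawl_interval = 1
output_directory = ./output
target_url = .*\\.(htm|html)$
"""

cp = configparser.ConfigParser()
cp.read_string(SAMPLE_CONF)
sec = cp['spider']
settings = {
    'urls': sec['urls'].split(),              # seed URL(s)
    'thread_count': sec.getint('thread_count'),
    'max_depth': sec.getint('max_depth'),
    'crawl_interval': sec.getfloat('crawl_interval'),
    'output_directory': sec['output_directory'],
    'target_url': sec['target_url'],          # pattern for pages of interest (assumed)
}
print(settings)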
Example #2
    def add_url(self, ans):
        """
        Add the URL to todo_list unless it duplicates one we already have.
        """
        # lock is shared by all worker threads; a blocking acquire() always
        # succeeds, so a context manager replaces the old acquire()/release()
        # branches and the unreachable "Lock error" path.
        with lock:
            if ans not in self.u_table.all_urls:
                self.u_table.all_urls[ans] = 0
                self.u_table.add_todo_list(ans)
            else:
                logging.debug("Duplicated url: %s", ans)


if __name__ == '__main__':
    conf = config_load.SpiderConfig()
    conf.load_conf()
    # Name the instance url_queue so it does not shadow the queue module
    url_queue = queue.Queue()
    u_table = url_table.UrlTable()

    th = CrawlClass(url_queue)
    th.u_table = u_table
    th.config = conf
    # Daemon thread exits with the main program
    th.daemon = True
    th.start()

    url_queue.put(conf.urls[0])
    url_queue.join()
    print(th.u_table.todo_list)
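
Both examples rely on crawl_thread.CrawlClass behaving as a threading.Thread subclass whose run() loop takes URLs off the shared queue, fetches and parses them, registers new links through add_url(), and calls task_done() on every item; without that last call, url_queue.join() above would block forever. The class itself is not reproduced on this page, so what follows is only a minimal, self-contained sketch of such a worker loop. CrawlWorker and LinkParser are illustration-only names, and urllib/html.parser stand in for whatever webpage_parse and webpage_save actually do.

# Minimal sketch of the worker loop CrawlClass.run() is assumed to implement:
# pull a URL from the shared queue, fetch it, extract links, hand new links on,
# and always call task_done() so url_queue.join() can return.
import queue
import threading
import urllib.request
from html.parser import HTMLParser


class LinkParser(HTMLParser):
    """Collect href attributes from <a> tags."""
    def __init__(self):
        super().__init__()
        self.links = []

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            self.links.extend(v for k, v in attrs if k == 'href' and v)


class CrawlWorker(threading.Thread):
    def __init__(self, url_queue):
        super().__init__(daemon=True)
        self.url_queue = url_queue
        # In the real spider, deduplication lives in UrlTable via add_url();
        # a local set is enough for this sketch.
        self.seen = set()

    def run(self):
        while True:
            url = self.url_queue.get()       # blocks until a URL is available
            try:
                if url in self.seen:
                    continue
                self.seen.add(url)
                with urllib.request.urlopen(url, timeout=10) as resp:
                    body = resp.read().decode('utf-8', errors='ignore')
                parser = LinkParser()
                parser.feed(body)
                for link in parser.links:
                    print(link)              # the real code would call self.add_url(link)
            except Exception as exc:         # never let one bad URL kill the worker
                print("fetch failed for %s: %s" % (url, exc))
            finally:
                self.url_queue.task_done()   # lets url_queue.join() unblock


if __name__ == '__main__':
    q = queue.Queue()
    CrawlWorker(q).start()
    q.put('https://example.com')
    q.join()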