import configparser
from datetime import datetime


def crawling():
    print('-----start crawling time: %s-----' % (datetime.today()))
    config = configparser.ConfigParser()
    config.read('../config.ini', 'utf-8')
    news_pool = get_news_pool()
    crawl_news(news_pool, 140, config['DEFAULT']['doc_dir_path'], config['DEFAULT']['doc_encoding'])
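# crawling() above reads doc_dir_path and doc_encoding from the [DEFAULT]
# section of config.ini. A minimal sketch of that file, assuming only the
# two keys used here (the values are illustrative, not from the source):
#
#     [DEFAULT]
#     doc_dir_path = ../data/news/
#     doc_encoding = utf-8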
def crawling():
    print('-----start crawling time: %s-----' % (datetime.today()))
    config = configparser.ConfigParser()
    config.read('../config.ini', 'utf-8')
    # Variant targeting a Sohu news subject listing; crawls from the five
    # most recent listing pages.
    root = 'http://news.sohu.com/1/0903/61/subject212846158'
    max_page = get_max_page(root + '.shtml')
    news_pool = get_news_pool(root, max_page, max_page - 5)
    crawl_news(news_pool, 140, config['DEFAULT']['doc_dir_path'], config['DEFAULT']['doc_encoding'])
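# get_max_page() is defined elsewhere in the repository. A minimal sketch of
# what it could look like, assuming the listing page links its numbered
# sub-pages as '<stem>_<n>.<ext>' and the largest <n> is the page count;
# the URL scheme here is an illustrative assumption, not the repo's actual
# parsing logic.
import re
import requests

def get_max_page(url):
    # Fetch the listing page and return the largest page number found in
    # links of the assumed '<stem>_<n>' form (1 if none are found).
    html = requests.get(url).text
    stem = url.rsplit('/', 1)[-1].rsplit('.', 1)[0]
    nums = [int(n) for n in re.findall(re.escape(stem) + r'_(\d+)', html)]
    return max(nums) if nums else 1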
def crawling():
    print('-----start crawling time: %s-----' % (datetime.today()))
    config = configparser.ConfigParser()
    config.read('../config.ini', 'utf-8')
    # News headline listing page, e.g. the general news page:
    # http://news.gpnu.edu.cn/index/zhxw.htm
    root = 'http://news.gpnu.edu.cn/index/zhxw'
    # max_page = get_max_page(root + '.shtml')
    print('***', root + '.htm')
    max_page = get_max_page(root + '.htm')
    news_pool = get_news_pool(root, max_page, max_page - 5)
    print("=========separator=========")
    crawl_news(news_pool, 140, config['DEFAULT']['doc_dir_path'], config['DEFAULT']['doc_encoding'])
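# get_news_pool() and crawl_news() are likewise external to this section.
# A minimal sketch of the signatures the crawling() variants rely on,
# assuming get_news_pool(root, max_page, min_page) gathers article links
# from the listing pages in [min_page, max_page) and crawl_news saves up to
# `limit` articles; the page-URL scheme and the link regex are illustrative
# assumptions, not the repository's actual logic.
import os
import re
import requests

def get_news_pool(root, max_page, min_page):
    # Walk the numbered listing pages and collect the article links they contain.
    pool = []
    for page in range(min_page, max_page):
        html = requests.get('%s/%d.htm' % (root, page)).text  # assumed paging scheme
        pool.extend(re.findall(r'href="(http[^"]+\.s?html?)"', html))
    return pool

def crawl_news(news_pool, limit, doc_dir_path, doc_encoding):
    # Download each article and store its raw HTML under doc_dir_path.
    os.makedirs(doc_dir_path, exist_ok=True)
    for i, url in enumerate(news_pool[:limit]):
        html = requests.get(url).text
        path = os.path.join(doc_dir_path, '%d.html' % i)
        with open(path, 'w', encoding=doc_encoding) as f:
            f.write(html)

if __name__ == '__main__':
    # Python keeps the last definition of crawling(), so running this file
    # executes the news.gpnu.edu.cn variant defined directly above.
    crawling()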