def get_info(driver):
    """Scrape every streamer entry on the current page and store each one.

    Walks the ``<li>`` children of ``#live-list-contentbox`` (at most 120,
    which is a full page), reads the name, category, title, popularity and
    up to three tags of each entry, and hands an ``INSERT`` statement for
    table ``douyu3`` to the module-level ``insert`` helper.  Scanning stops
    at the first entry whose mandatory fields cannot be read.

    Args:
        driver: Object exposing ``find_element_by_css_selector(css)`` whose
            result has a ``.text`` attribute (presumably a Selenium
            WebDriver -- confirm against the caller).

    Side effects:
        Increments the global ``Count_num`` once per processed entry, prints
        progress to stdout and writes to the module-level ``logger``.

    NOTE(review): a later ``get_info(source)`` definition in this file
    rebinds the same name, so this version is shadowed once the module has
    been fully loaded.
    """
    global Count_num
    # NOTE(review): type_ is never assigned a category, so the summary log
    # line below always shows an empty category name.
    type_ = ''
    scraped = 0  # entries successfully read so far on this page

    def text_of(css):
        """Return the text of the single element matching *css*."""
        return driver.find_element_by_css_selector(css).text

    # A full page holds 120 streamers.
    for page in range(1, 121):
        try:
            # Streamer name
            dy_name = text_of(
                '#live-list-contentbox > li:nth-child({}) > a > div.mes > p > span.dy-name.ellipsis.fl'.format(page))
            # Stream category
            dy_type = text_of(
                '#live-list-contentbox > li:nth-child({}) > a > div.mes > div > span'.format(page))
            # Stream title
            dy_title = text_of(
                "#live-list-contentbox > li:nth-child({}) > a > div.mes > div > h3".format(page))
            # Stream popularity; values such as "1.2万" are in units of 10,000.
            dy_num = text_of(
                "#live-list-contentbox > li:nth-child({}) > a > div.mes > p > span.dy-num.fr".format(page))
            if not dy_num.isdigit():
                dy_num = int(float(dy_num.replace('万', '')) * 10000)

            # Streamer tags (up to three).
            tags = ''
            for i in range(1, 4):
                try:
                    dy_tag = text_of(
                        "#live-list-contentbox > li:nth-child({}) > a > div.impress-tag-list > span:nth-child({})".format(page, i))
                except Exception:
                    # No further tags; keep whatever was collected so far.
                    break
                tags = tags + "[" + dy_tag + "] "
            if not tags:
                # Placeholder only when the streamer has no tag at all.
                tags = '该主播没有标签!'
        except Exception:
            # A missing or unparsable entry is treated as the end of the list.
            logger.info("{}类已经爬取完毕,一共有{}条数据。".format(type_, scraped))
            logger.info("一共已经爬取{}条数据!".format(Count_num))
            break
        scraped += 1

        # NOTE(review): the statement is built by string interpolation from
        # scraped page text, so a quote in a name or title breaks it (SQL
        # injection risk).  `insert` appears to accept only a finished SQL
        # string; switch to a parameterized query if it can take parameters.
        Sql_insert = "INSERT INTO douyu3(dy_name,dy_type,dy_title,dy_num,dy_tag)VALUES('%s','%s','%s','%d','%s')" % (
            dy_name, dy_type, dy_title, int(dy_num), tags)
        print("正在爬取第{}个主播:".format(Count_num))
        try:
            insert(Sql_insert)
        except Exception:
            # Best effort: report the failed row and carry on with the next.
            print("插入错误!")
            logger.info("{}插入错误!".format(dy_name))
        Count_num += 1
def get_info(source):
    """Parse one page of HTML and store every streamer entry found in it.

    The markup is parsed with ``etree.HTML`` and the ``<li>`` children of
    the element with id ``live-list-contentbox`` are visited in order (at
    most 120, which is a full page).  For each entry the name, category,
    title, popularity and up to three tags are extracted via XPath and an
    ``INSERT`` statement for table ``douyu3`` is handed to the module-level
    ``insert`` helper.  Scanning stops at the first entry whose mandatory
    fields cannot be read.

    Args:
        source: HTML text of the listing page, as accepted by ``etree.HTML``.

    Side effects:
        Increments the global ``Count_num`` once per processed entry, prints
        progress to stdout and writes to the module-level ``logger``.

    NOTE(review): an earlier ``get_info(driver)`` definition in this file
    uses the same name; this later definition is the one that stays bound.
    """
    global Count_num
    selector = etree.HTML(source)
    print(selector)
    # NOTE(review): type_ is never assigned a category, so the summary log
    # line below always shows an empty category name.
    type_ = ''
    scraped = 0  # entries successfully read so far on this page

    def first_text(xpath):
        """Return the first text node matched by *xpath* (IndexError if none)."""
        return selector.xpath(xpath)[0]

    # A full page holds 120 streamers.
    for page in range(1, 121):
        try:
            # Streamer name
            dy_name = first_text(
                '//*[@id="live-list-contentbox"]/li[%d]/a/div[1]/p/span[1]/text()' % (page))
            # Stream category
            dy_type = first_text(
                '//*[@id="live-list-contentbox"]/li[%d]/a/div[1]/div/span/text()' % (page))
            # Stream title
            dy_title = first_text(
                '//*[@id="live-list-contentbox"]/li[{}]/a/div[1]/div/h3/text()'.format(page))
            # Stream popularity; values such as "1.2万" are in units of 10,000.
            dy_num = first_text(
                '//*[@id="live-list-contentbox"]/li[{}]/a/div[1]/p/span[2]/text()'.format(page))
            if not dy_num.isdigit():
                dy_num = int(float(dy_num.replace('万', '')) * 10000)

            # Streamer tags (up to three).
            tags = ''
            for i in range(1, 4):
                try:
                    dy_tag = first_text(
                        '//*[@id="live-list-contentbox"]/li[%d]/a/div[2]/span[%d]/text()' % (page, i))
                except Exception:
                    # No further tags; keep whatever was collected so far.
                    break
                tags = tags + "[" + dy_tag + "] "
            if not tags:
                # Placeholder only when the streamer has no tag at all.
                tags = '该主播没有标签!'
        except Exception:
            # A missing or unparsable entry is treated as the end of the list.
            logger.info("{}类已经爬取完毕,一共有{}条数据。".format(type_, scraped))
            logger.info("一共已经爬取{}条数据!".format(Count_num))
            break
        scraped += 1

        # NOTE(review): the statement is built by string interpolation from
        # scraped page text, so a quote in a name or title breaks it (SQL
        # injection risk).  `insert` appears to accept only a finished SQL
        # string; switch to a parameterized query if it can take parameters.
        Sql_insert = "INSERT INTO douyu3(dy_name,dy_type,dy_title,dy_num,dy_tag)VALUES('%s','%s','%s','%d','%s')" % (
            dy_name, dy_type, dy_title, int(dy_num), tags)
        print("正在爬取第{}个主播!".format(Count_num))
        try:
            insert(Sql_insert)
        except Exception:
            # Best effort: report the failed row and carry on with the next.
            print("插入错误!")
            logger.info("{}插入错误!".format(dy_name))
        Count_num += 1