def parseJuejinInfo():
    """Save the article titles and links of one Juejin tag page to an .xls file.

    The page is loaded with ``getEtreeHTML``; titles and links are pulled
    out with ``parseEtreeHTML`` and handed to
    ``opeExcel.write_excel_xls_append_2`` after a two-column header row has
    been written. The workbook ``Juejin_articles.xls`` is placed in the
    current working directory. Nothing is returned.
    """
    # Page information:
    # https://juejin.im/tag/%E6%8E%98%E9%87%91%E7%BF%BB%E8%AF%91%E8%AE%A1%E5%88%92
    page_url = "https://juejin.im/tag/%E6%8E%98%E9%87%91%E7%BF%BB%E8%AF%91%E8%AE%A1%E5%88%92"
    title_xpath = "//div[@class='info-row title-row']/a/text()"
    link_xpath = "//div[@class='info-row title-row']/a/@href"

    # Excel file name. os.path.join is used instead of gluing on
    # "\Juejin_articles.xls": "\J" is an invalid escape sequence, and a
    # hard-coded backslash is not a path separator outside Windows.
    file_name = os.path.join(os.getcwd(), "Juejin_articles.xls")
    # Passed to opeExcel as its second argument (presumably the sheet name).
    author_name = "Juejin"

    # Header row written before any article data.
    headerData = [
        ["文章标题", "文章链接"],
    ]
    opeExcel.create_excel_sheet(file_name, author_name)
    opeExcel.write_excel_xls_append(file_name, author_name, headerData)

    # Convert the page's HTML source into an HTML object.
    page_html = getEtreeHTML(page_url)
    # Article titles.
    title_list = parseEtreeHTML(page_html, title_xpath)
    # Article links.
    link_list = parseEtreeHTML(page_html, link_xpath)

    # Print both counts so a title/link mismatch is visible on the console.
    print(len(title_list))
    print(len(link_list))
    visitList(title_list)

    # Save the data to the Excel sheet.
    opeExcel.write_excel_xls_append_2(file_name, author_name, title_list,
                                      link_list)
def parseGityuanHTML():
    """Save the article titles and links of the Gityuan archive page to an .xls file.

    The archive page is loaded with ``getEtreeHTML``; titles and links are
    pulled out with ``parseEtreeHTML``. Every link is prefixed with the
    site root before both lists are handed to
    ``opeExcel.write_excel_xls_append_2``. The workbook
    ``Gityuan_articles.xls`` is placed in the current working directory.
    Nothing is returned.
    """
    # Page information: http://gityuan.com/archive/
    site_root = "http://gityuan.com"
    page_url = "http://gityuan.com/archive/"
    title_xpath = "//div[@class='post-preview']/a/text()"
    link_xpath = "//div[@class='post-preview']/a/@href"

    # Excel file name. os.path.join is used instead of gluing on
    # "\Gityuan_articles.xls": "\G" is an invalid escape sequence, and a
    # hard-coded backslash is not a path separator outside Windows.
    file_name = os.path.join(os.getcwd(), "Gityuan_articles.xls")
    # Passed to opeExcel as its second argument (presumably the sheet name).
    author_name = "Gityuan"

    # Header row written before any article data.
    headerData = [
        ["文章标题", "文章链接"],
    ]
    opeExcel.create_excel_sheet(file_name, author_name)
    opeExcel.write_excel_xls_append(file_name, author_name, headerData)

    # Convert the page's HTML source into an HTML object.
    page_html = getEtreeHTML(page_url)
    # Article titles.
    title_list = parseEtreeHTML(page_html, title_xpath)
    # Article links. The site returns them without the "http://gityuan.com"
    # prefix, so it is prepended to each one.
    link_list = [
        site_root + link for link in parseEtreeHTML(page_html, link_xpath)
    ]

    # Save the data to the Excel sheet.
    opeExcel.write_excel_xls_append_2(file_name, author_name, title_list,
                                      link_list)
def parseLightMoon():
    """Save the titles and links of the LightMoon article index to an .xls file.

    The index page is loaded with ``getEtreeHTML``; titles and links are
    pulled out with ``parseEtreeHTML`` and handed to
    ``opeExcel.write_excel_xls_append_2`` after a two-column header row has
    been written. The workbook ``LightMoon_articles.xls`` is placed in the
    current working directory. Nothing is returned.
    """
    # Page information:
    # http://light3moon.com/1986/12/20/%E6%96%87%E7%AB%A0%E7%B4%A2%E5%BC%95/
    page_url = "http://light3moon.com/1986/12/20/%E6%96%87%E7%AB%A0%E7%B4%A2%E5%BC%95/"
    title_xpath = "//div[@class='article-content']/p/a/text()"
    link_xpath = "//div[@class='article-content']/p/a/@href"

    # Excel file name. os.path.join is used instead of gluing on
    # "\LightMoon_articles.xls": "\L" is an invalid escape sequence, and a
    # hard-coded backslash is not a path separator outside Windows.
    file_name = os.path.join(os.getcwd(), "LightMoon_articles.xls")
    # Passed to opeExcel as its second argument (presumably the sheet name).
    author_name = "LightMoon"

    # Header row written before any article data.
    headerData = [
        ["文章标题", "文章链接"],
    ]
    opeExcel.create_excel_sheet(file_name, author_name)
    opeExcel.write_excel_xls_append(file_name, author_name, headerData)

    # Convert the page's HTML source into an HTML object.
    page_html = getEtreeHTML(page_url)
    # Article titles.
    title_list = parseEtreeHTML(page_html, title_xpath)
    # Article links.
    link_list = parseEtreeHTML(page_html, link_xpath)

    # Save the data to the Excel sheet.
    opeExcel.write_excel_xls_append_2(file_name, author_name, title_list,
                                      link_list)
def getCnblogsInfo():
    """Walk the paginated cnblogs listing and save titles and links to an .xls file.

    Pages ``https://www.cnblogs.com/Jax/default.html?page=N`` are requested
    for N = 1, 2, ... through ``getEtreeHTML``. Each page's titles and links
    are appended with ``opeExcel.write_excel_xls_append_2``. The walk stops
    at the first page that yields no titles, at which point the running
    total is printed. The workbook ``cnblogs_articles.xls`` is placed in the
    current working directory. Nothing is returned.
    """
    # Example page: https://www.cnblogs.com/Jax/default.html?page=1
    title_xpath = "//a[@class='postTitle2 vertical-middle']/span/text()"
    link_xpath = "//a[@class='postTitle2 vertical-middle']/@href"
    date_xpath = "//div[@class='dayTitle']/a/text()"

    # Header data, i.e. the first row of the Excel file.
    # NOTE(review): the header declares a date column, but only titles and
    # links are written below - confirm whether the third column is wanted.
    headerData = [
        ["文章标题", "文章链接", "发布日期"],
    ]
    # Blogger name; passed to opeExcel as its second argument.
    author_name = "baojianqiang"
    # Upper bound on the pages requested; the loop normally ends earlier,
    # on the first page without titles.
    page_num = 999999

    # Excel file name. os.path.join is used instead of gluing on
    # "\cnblogs_articles.xls": "\c" is an invalid escape sequence, and a
    # hard-coded backslash is not a path separator outside Windows.
    file_name = os.path.join(os.getcwd(), "cnblogs_articles.xls")
    opeExcel.create_excel_sheet(file_name, author_name)
    opeExcel.write_excel_xls_append(file_name, author_name, headerData)

    # Total number of articles seen so far.
    allNumber = 0
    for index in range(1, page_num + 1):
        # Build the URL of the current page.
        page_url = "https://www.cnblogs.com/Jax/default.html?page=" + str(index)
        page_html = getEtreeHTML(page_url)

        title_list = parseEtreeHTML(page_html, title_xpath)
        if not title_list:
            # A page without titles marks the end of the listing.
            print(author_name + "文章获取完毕,共计文章数目:" + str(allNumber))
            break

        link_list = parseEtreeHTML(page_html, link_xpath)
        date_list = parseEtreeHTML(page_html, date_xpath)

        # On this blog there are fewer dates than titles, so only titles
        # and links are saved; both counts are printed for comparison.
        print("title_list: " + str(len(title_list)))
        print("date_list: " + str(len(date_list)))
        opeExcel.write_excel_xls_append_2(file_name, author_name, title_list,
                                          link_list)
        allNumber += len(title_list)
def getCSDNAuthorInfo():
    """Interactively save a CSDN blogger's article list to an .xls file.

    The blogger name and the number of sheet columns (2, 3 or 6) are read
    from standard input. Pages
    ``https://blog.csdn.net/<name>/article/list/N`` are requested for
    N = 1, 2, ... through ``opeHTML.getEtreeHTML`` until a page yields no
    titles. The accumulated columns are then written in one go with the
    ``opeExcel.write_excel_xls_append_*`` function matching the chosen
    column count. The workbook ``CSDN_articles.xls`` is placed in the
    current working directory.

    Returns None. The function returns early, before creating any file,
    when the column count is not 2, 3 or 6 or is not a number.
    """
    # XPaths for a CSDN blog list page, e.g.
    # https://blog.csdn.net/luoshengyang/article/list/1
    type_xpath = "//div[@class='article-item-box csdn-tracking-statistics']/h4/a/span/text()"
    title_xpath = "//div[@class='article-item-box csdn-tracking-statistics']/h4/a/text()"
    link_xpath = "//div[@class='article-item-box csdn-tracking-statistics']/h4/a/@href"
    publishDate_xpath = "//div[@class='info-box d-flex align-content-center']/p/span[@class='date']/text()"
    readerCount_xpath = "//div[@class='info-box d-flex align-content-center']/p//span[last()-1][@class='read-num']/text()"
    commentCount_xpath = "//div[@class='info-box d-flex align-content-center']/p//span[last()][@class='read-num']/text()"

    # Header rows (first row of the Excel file), keyed by column count.
    headers_by_col_num = {
        2: ["文章标题", "文章链接"],
        3: ["文章标题", "文章链接", "发表日期"],
        6: ["文章类型", "文章标题", "文章链接", "发表日期", "阅读数", "评论数"],
    }

    # Blogger name; used in the page URL and passed to opeExcel.
    author_name = input("请输入博主的名字: ")
    # Number of sheet columns. Non-numeric input is treated like any other
    # unsupported column count instead of raising ValueError.
    try:
        col_num = int(input("请输入Sheet列数: "))
    except ValueError:
        col_num = None
    if col_num not in headers_by_col_num:
        print("列数不对,函数返回!")
        return
    print(str(col_num) + "列")
    headerData = [headers_by_col_num[col_num]]

    # Upper bound on the pages requested; the loop normally ends earlier,
    # on the first page without titles.
    page_num = 999999

    # Excel file name. os.path.join is used instead of gluing on
    # "\CSDN_articles.xls": "\C" is an invalid escape sequence, and a
    # hard-coded backslash is not a path separator outside Windows.
    file_name = os.path.join(os.getcwd(), "CSDN_articles.xls")
    opeExcel.create_excel_sheet(file_name, author_name)
    opeExcel.write_excel_xls_append(file_name, author_name, headerData)

    # Accumulators across all pages.
    title_sum = []
    link_sum = []
    publishDate_sum = []
    type_sum = []
    readerCount_sum = []
    commentCount_sum = []

    for index in range(1, page_num + 1):
        # Build the URL of the current page.
        page_url = ("https://blog.csdn.net/" + author_name
                    + "/article/list/" + str(index))
        page_html = opeHTML.getEtreeHTML(page_url)

        # Article titles; a page without any ends the walk.
        title_list = opeHTML.parseEtreeHTML(page_html, title_xpath)
        if not title_list:
            break

        # Article links.
        link_list = opeHTML.parseEtreeHTML(page_html, link_xpath)
        # Article publish dates.
        publishDate_list = opeHTML.parseEtreeHTML(page_html,
                                                  publishDate_xpath)
        # Article types.
        type_list = opeHTML.parseEtreeHTML(page_html, type_xpath)
        # Article read counts.
        readerCount_list = opeHTML.parseEtreeHTML(page_html,
                                                  readerCount_xpath)
        # Article comment counts.
        commentCount_list = opeHTML.parseEtreeHTML(page_html,
                                                   commentCount_xpath)

        # Collect everything into the overall lists.
        title_sum.extend(title_list)
        link_sum.extend(link_list)
        publishDate_sum.extend(publishDate_list)
        type_sum.extend(type_list)
        readerCount_sum.extend(readerCount_list)
        commentCount_sum.extend(commentCount_list)

    # Write the data, but only when at least one article was found.
    if title_sum:
        if col_num == 2:
            print("2列")
            opeExcel.write_excel_xls_append_2(file_name, author_name,
                                              title_sum, link_sum)
        elif col_num == 3:
            opeExcel.write_excel_xls_append_3(file_name, author_name,
                                              title_sum, link_sum,
                                              publishDate_sum)
        elif col_num == 6:
            opeExcel.write_excel_xls_append_6(file_name, author_name,
                                              type_sum, title_sum, link_sum,
                                              publishDate_sum,
                                              readerCount_sum,
                                              commentCount_sum)

    # Print the article count once, after everything has been stored.
    print(author_name + "文章获取完毕,共计文章数目:" + str(len(title_sum)))