def getTag():
    """Fetch the page at ``baseUrl`` and store every tag name it lists.

    The page is downloaded through ``UrlUtil.parse_url_get_proxy`` and
    parsed with lxml.  Each ``<a>`` element directly under a
    ``div`` with class ``d_tags`` has its text passed to
    ``QklDbUtli.insertTag``.
    """
    htmlContent = etree.HTML(UrlUtil.parse_url_get_proxy(baseUrl))
    # Only the anchor text is stored.  The href is deliberately not read:
    # nothing consumed it, and indexing it raised IndexError for anchors
    # that carry no href attribute.
    for tag in htmlContent.xpath('.//div[@class="d_tags"]/a'):
        QklDbUtli.insertTag(tag.text)
def getNewsDetail(news):
    """Download one article page, fill in its details and store it.

    ``news`` is a dict that must already contain ``newsId`` (substituted
    into ``newsUrl``) and ``newsTitle`` (printed after storing).  When the
    page contains a ``div`` with class ``content`` the dict gains
    ``newsDetail``, ``authorName``, ``authorDesc``, ``newsWatch`` and
    ``newsTime`` and is passed to ``QklDbUtli.insertQklNews``; otherwise
    nothing is stored.

    Raises:
        IndexError: if the content div is present but the ``div.meta``
            block has fewer than two ``<span>`` children or no ``<time>``
            child.
    """
    time.sleep(2)  # pause before every request
    url = newsUrl % (news['newsId'])
    htmlContent = etree.HTML(UrlUtil.parse_url_get_proxy(url))

    # An xpath node query yields a list, never None, so comparing the
    # "content-wrap" result against None was always true.  Only the
    # emptiness of the "content" query decides whether the page is usable.
    if not htmlContent.xpath('.//div[@class="content"]'):
        return

    # NOTE(review): assumes every step below belonged to the content-found
    # branch -- confirm against the unmangled file.
    detail = str(
        htmlContent.xpath("string(.//article[@class='article-content'])"))
    news['newsDetail'] = detail

    # Author name: text of the second <span> inside div.meta.
    news['authorName'] = htmlContent.xpath(
        './/div[@class="meta"]/span')[1].text
    news['authorDesc'] = ""

    # View count (presumably): every digit found in the span.muted text
    # nodes, concatenated.  The text is scanned directly rather than via
    # str(list), because the list repr escapes characters such as '\xa0'
    # or '\u3000' and the digits of those escapes would leak into the value.
    mutedTexts = htmlContent.xpath('.//span[@class="muted"]/text()')
    news['newsWatch'] = "".join(
        ch for text in mutedTexts for ch in text if ch.isdigit())

    # Publication time: text of the first <time> inside div.meta.
    news['newsTime'] = htmlContent.xpath(
        './/div[@class="meta"]/time')[0].text

    QklDbUtli.insertQklNews(news)
    print(news['newsTitle'])
def getAuthorNewsDetails(author):
    """Fetch the article behind ``author['href']`` and store its body text.

    The page is downloaded through ``UrlUtil.parse_url_get_proxy``.  If it
    contains a ``div`` with class ``content``, the string value of the
    ``article.article-content`` element is stripped of surrounding
    whitespace, saved under ``author['newsDetail']``, and the dict is
    handed to ``QklDbUtli.insertQklAuthorsNews``.  Pages without that div
    are ignored.
    """
    page = etree.HTML(UrlUtil.parse_url_get_proxy(author['href']))
    if not page.xpath('.//div[@class="content"]'):
        return

    # NOTE(review): assumes the store/print steps belonged to the
    # content-found branch -- confirm against the unmangled file.
    body = page.xpath("string(.//article[@class='article-content'])")
    author['newsDetail'] = str(body).strip()
    QklDbUtli.insertQklAuthorsNews(author)
    print("标题:" + author['newsTitle'])
def getNewsTagDetail(news):
    """Download the article for ``news['newsId']`` and store its body text.

    Waits two seconds, fetches ``newsUrl`` filled in with the article id,
    and, when the page contains a ``div`` with class ``content``, saves the
    string value of the ``article.article-content`` element under
    ``news['newsDetail']`` before passing the dict to
    ``QklDbUtli.insertQklNews`` and printing the title.
    """
    time.sleep(2)
    articleUrl = newsUrl % (news['newsId'])
    page = etree.HTML(UrlUtil.parse_url_get_proxy(articleUrl))
    hasContent = bool(page.xpath('.//div[@class="content"]'))
    if hasContent:
        # NOTE(review): assumes the store/print steps belonged to this
        # branch -- confirm against the unmangled file.
        articleText = page.xpath(
            "string(.//article[@class='article-content'])")
        news['newsDetail'] = str(articleText)
        QklDbUtli.insertQklNews(news)
        print(news['newsTitle'])
# 区块链首页 爬虫 import json, time from lxml import etree from dao import QklDbUtli from utils import UrlUtil baseUrl = 'https://www.55coin.com' newsUrl = "https://www.55coin.com/article/%s.html" QklDbUtli.createQklNewsTable() QklDbUtli.createQklTagTable() def getNewsDetail(news): time.sleep(2) url = newsUrl % (news['newsId']) htmlContent = etree.HTML(UrlUtil.parse_url_get_proxy(url)) if htmlContent.xpath('.//div[@class="content-wrap"]') != None and len( htmlContent.xpath('.//div[@class="content"]')) > 0: detail = str( htmlContent.xpath("string(.//article[@class='article-content'])")) news['newsDetail'] = detail # 作者名字 news['authorName'] = htmlContent.xpath( './/div[@class="meta"]/span')[1].text news['authorDesc'] = "" # 查看数 news['newsWatch'] = "".join( list( filter( str.isdigit,
# 期货机构的爬虫页面 import json, time from lxml import etree from dao import QklDbUtli from utils import UrlUtil # 初始化区块链资讯表 QklDbUtli.createQklNewsTable() QklDbUtli.createQkl7X24NewsTable() QklDbUtli.createQklAuthorNesTable() cProxy = {'http': '177.69.203.66:3128'} typeNum = [0, 21, 22, 23, 25, 27, 29, 30, 32] baseUrl = "https://www.55coin.com/index/article/search.html?cat_id=%d&page=1" newsUrl = "https://www.55coin.com/article/%s.html" newsAuthorListUrl = "https://www.55coin.com/column.html" newsAuthorDetailUrl = "https://www.55coin.com/author/%s.html" news24Url = 'https://www.55coin.com/index/flash/search_query.html?cat_id=0&page=1' news7x24Detail = 'https://www.55coin.com/flash/%s.html' baseUrlNews = "https://www.55coin.com/index/article/search.html?flash_cat_id=0&page=1" header = { 'User-Agent': 'Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Mobile Safari/537.36', 'Accept': 'application/json, text/javascript, */*; q=0.01', 'Host': 'www.55coin.com', 'Accept-Encoding': 'gzip, deflate, br', 'Accept-Language': 'zh,zh-CN;q=0.9,en;q=0.8', 'Connection': 'keep-alive', 'Referer': "https://www.55coin.com/",