import json
import time

from lxml import etree

# Project-local helper modules (assumed to be sibling files in this repo).
import UrlUtil
import QklDbUtli
import GpDbUtli


def getGpListNews(news_type, url):
    # Scrape one category list page and hand every item to getDetailInfo().
    htmlContent = etree.HTML(UrlUtil.parse_url(url))
    repeatList = htmlContent.xpath(".//div[@class='repeatList']")
    # xpath() returns a list (never None), so a length check is sufficient.
    if len(repeatList) > 0:
        content = repeatList[0].xpath('.//ul/li')
        orglists = []
        for div in content:
            org = {}
            org["href"] = str(
                div.xpath('.//p[@class="title"]')[0].xpath('.//a/@href')[0])
            if len(div.xpath('.//div/a/img/@src')) > 0:
                icon = str(div.xpath('.//div/a/img/@src')[0])
                org["icon"] = 'http:' + icon
            else:
                org["icon"] = ""
            org["title"] = div.xpath('.//p[@class="title"]')[0].xpath('.//a')[0].text
            org["desc"] = div.xpath('.//p[@class="info"]')[0].text
            org["time"] = div.xpath('.//p[@class="time"]')[0].text
            org["type"] = news_type
            orglists.append(org)
        for org in orglists:
            getDetailInfo(org)
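# UrlUtil is not defined in this file. Below is a minimal sketch of the two
# helpers this module calls, assuming a plain requests-based fetcher; the
# header and proxy values are hypothetical placeholders, not the project's
# real configuration.
import requests

_SKETCH_HEADERS = {"User-Agent": "Mozilla/5.0"}  # placeholder header


def parse_url_sketch(url):
    # Stand-in for UrlUtil.parse_url: fetch a page and return its HTML text.
    response = requests.get(url, headers=_SKETCH_HEADERS, timeout=10)
    response.encoding = response.apparent_encoding
    return response.text


def parse_url_get_proxy_sketch(url):
    # Stand-in for UrlUtil.parse_url_get_proxy: the same fetch routed through
    # a proxy; the proxy address below is a placeholder.
    proxies = {"http": "http://127.0.0.1:8888"}
    response = requests.get(url, headers=_SKETCH_HEADERS, proxies=proxies, timeout=10)
    response.encoding = response.apparent_encoding
    return response.text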
def getTag():
    # Collect the tag labels from the home page and store each one.
    htmlContent = etree.HTML(UrlUtil.parse_url_get_proxy(baseUrl))
    tagContent = htmlContent.xpath('.//div[@class="d_tags"]/a')
    for tag in tagContent:
        QklDbUtli.insertTag(tag.text)
def getTagNews():
    htmlContent = etree.HTML(UrlUtil.parse_url_get_proxy(baseUrl))
    tagContent = htmlContent.xpath('.//div[@class="d_tags"]/a')
    for tag in tagContent:
        url = baseUrl + str(tag.xpath('./@href')[0])
        getNewsListByType(url, tag.text)
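# The scrapers reference several URL constants defined elsewhere in the project
# (baseUrl, newsUrl, news24Url, newsAuthorListUrl, newsAuthorDetailUrl). Their
# real values are not shown in this file; the commented placeholders below only
# illustrate the expected shapes (newsUrl and newsAuthorDetailUrl are %-format
# strings that take an ID):
#
# baseUrl = "https://example.com"                      # site home page
# newsUrl = "https://example.com/article/%s.html"      # article detail, takes newsId
# news24Url = "https://example.com/api/24h"            # JSON feed of 24h flashes
# newsAuthorListUrl = "https://example.com/authors"    # author ranking page
# newsAuthorDetailUrl = "https://example.com/user/%s"  # author page, takes authorId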
def getNewsDetail(news):
    time.sleep(2)
    url = newsUrl % (news['newsId'])
    htmlContent = etree.HTML(UrlUtil.parse_url_get_proxy(url))
    if len(htmlContent.xpath('.//div[@class="content"]')) > 0:
        news['newsDetail'] = str(
            htmlContent.xpath("string(.//article[@class='article-content'])"))
        # Author name
        news['authorName'] = htmlContent.xpath('.//div[@class="meta"]/span')[1].text
        news['authorDesc'] = ""
        # View count: keep only the digits of the "muted" span text
        news['newsWatch'] = "".join(
            filter(str.isdigit,
                   "".join(htmlContent.xpath('.//span[@class="muted"]/text()'))))
        # Publication time
        news['newsTime'] = htmlContent.xpath('.//div[@class="meta"]/time')[0].text
        QklDbUtli.insertQklNews(news)
        print(news['newsTitle'])
def getNewsList24():
    # Parse the 24h flash feed (a JSON endpoint) and store every entry.
    content = UrlUtil.parse_url_get_proxy(news24Url)
    content = json.loads(content)
    dataList = content['list']
    listNews = []
    for data in dataList:
        news = {}
        # Article ID
        news['newsId'] = str(data['flash_id'])
        # Title
        news['newsTitle'] = data['title']
        # Summary
        news['newsDesc'] = data['brief']
        news['newsType'] = '24小时'  # "24 hours" category label
        # Time
        news['newsTime'] = str(data['add_time'])
        # Up/down vote counts
        news['newsSupport'] = str(data['rise'])
        news['newsSupportNo'] = str(data['fall'])
        listNews.append(news)
    # Fetch the detail for each flash item
    for news in listNews:
        getNews7x24Detail(news)
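# getNewsList24() assumes the 24h endpoint returns JSON shaped roughly like the
# hypothetical sample below. Only the keys are taken from the code above; the
# values are illustrative:
#
# {"list": [{"flash_id": 101, "title": "...", "brief": "...",
#            "add_time": 1546300800, "rise": 12, "fall": 3}]}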
def getAuthorNewsDetails(author):
    htmlContent = etree.HTML(UrlUtil.parse_url_get_proxy(author['href']))
    if len(htmlContent.xpath('.//div[@class="content"]')) > 0:
        detail = str(
            htmlContent.xpath("string(.//article[@class='article-content'])"))
        author['newsDetail'] = detail.strip()
        QklDbUtli.insertQklAuthorsNews(author)
        print("Title: " + author['newsTitle'])
def getDetailInfo(org):
    time.sleep(2)
    href = org["href"]
    detailContent = etree.HTML(UrlUtil.parse_url(href))
    if len(detailContent.xpath('.//div[@class="Body"]')) > 0:
        content = str(detailContent.xpath("string(.//div[@class='Body'])"))
        GpDbUtli.insertGpNews(org["title"].strip(), org["icon"].strip(),
                              org["type"].strip(), org["desc"].strip(),
                              content.strip(), org["time"].strip())
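# QklDbUtli and GpDbUtli are project-local persistence helpers whose bodies are
# not shown here. The class below is a hypothetical stub of the call surface
# this file relies on; it only logs, which is handy for dry-running the
# scrapers without a database.
class DbStubSketch:

    @staticmethod
    def insertTag(tag):
        print("would insert tag:", tag)

    @staticmethod
    def insertQklNews(news):
        print("would insert news:", news.get('newsId'))

    @staticmethod
    def insertQklAuthorsNews(authorNews):
        print("would insert author news:", authorNews.get('newsId'))

    @staticmethod
    def insertGpNews(title, icon, newsType, desc, content, newsTime):
        print("would insert gp news:", title)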
def getNewsTagDetail(news):
    time.sleep(2)
    url = newsUrl % (news['newsId'])
    htmlContent = etree.HTML(UrlUtil.parse_url_get_proxy(url))
    if len(htmlContent.xpath('.//div[@class="content"]')) > 0:
        detail = str(
            htmlContent.xpath("string(.//article[@class='article-content'])"))
        news['newsDetail'] = detail
        QklDbUtli.insertQklNews(news)
        print(news['newsTitle'])
def getNewsListByType(url, tag):
    htmlContent = etree.HTML(UrlUtil.parse_url_get_proxy(url))
    listContent = htmlContent.xpath('.//article[@class="excerpt"]')
    listNews = []
    for data in listContent:
        news = {}
        # Article ID: the digits embedded in the article link
        news['newsId'] = "".join(
            filter(str.isdigit, str(data.xpath('.//a/@href')[0])))
        # Author fields (only the name is available on the list page)
        news['authorId'] = ""
        news['authorName'] = data.xpath('.//span[@class="muted"]')[0].text
        news['authorDesc'] = ""
        news['authorIcon'] = ""
        # Title
        news['newsTitle'] = data.xpath('./header/h2/a')[0].text
        # Summary (xpath() returns a list, so test its length, not None)
        if len(data.xpath('./p')) == 0:
            news['newsDesc'] = ''
        else:
            news['newsDesc'] = data.xpath('./p')[0].text
        # Image
        if len(data.xpath('./div/a/img/@src')) == 0 or data.xpath(
                './div/a/img/@src')[0] is None:
            news['newsIcon'] = ""
        else:
            news['newsIcon'] = str(data.xpath('./div/a/img/@src')[0])
        # Category
        news['newsType'] = tag
        # View count
        news['newsWatch'] = str(data.xpath('.//span[@class="muted none"]/text()')[0])
        # Time
        news['newsTime'] = str(data.xpath('.//span[@class="muted"]')[1].text)
        # Keep at most 13 items per tag
        if len(listNews) < 13:
            listNews.append(news)
    # Fetch and store the detail page for each item
    for news in listNews:
        getNewsTagDetail(news)
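# Several functions here extract a numeric ID by keeping only the digits of an
# href. For a hypothetical link "/article/12345.html" the idiom yields "12345":
assert "".join(filter(str.isdigit, "/article/12345.html")) == "12345"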
def getAuthorDetail(author):
    time.sleep(2)
    url = newsAuthorDetailUrl % (author['authorId'])
    htmlContent = etree.HTML(UrlUtil.parse_url_get_proxy(url))
    if len(htmlContent.xpath('.//article[@class="excerpt"]')) > 0:
        ulcontent = htmlContent.xpath('.//article[@class="excerpt"]')
        authorNews = []
        # Follower and article counts from the author's stats list
        author['authorFuns'] = htmlContent.xpath('.//ul[@class="data"]/li/span')[4].text
        author['authorNews'] = htmlContent.xpath('.//ul[@class="data"]/li/span')[0].text
        for news in ulcontent:
            # Keep at most 16 articles per author
            if len(authorNews) < 16:
                authornew = {}
                authornew['authorFuns'] = author['authorFuns']
                authornew['authorNews'] = author['authorNews']
                authornew['authorId'] = author['authorId']
                authornew['authorName'] = author['authorName']
                authornew['authorDesc'] = author['authorDesc']
                authornew['authorIcon'] = author['authorIcon']
                authornew['authorSupport'] = author['authorSupport']
                authornew['newsTitle'] = str(news.xpath('./header/h2/a/@title')[0])
                if news.xpath('./p')[0].text is None:
                    authornew['newsDesc'] = ''
                else:
                    authornew['newsDesc'] = news.xpath('./p')[0].text
                authornew['newsIcon'] = str(news.xpath('./div/a/img/@src')[0])
                # Category
                authornew['newsType'] = news.xpath('./header/a')[0].text
                authornew['href'] = "https://www.55coin.com" + news.xpath(
                    './div/a/@href')[0]
                authornew['newsId'] = "".join(
                    filter(str.isdigit, str(news.xpath('./div/a/@href')[0])))
                # View count
                authornew['newsWatch'] = str(
                    news.xpath('.//span[@class="muted none"]/text()')[0])
                authornew['newsTime'] = str(
                    news.xpath('.//span[@class="muted"]/text()')[1])
                authorNews.append(authornew)
        for news in authorNews:
            getAuthorNewsDetails(news)
def getNewsType():
    htmlContent = etree.HTML(UrlUtil.parse_url(baseUrl))
    typs = []
    if len(htmlContent.xpath('.//ul[@id="daodu_header"]')) > 0:
        ul = htmlContent.xpath('.//ul[@id="daodu_header"]')[0]
        for li in ul:
            newsType = {}
            newsType["type"] = li.text
            newsType["href"] = str(li.xpath('./@data-href')[0])
            typs.append(newsType)
    for newsType in typs:
        getGpListNews(newsType["type"], newsType["href"])
def getNewsListByType(url):
    content = UrlUtil.parse_url_get_proxy(url)
    content = json.loads(content)
    dataList = content['list']
    listNews = []
    newsType = []
    for data in dataList:
        news = {}
        # Article ID
        news['newsId'] = str(data['article_id'])
        # Author ID and name
        news['authorId'] = str(data['editor_id'])
        news['authorName'] = data['author_name']
        # Title
        news['newsTitle'] = data['title']
        # Summary
        news['newsDesc'] = data['brief']
        # Image
        news['newsIcon'] = data['rectangle_img']
        # Category (also collect the distinct category names seen)
        cat_name = data['cat_name']
        news['newsType'] = cat_name
        if cat_name not in newsType:
            newsType.append(cat_name)
        # View count
        news['newsWatch'] = str(data['show_total'])
        # Time
        news['newsTime'] = str(data['add_time'])
        # Keep at most 13 items
        if len(listNews) < 13:
            listNews.append(news)
    return listNews
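# getNewsListByType(url) above expects each element of the JSON "list" to carry
# the keys read in the loop. A hypothetical sample (keys from the code, values
# illustrative):
#
# {"article_id": 1, "editor_id": 7, "author_name": "...", "title": "...",
#  "brief": "...", "rectangle_img": "...", "cat_name": "...",
#  "show_total": 100, "add_time": 1546300800}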
def getNewsType(types):
    htmlContent = etree.HTML(UrlUtil.parse_url(baseUrl))
    typs = []
    if len(htmlContent.xpath('.//div[@id="box_pic"]')) > 0:
        ul = htmlContent.xpath('.//div[@id="box_pic"]')[0]
        uls = ul.xpath('.//ul/li')
        for li in uls:
            topItem = {}
            topItem["title"] = str(li.xpath('./a/@title')[0])
            topItem["href"] = str(li.xpath('./a/@href')[0])
            topItem["type"] = types
            topItem["icon"] = str(li.xpath('./a/img/@src')[0])
            typs.append(topItem)
    for topItem in typs:
        getTopDetailInfo(topItem)
def getAuthorList():
    htmlContent = etree.HTML(UrlUtil.parse_url_get_proxy(newsAuthorListUrl))
    if len(htmlContent.xpath('.//ul[@id="column_rank"]')) > 0:
        ulcontent = htmlContent.xpath('.//ul[@id="column_rank"]')[0]
        authorList = []
        for data in ulcontent:
            author = {}
            author['authorId'] = str(data.xpath('./a/@user_id')[0])
            author['authorName'] = data.xpath('./a/div/strong')[0].text
            if data.xpath('./a/div/span')[0].text is None:
                author['authorDesc'] = ""
            else:
                author['authorDesc'] = data.xpath('./a/div/span')[0].text
            author['authorIcon'] = str(data.xpath('./a/img/@src')[0])
            author['authorSupport'] = str(data.xpath('./div')[0].text)
            authorList.append(author)
        for author in authorList:
            getAuthorDetail(author)
def getHomeInfo():
    htmlContent = etree.HTML(UrlUtil.parse_url_get_proxy(baseUrl))
    hotContent = htmlContent.xpath(".//ul[@class='article-list']")[0]
    hotList = []
    for hotNews in list(hotContent):
        news = {}
        # Article ID: the digits embedded in the link
        news['newsId'] = "".join(
            filter(str.isdigit, str(hotNews.xpath('./a/@href')[0])))
        # Author fields are not available on the home page
        news['authorId'] = ""
        news['authorName'] = ""
        news['authorDesc'] = ""
        # Title
        if len(hotNews.xpath('.//div[@class="tit"]')) > 0:
            news['newsTitle'] = hotNews.xpath('.//div[@class="tit"]')[0].text
        else:
            news['newsTitle'] = str(hotNews.xpath('.//div/a/@title')[0])
        # Summary
        news['newsDesc'] = ""
        # Image
        news['newsIcon'] = str(hotNews.xpath('.//img/@src')[0])
        # Category
        news['newsType'] = "hot"
        # View count, time, and detail are filled in by getNewsDetail()
        news['newsWatch'] = ""
        news['newsTime'] = ""
        news['newsDetail'] = ""
        hotList.append(news)
    for news in hotList:
        getNewsDetail(news)
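# A minimal driver sketch, assuming these scrapers are meant to be run directly
# and that the duplicated function names (getNewsType, getNewsListByType) live
# in separate modules in the real project. Which entry points a deployment
# schedules, and how often, is not specified in this file.
if __name__ == "__main__":
    getHomeInfo()    # home-page hot list -> getNewsDetail -> DB
    getNewsList24()  # 24h JSON flashes -> getNews7x24Detail
    getTag()         # tag labels -> DB
    getAuthorList()  # author ranking -> getAuthorDetail -> getAuthorNewsDetails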