Beispiel #1
0
def getTag():
    """Scrape the tag links from the base page and store each tag's text.

    Fetches ``baseUrl`` through ``UrlUtil.parse_url_get_proxy``, parses the
    response with lxml, selects every ``<a>`` that is a direct child of
    ``<div class="d_tags">`` and passes the anchor's text to
    ``QklDbUtli.insertTag``.

    Returns:
        None. The only effect is the ``insertTag`` call per anchor.
    """
    htmlContent = etree.HTML(UrlUtil.parse_url_get_proxy(baseUrl))
    tagContent = htmlContent.xpath('.//div[@class="d_tags"]/a')

    for tag in tagContent:
        # An absolute URL used to be built from the anchor's @href here, but
        # the value was never used, and indexing the @href result raised
        # IndexError for any anchor without an href. Only the text is stored.
        QklDbUtli.insertTag(tag.text)
Beispiel #2
0
def getNewsDetail(news):
    """Fetch one article page, fill in its detail fields and store the record.

    The page URL is built from ``newsUrl`` and ``news['newsId']``. When the
    page contains a ``<div class="content">``, the following keys are written
    into ``news`` before it is handed to ``QklDbUtli.insertQklNews``:

    * ``newsDetail`` - string value of ``<article class="article-content">``
    * ``authorName`` - text of the second ``<span>`` under ``div.meta``
    * ``authorDesc`` - always the empty string
    * ``newsWatch``  - the digits found in the ``span.muted`` text nodes
    * ``newsTime``   - text of the first ``<time>`` under ``div.meta``

    Args:
        news: mutable mapping that provides ``newsId`` and ``newsTitle``;
            it is updated in place.

    Raises:
        IndexError: if the page has the content div but lacks the second
            ``div.meta/span`` or the ``div.meta/time`` element.
    """
    # Fixed 2-second pause before every request (presumably to throttle the
    # crawl - confirm with the caller).
    time.sleep(2)
    url = newsUrl % (news['newsId'])
    htmlContent = etree.HTML(UrlUtil.parse_url_get_proxy(url))

    # The guard used to also compare the "content-wrap" xpath result with
    # None. xpath() yields a list for this expression, so that comparison was
    # always true; the content div has always been the only effective gate.
    if not htmlContent.xpath('.//div[@class="content"]'):
        return

    detail = str(
        htmlContent.xpath("string(.//article[@class='article-content'])"))
    news['newsDetail'] = detail
    # Author name
    news['authorName'] = htmlContent.xpath(
        './/div[@class="meta"]/span')[1].text
    news['authorDesc'] = ""
    # View count: join the text nodes and keep only the digit characters.
    # Filtering the repr of the list (str(list)) is wrong because repr escapes
    # non-printable characters such as U+00A0 / U+3000 as \xa0 / \u3000, and
    # the digits of those escapes would leak into the count.
    mutedTexts = htmlContent.xpath('.//span[@class="muted"]/text()')
    news['newsWatch'] = "".join(filter(str.isdigit, "".join(mutedTexts)))
    # Publication time
    news['newsTime'] = htmlContent.xpath(
        './/div[@class="meta"]/time')[0].text
    QklDbUtli.insertQklNews(news)
    print(news['newsTitle'])
Beispiel #3
0
def getAuthorNewsDetails(author):
    """Download an author's article and store its body text.

    The page at ``author['href']`` is fetched via
    ``UrlUtil.parse_url_get_proxy`` and parsed with lxml. If it contains a
    ``<div class="content">``, the stripped string value of
    ``<article class="article-content">`` is saved under
    ``author['newsDetail']``, the record is passed to
    ``QklDbUtli.insertQklAuthorsNews`` and the title is printed. Otherwise
    nothing happens.

    Args:
        author: mutable mapping that provides ``href`` and ``newsTitle``;
            it is updated in place.
    """
    page = etree.HTML(UrlUtil.parse_url_get_proxy(author['href']))

    # Nothing to store when the page has no content container.
    if not page.xpath('.//div[@class="content"]'):
        return

    articleText = page.xpath("string(.//article[@class='article-content'])")
    author['newsDetail'] = str(articleText).strip()
    QklDbUtli.insertQklAuthorsNews(author)
    print("标题:" + author['newsTitle'])
Beispiel #4
0
def getNewsTagDetail(news):
    """Download one article page and store its body text.

    Sleeps two seconds, then fetches the page built from ``newsUrl`` and
    ``news['newsId']``. If the page contains a ``<div class="content">``, the
    string value of ``<article class="article-content">`` is saved under
    ``news['newsDetail']``, the record is passed to
    ``QklDbUtli.insertQklNews`` and the title is printed. Otherwise nothing
    is stored.

    Args:
        news: mutable mapping that provides ``newsId`` and ``newsTitle``;
            it is updated in place.
    """
    time.sleep(2)
    pageUrl = newsUrl % news['newsId']
    page = etree.HTML(UrlUtil.parse_url_get_proxy(pageUrl))

    # Skip pages that do not carry the content container.
    if not page.xpath('.//div[@class="content"]'):
        return

    articleText = page.xpath("string(.//article[@class='article-content'])")
    news['newsDetail'] = str(articleText)
    QklDbUtli.insertQklNews(news)
    print(news['newsTitle'])
Beispiel #5
0
# Crawler for the blockchain site's home page.
import json, time
from lxml import etree
from dao import QklDbUtli
from utils import UrlUtil

# Site root.
baseUrl = 'https://www.55coin.com'
# Article page template; %s is filled with an article id.
newsUrl = "https://www.55coin.com/article/%s.html"
# Executed at import time: calls the dao helpers for the news and tag tables
# (presumably creating them if missing - confirm in dao.QklDbUtli).
QklDbUtli.createQklNewsTable()
QklDbUtli.createQklTagTable()


def getNewsDetail(news):
    time.sleep(2)
    url = newsUrl % (news['newsId'])
    htmlContent = etree.HTML(UrlUtil.parse_url_get_proxy(url))

    if htmlContent.xpath('.//div[@class="content-wrap"]') != None and len(
            htmlContent.xpath('.//div[@class="content"]')) > 0:
        detail = str(
            htmlContent.xpath("string(.//article[@class='article-content'])"))
        news['newsDetail'] = detail
        # 作者名字
        news['authorName'] = htmlContent.xpath(
            './/div[@class="meta"]/span')[1].text
        news['authorDesc'] = ""
        # 查看数
        news['newsWatch'] = "".join(
            list(
                filter(
                    str.isdigit,
Beispiel #6
0
# Crawler page for futures institutions.
import json, time
from lxml import etree
from dao import QklDbUtli
from utils import UrlUtil

# Initialize the blockchain news tables (executed at import time).
# NOTE(review): "createQklAuthorNesTable" looks like a misspelling of "News";
# the name has to match whatever dao.QklDbUtli actually defines.
QklDbUtli.createQklNewsTable()
QklDbUtli.createQkl7X24NewsTable()
QklDbUtli.createQklAuthorNesTable()

# Hard-coded HTTP proxy mapping (scheme -> host:port).
cProxy = {'http': '177.69.203.66:3128'}

# Integer ids; presumably the category ids substituted for cat_id=%d in
# baseUrl - confirm against the code that uses them.
typeNum = [0, 21, 22, 23, 25, 27, 29, 30, 32]
# Article search endpoint, first page; %d is the cat_id query value.
baseUrl = "https://www.55coin.com/index/article/search.html?cat_id=%d&page=1"
# Article page template; %s is filled with an article id.
newsUrl = "https://www.55coin.com/article/%s.html"
# Column (author list) page.
newsAuthorListUrl = "https://www.55coin.com/column.html"
# Author page template; %s is filled with an author identifier.
newsAuthorDetailUrl = "https://www.55coin.com/author/%s.html"
# Flash-news search endpoint, first page, cat_id fixed to 0.
news24Url = 'https://www.55coin.com/index/flash/search_query.html?cat_id=0&page=1'
# Flash-news page template; %s is filled with a flash-news identifier.
news7x24Detail = 'https://www.55coin.com/flash/%s.html'
# Article search endpoint, first page, flash_cat_id fixed to 0.
baseUrlNews = "https://www.55coin.com/index/article/search.html?flash_cat_id=0&page=1"

header = {
    'User-Agent':
    'Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Mobile Safari/537.36',
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Host': 'www.55coin.com',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh,zh-CN;q=0.9,en;q=0.8',
    'Connection': 'keep-alive',
    'Referer': "https://www.55coin.com/",