Example #1
import json
import re
import time
from urllib import parse

import requests
from lxml import etree

# The helpers and constants used below (UID, HandleTmpList, ProductToGroup,
# ProductNameTuple, SPIDERNAME, and the site-specific parse* functions) are
# assumed to be defined elsewhere in the project.


def getQHRBArticleList(articleCol, BeCrawledUrlList):
    # Agricultural products
    url1 = 'http://www.qhrb.com.cn/farm/'
    # Metals
    url2 = 'http://www.qhrb.com.cn/metal/'
    # Energy and chemicals
    url3 = 'http://www.qhrb.com.cn/energy/'
    # Market reports
    url4 = 'http://www.qhrb.com.cn/comment/scbg/'

    for url in (url1, url2, url3, url4):
        r = requests.get(url)
        r.encoding = 'utf8'  # ensure the response decodes as UTF-8
        selector = etree.HTML(r.text)
        # Several years of content sit on one page; cap at the first 600 entries
        eleList = selector.cssselect(".list-point li.item")[:600]
        temp_article_ls = []
        for ele in eleList:
            # time.sleep(0.5)
            articleUrl = ele.xpath("./a/@href")[0]
            # If this article was already crawled, break (the rest of the list is older)
            if articleUrl in BeCrawledUrlList: break
            title = ele.xpath('./a/text()')[0]
            try:
                publicTime = parseQHRBArtPubTime(ele.xpath('./span/text()')[0])
            except Exception:
                print(url, '   ', title, '   publish-time string not found')
                continue  # publicTime would be unbound below; skip this article

            temp_dict = {'tags': ['期货日报'], 'score': 0, 'uid': UID()}
            temp_dict['title'] = title.strip()
            temp_dict['articleFrom'] = '期货日报'
            temp_dict['url'] = articleUrl.strip()
            temp_dict['publicTime'] = publicTime.strip()
            # Article body
            content = parseQHRBArtContent(articleUrl)
            temp_dict['content'] = content
            # Determine the futures product and sector the article belongs to
            n = parseContentToName(title + content)

            if n:
                temp_dict['product_name'] = n
                print(SPIDERNAME, '   ', title, "    ", n)
                temp_dict['group'] = ProductToGroup[n]
            else:
                print("……………………………… product name not found; possibly abnormal")

                temp_dict['product_name'] = ''
                temp_dict['group'] = ''

            temp_article_ls.append(temp_dict)

        # Note: this call stays inside the url loop, since temp_article_ls is reset per URL
        HandleTmpList(temp_article_ls, articleCol, SPIDERNAME)
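
All twelve spiders funnel their results through HandleTmpList, which is not shown in these examples. Below is a minimal sketch of what it plausibly does, assuming articleCol is a pymongo collection keyed by url and that batches are stored oldest-first; this is inferred from the call sites, not confirmed code:

def HandleTmpList(temp_article_ls, articleCol, spiderName):
    # Lists are scraped newest-first; reverse so older articles are inserted first
    for article in reversed(temp_article_ls):
        # Treat the url as the natural key and skip records already stored
        if articleCol.find_one({'url': article['url']}) is None:
            articleCol.insert_one(article)
    print(spiderName, 'stored', len(temp_article_ls), 'articles')
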
Example #2
def getMysteelArticleList(articleCol, BeCrawledUrlList):
    # Agricultural products
    mysteelFarmingUrl = 'https://news.mysteel.com/article/p-3816-------------1.html'
    # Non-ferrous metals
    mysteelNonferrousUrl = 'https://news.mysteel.com/article/p-2480-------------1.html'
    # Ferrous metals
    mysteelBlackmetalUrl = 'https://news.mysteel.com/article/p-3822-------------1.html'
    # Energy and chemicals
    EnergyAndChemical = 'https://news.mysteel.com/article/p-3823-------------1.html'

    temp_article_ls = []
    for url in (mysteelBlackmetalUrl, mysteelFarmingUrl, mysteelNonferrousUrl,
                EnergyAndChemical):
        r = requests.get(url)
        # r.encoding = 'gb2312'  # decodes correctly without any explicit setting
        selector = etree.HTML(r.text)
        eleList = selector.xpath("//ul[@id='news']/li")
        for ele in eleList:
            articleUrl = 'https:' + ele.xpath('./h3/a/@href')[0]
            # If this article was already crawled, break (the rest of the list is older)
            if articleUrl in BeCrawledUrlList: break
            title = ele.xpath('./h3/a/text()')[0]
            try:
                publicTime = parseMysteelArtPubTime(ele.xpath('./p/text()')[0])
            except Exception:
                print(url, '   ', title, '   publish-time string not found')
                continue  # publicTime would be unbound below; skip this article
            temp_dict = {'tags': ['mysteel'], 'score': 0, 'uid': UID()}
            temp_dict['title'] = title.strip()
            temp_dict['articleFrom'] = 'mysteel'
            temp_dict['url'] = articleUrl.strip()
            temp_dict['publicTime'] = publicTime.strip()
            # Article body
            content = parseMystellArtContent(articleUrl)
            temp_dict['content'] = content
            # Determine the futures product and sector the article belongs to
            n = parseContentToName(title + content)

            if n:
                temp_dict['product_name'] = n
                print(SPIDERNAME, '   ', title, '     ', n)
                temp_dict['group'] = ProductToGroup[n]
            else:
                print("……………………………… product name not found; possibly abnormal")

                temp_dict['product_name'] = ''
                temp_dict['group'] = ''

            temp_article_ls.append(temp_dict)

    # Note: this call sits outside the url loop (indentation matters)
    HandleTmpList(temp_article_ls, articleCol, '我的钢铁网')
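
The parse*ArtPubTime helpers are also defined elsewhere. Here is a hypothetical normalizer in the same spirit, assuming the list pages carry dates like '2019-12-05 10:30' or '2019年12月5日'; the accepted formats are guesses, not the project's actual ones:

import re
from datetime import datetime

def parsePubTimeSketch(raw):
    # Pull a date (and optional time) out of arbitrary label text and
    # normalize it to 'YYYY-MM-DD HH:MM'
    m = re.search(r'(\d{4})[-/年](\d{1,2})[-/月](\d{1,2})日?\s*(\d{1,2}:\d{2})?', raw)
    if m is None:
        raise ValueError('no timestamp found in %r' % raw)
    year, month, day, hm = m.groups()
    dt = datetime.strptime('%s-%s-%s %s' % (year, month, day, hm or '00:00'),
                           '%Y-%m-%d %H:%M')
    return dt.strftime('%Y-%m-%d %H:%M')
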
Example #3
def getJinrongjieArticleList(articleCol, BeCrawledUrlList):
    # Metals
    url1 = 'http://futures.jrj.com.cn/list/jszx.shtml'
    # Energy and chemicals
    url2 = 'http://futures.jrj.com.cn/list/nyhgzx.shtml'
    # Agricultural products
    url3 = 'http://futures.jrj.com.cn/list/ncpzx.shtml'

    temp_article_ls = []

    for url in (url1, url2, url3):
        r = requests.get(url)
        r.encoding = 'gbk'  # the page is GBK encoded
        selector = etree.HTML(r.text)
        eleList = selector.xpath("//ul[@class='jrj-l1 tab-ts jrj-f14']/li")
        for ele in eleList:
            try:
                articleUrl = ele.xpath('./label/a/@href')[0]
            except IndexError:
                print('empty list row, skipping')
                continue
            # If this article was already crawled, break (the rest of the list is older)
            if articleUrl in BeCrawledUrlList: break
            title = ele.xpath('./label/a/@title')[0]
            publicTime = parseJinrongjiePubTime(
                ele.xpath('./label/i/text()')[0])
            temp_dict = {'tags': ['Jinrongjie'], 'score': 0, 'uid': UID()}
            temp_dict['title'] = title.strip()
            temp_dict['articleFrom'] = 'Jinrongjie'
            temp_dict['url'] = articleUrl.strip()
            temp_dict['publicTime'] = publicTime.strip()
            # Article body
            content = parseJinrongjieContent(articleUrl)
            temp_dict['content'] = content
            # Determine the futures product and sector the article belongs to
            n = parseContentToName(title + content)
            if n:
                print(SPIDERNAME, '   ', title, "  ", n)
                temp_dict['product_name'] = n
                temp_dict['group'] = ProductToGroup[n]
            else:
                print("……………………………… product name not found; possibly abnormal")
                temp_dict['product_name'] = ''
                temp_dict['group'] = ''

            temp_article_ls.append(temp_dict)

    # Note: this call sits outside the url loop (indentation matters)
    HandleTmpList(temp_article_ls, articleCol, SPIDERNAME)
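
parseContentToName, used by every spider to tag an article with a futures product, is likewise external. A minimal sketch under the assumption that it returns the first entry of ProductNameTuple appearing in the combined title and body (the real version may well rank matches by frequency instead):

def parseContentToNameSketch(text):
    # Return the first product keyword found in the text, or None
    for name in ProductNameTuple:
        if name in text:
            return name
    return None
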
Example #4
def getSMMArticleList(articleCol, BeCrawledUrlList):
    # Top news
    url1 = 'https://news.smm.cn'

    # Record shape: [{'title': '', 'url': '', 'publicTime': '', 'tags': [], 'score': 0}, ...]
    # BeCrawledUrlList holds the urls crawled on previous runs

    temp_article_ls = []
    for url in (url1, ):
        r = requests.get(url)
        r.encoding = 'utf8'
        selector = etree.HTML(r.text)
        eleList = selector.cssselect(".news-main-list>ul>li")
        for ele in eleList:
            articleUrl = 'https://news.smm.cn' + ele.xpath('./div/a/@href')[0]
            # If this article was already crawled, break (the rest of the list is older)
            if articleUrl in BeCrawledUrlList: break
            title = ele.xpath('./div/a/h3/@title')[0]
            # print(title)
            # continue
            publicTime = parseSMMArtPubTime(
                ele.xpath(
                    './div/div[@class="news-list-content-label"]/p/label[@class="news-list-time-label"]/text()'
                )[0])
            temp_dict = {'tags': ['SMM'], 'score': 0, 'uid': UID()}
            temp_dict['title'] = title.strip()
            temp_dict['articleFrom'] = 'SMM'
            temp_dict['url'] = articleUrl.strip()
            temp_dict['publicTime'] = publicTime.strip()
            # Article body
            content = parseSMMArtContent(articleUrl)
            temp_dict['content'] = content
            # Determine the futures product and sector the article belongs to
            n = parseContentToName(title + content)
            if n:
                print(SPIDERNAME, '   ', title, "    ", n, '   ', publicTime)
                temp_dict['product_name'] = n
                temp_dict['group'] = ProductToGroup[n]
            else:
                print("……………………………… product name not found; possibly abnormal")
                temp_dict['product_name'] = ''
                temp_dict['group'] = ''

            temp_article_ls.append(temp_dict)

    # Note: this call sits outside the url loop (indentation matters)
    HandleTmpList(temp_article_ls, articleCol, SPIDERNAME)
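
UID() is another shared helper that never appears in these snippets. One plausible implementation is simply a random hex identifier; this is an assumption, not the project's code:

import uuid

def UID():
    # Hypothetical: a 32-character random hex string as the document id
    return uuid.uuid4().hex
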
Example #5
def getEastMoneyArticleList(articleCol, BeCrawledUrlList):
    url_1 = 'http://futures.eastmoney.com/a/cqhdd.html'  # Futures digest
    url_2 = 'http://futures.eastmoney.com/news/cjdgc.html'  # Focus watch
    url_3 = 'http://futures.eastmoney.com/news/cqspl.html'  # Domestic market commentary
    url_4 = 'http://futures.eastmoney.com/news/cwpsd.html'  # Overseas market express
    url_5 = 'http://futures.eastmoney.com/news/cqsyw.html'  # Futures market focus

    temp_article_ls = []

    for url in (url_1, url_2, url_3, url_4, url_5):
        r = requests.get(url)
        # r.encoding = 'gb2312'
        selector = etree.HTML(r.text)
        eleList = selector.xpath(
            "//ul[@id='newsListContent']/li/div[@class='text']")
        for ele in eleList:
            articleUrl = ele.xpath("./p[@class='title']/a/@href")[0]
            # If this article was already crawled, break (the rest of the list is older)
            if articleUrl in BeCrawledUrlList: break
            title = ele.xpath("./p[@class='title']/a/text()")[0].strip()
            publicTime = parseEastMoneyPubTime(
                ele.xpath("./p[@class='time']/text()")[0])
            temp_dict = {'tags': ['eastmoney'], 'score': 0, 'uid': UID()}
            temp_dict['title'] = title.strip()
            temp_dict['articleFrom'] = 'eastmoney'
            temp_dict['url'] = articleUrl.strip()
            temp_dict['publicTime'] = publicTime.strip()
            # Article body
            content = parseEastMoneyContent(articleUrl)
            temp_dict['content'] = content
            # Determine the futures product and sector the article belongs to
            n = parseContentToName(title + content)
            if n:
                print(SPIDERNAME, '   ', title, '     ', n)
                temp_dict['product_name'] = n
                temp_dict['group'] = ProductToGroup[n]
            else:
                print("……………………………… product name not found; possibly abnormal")
                temp_dict['product_name'] = ''
                temp_dict['group'] = ''
            temp_article_ls.append(temp_dict)

    # Note: this call sits outside the url loop (indentation matters)
    HandleTmpList(temp_article_ls, articleCol, '东方财富')
Example #6
def getSinaArticleList(articleCol, BeCrawledUrlList):
    sinaFarmingProductUrl = 'http://finance.sina.com.cn/roll/index.d.html?lid=1006'
    sinaIndustryProductUrl = 'http://finance.sina.com.cn/roll/index.d.html?lid=1005'
    sinaEnergyProductUrl = 'http://finance.sina.com.cn/roll/index.d.html?lid=1007'

    # Record shape: [{'title': '', 'url': '', 'publicTime': '', 'tags': [], 'score': 0}, ...]
    # BeCrawledUrlList holds the urls crawled on previous runs

    temp_article_ls = []
    for url in (sinaEnergyProductUrl, sinaFarmingProductUrl,
                sinaIndustryProductUrl):
        r = requests.get(url)
        r.encoding = 'utf8'
        selector = etree.HTML(r.text)
        eleList = selector.xpath("//ul[@class='list_009']/li")
        for ele in eleList:
            articleUrl = ele.xpath('./a/@href')[0]
            # If this article was already crawled, break (the rest of the list is older)
            if articleUrl in BeCrawledUrlList: break
            title = ele.xpath('./a/text()')[0]
            publicTime = parseSinaArtPubTime(ele.xpath('./span/text()')[0])
            temp_dict = {'tags': ['sina'], 'score': 0, 'uid': UID()}
            temp_dict['title'] = title.strip()
            temp_dict['articleFrom'] = 'sina'
            temp_dict['url'] = articleUrl.strip()
            temp_dict['publicTime'] = publicTime.strip()
            # Article body
            content = parseSinaArtContent(articleUrl)
            temp_dict['content'] = content
            # Determine the futures product and sector the article belongs to
            n = parseContentToName(title + content)
            if n:
                print(SPIDERNAME, '   ', title, '     ', n)
                temp_dict['product_name'] = n
                temp_dict['group'] = ProductToGroup[n]
            else:
                print("……………………………… product name not found; possibly abnormal")
                temp_dict['product_name'] = ''
                temp_dict['group'] = ''

            temp_article_ls.append(temp_dict)

    # Note: this call sits outside the url loop (indentation matters)
    HandleTmpList(temp_article_ls, articleCol, '新浪期货')
Example #7
def getJinTouArticleLs(articleCol, BeCrawledUrlList):
    url = 'https://futures.cngold.org/zhzx/'

    temp_article_ls = []
    for url in (url, ):
        r = requests.get(url)
        r.encoding = 'utf-8'
        selector = etree.HTML(r.text)
        # Each item in the list
        eleList = selector.cssselect(".list_article ul li")
        for ele in eleList:
            articleUrl = ele.xpath("./div[@class='tit']/a/@href")[0]
            if articleUrl in BeCrawledUrlList: break
            title = ele.xpath("./div[@class='tit']/a/text()")[0]
            #btm clearfix
            publicTime = parseJinTouTimeStr(
                ele.xpath(
                    "./div[@class='btm clearfix']/span[@class='pubtime']/text()"
                )[0])
            temp_dict = {'tags': ['cngold'], 'score': 0, 'uid': UID()}
            temp_dict['title'] = title.strip()
            temp_dict['articleFrom'] = 'cngold'
            temp_dict['url'] = articleUrl.strip()
            temp_dict['publicTime'] = publicTime.strip()
            # Article body
            content = parseCnGolgContent(articleUrl)
            temp_dict['content'] = content
            # Determine the futures product and sector the article belongs to
            n = parseContentToName(title + content)
            if n:
                print(SPIDERNAME, '   ', title, "  ", n)
                temp_dict['product_name'] = n
                # Map the product to its sector group
                temp_dict['group'] = ProductToGroup[n]
            else:
                print("……………………………… product name not found; possibly abnormal")
                temp_dict['product_name'] = ''
                temp_dict['group'] = ''
            temp_article_ls.append(temp_dict)

    # Note: this call sits outside the url loop (indentation matters)
    HandleTmpList(temp_article_ls, articleCol, '金投网')
Example #8
def getAskCiArticleList(articleCol, BeCrawledUrlList):
    Url = 'http://www.askci.com/news/chanye/'

    temp_article_ls = []

    for url in (Url, ):
        r = requests.get(url)
        r.encoding = 'utf8'  # ensure the response decodes as UTF-8
        selector = etree.HTML(r.text)
        eleList = selector.cssselect(".list_box1 ul li")
        for ele in eleList:
            articleUrl = ele.xpath('./a/@href')[0]
            # If this article was already crawled, break (the rest of the list is older)
            if articleUrl in BeCrawledUrlList: break
            title = ele.xpath('./a/@title')[0]
            publicTime = ele.xpath('./div/div/div[@class="list_box1_time"]/text()')[0]
            temp_dict = {'tags': ['AskCi'], 'score': 0, 'uid': UID()}
            temp_dict['title'] = title.strip()
            temp_dict['articleFrom'] = 'AskCi'
            temp_dict['url'] = articleUrl.strip()
            temp_dict['publicTime'] = publicTime.strip()
            # Article body
            content = parseAskCiContent(articleUrl)
            temp_dict['content'] = content
            # Determine the futures product and sector the article belongs to
            n = parseContentToName(title + content)
            if n:
                print(SPIDERNAME, '   ', title, "  ", n)
                temp_dict['product_name'] = n
                temp_dict['group'] = ProductToGroup[n]
                # Articles whose product cannot be determined are skipped entirely
                temp_article_ls.append(temp_dict)
            else:
                print("……………………………… product name not found; possibly abnormal")

    # Note: this call sits outside the url loop (indentation matters)
    HandleTmpList(temp_article_ls, articleCol, SPIDERNAME)
Example #9
def getChinaGrainArticleList(articleCol, BeCrawledUrlList):
    Url = 'http://www.chinagrain.cn/analytics/'

    temp_article_ls = []

    for url in (Url, ):
        r = requests.get(url)
        # r.encoding = 'gb2312'  # decodes correctly without any explicit setting
        selector = etree.HTML(r.text)
        eleList = selector.xpath("//ul[@id='list']/li")
        for ele in eleList:
            articleUrl = ele.xpath('./a/@href')[0]
            # If this article was already crawled, break (the rest of the list is older)
            if articleUrl in BeCrawledUrlList: break
            title = ele.xpath('./a/h2/text()')[0]
            publicTime = parseChinaGrainPubTime(
                ele.xpath('./span[2]/text()')[0])
            temp_dict = {'tags': ['chinagrain'], 'score': 0, 'uid': UID()}
            temp_dict['title'] = title.strip()
            temp_dict['articleFrom'] = 'chinagrain'
            temp_dict['url'] = articleUrl.strip()
            temp_dict['publicTime'] = publicTime.strip()
            # Article body
            content = parseChinaGrainContent(articleUrl)
            temp_dict['content'] = content
            # Determine the futures product and sector the article belongs to
            n = parseContentToName(title + content)
            if n:
                print(SPIDERNAME, '   ', title, '     ', n)
                temp_dict['product_name'] = n
                temp_dict['group'] = ProductToGroup[n]
            else:
                print("……………………………… product name not found; possibly abnormal")
                temp_dict['product_name'] = ''
                temp_dict['group'] = ''

            temp_article_ls.append(temp_dict)

    # Note: this call sits outside the url loop (indentation matters)
    HandleTmpList(temp_article_ls, articleCol, '中国粮油信息网')
Example #10
def getYunkenArticleList(articleCol, BeCrawledUrlList):
    yunkenUrl = 'https://www.yunken.com/?cat=7'

    temp_article_ls = []
    for url in (yunkenUrl, ):
        r = requests.get(url)
        r.encoding = 'utf8'
        selector = etree.HTML(r.text)
        eleList = selector.xpath("//section[2]//article/header")
        for ele in eleList:
            articleUrl = ele.xpath('./h3/a/@href')[0]
            # If this article was already crawled, break (the rest of the list is older)
            if articleUrl in BeCrawledUrlList: break
            title = ele.xpath('./h3/a/text()')[0]
            publicTime = parseYunkenArtPubTime(
                ele.xpath('./div/time/@datetime')[0])
            temp_dict = {'tags': ['天然橡胶网', '橡胶'], 'score': 0, 'uid': UID()}
            temp_dict['title'] = title.strip()
            temp_dict['articleFrom'] = 'yunken'
            temp_dict['url'] = articleUrl.strip()
            temp_dict['publicTime'] = publicTime.strip()
            # Article body
            content = parseYunkenArtContent(articleUrl)
            temp_dict['content'] = content
            # Determine the futures product and sector the article belongs to
            n = parseContentToName(title + content)
            if n:
                print(SPIDERNAME, '   ', title, '     ', n)
                temp_dict['product_name'] = n
                temp_dict['group'] = ProductToGroup[n]
            else:
                print("……………………………… product name not found; possibly abnormal")
                temp_dict['product_name'] = ''
                temp_dict['group'] = ''

            temp_article_ls.append(temp_dict)
    # Note: this call sits outside the url loop (indentation matters)
    HandleTmpList(temp_article_ls, articleCol, '天然橡胶网')
Example #11
def getHexunArticleList(articleCol, BeCrawledUrlList):
    # Agricultural and sideline products
    hexunFarmingReqID = '101065616'
    # Non-ferrous metals
    hexunMetalReqID = '101065619'
    # Energy
    hexunEnergyReqID = '130519488'
    # Chemicals
    hexunChemicalReqID = '130518597'

    temp_article_ls = []
    header = {
        'Referer':
        'http://futures.hexun.com/agriculturenews/',
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    }
    url = 'http://open.tool.hexun.com/MongodbNewsService/newsListPageByJson.jsp'
    for ID in (hexunFarmingReqID, hexunMetalReqID, hexunEnergyReqID,
               hexunChemicalReqID):
        param = {
            'id': ID,
            's': '30',
            'cp': '1',
            'priority': '0',
            'callback': 'hx_json1%d' % (int(time.time() * 1000))
        }
        r = requests.get(url, params=param, headers=header)
        r.encoding = 'gb2312'  # the response body is GB2312 encoded
        # print(r.text)
        # break
        # selecattor=etree.HTML(r.text)
        # eleList=selector.cssselect("div#temp01 ul li")
        # print('123',eleList)
        # The response is JSONP (wrapped in the hx_json... callback); extract the JSON body
        article_ls = json.loads(re.search('{.+}', r.text).group())['result']
        for item in article_ls:
            articleUrl = item['entityurl']
            # If this article was already crawled, break (the rest of the list is older)
            if articleUrl in BeCrawledUrlList: break
            title = item['title']
            publicTime = parseHexunArtPubTime(item['entitytime'])
            temp_dict = {'tags': ['hexun'], 'score': 0, 'uid': UID()}
            temp_dict['title'] = title.strip()
            temp_dict['articleFrom'] = 'hexun'
            temp_dict['url'] = articleUrl.strip()
            temp_dict['publicTime'] = publicTime.strip()
            # Article body
            content = parseHexunContent(articleUrl)
            temp_dict['content'] = content
            # Determine the futures product and sector the article belongs to
            n = parseContentToName(title + content)

            if n:
                print(SPIDERNAME, '   ', title, '     ', n)
                temp_dict['product_name'] = n
                temp_dict['group'] = ProductToGroup[n]
            else:
                print("……………………………… product name not found; possibly abnormal")
                temp_dict['product_name'] = ''
                temp_dict['group'] = ''
            # print(temp_dict)
            temp_article_ls.append(temp_dict)

    # Note: this call sits outside the request loop (indentation matters)
    HandleTmpList(temp_article_ls, articleCol, '和讯财经')
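
The Hexun endpoint replies with JSONP (the hx_json... callback requested in param), which is why the code above regex-extracts the {...} body before json.loads. The same unwrapping step as a slightly more defensive standalone helper; strip_jsonp is hypothetical, not part of the original code:

import json
import re

def strip_jsonp(text):
    # Grab everything between the outermost braces of callback({...})
    match = re.search(r'\{.*\}', text, re.S)
    if match is None:
        raise ValueError('no JSON object in JSONP response')
    return json.loads(match.group())
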
Example #12
def getTouTiaoArticleLs(articleCol, BeCrawledUrlList):
    sess = requests.session()
    searchUrl = 'https://www.toutiao.com/api/search/content/'
    temp_article_ls = []
    for productName in ProductNameTuple:
        param = {
            'aid': '24',
            'app_name': 'web_search',
            'offset': '20',
            'format': 'json',
            'keyword': productName,
            'autoload': 'true',
            'count': '20',
            'en_qc': '1',
            'cur_tab': '1',
            'from': 'search_tab',
            'pd': 'synthesis',
            'timestamp': str(int(time.time() * 1000))
        }
        sess.headers = {
            'cookie':
            'tt_webid=6767227851205821960; WEATHER_CITY={}; tt_webid=6767227851205821960; csrftoken=a36cb44b8e05ea4ad645dff6911d86cd; s_v_web_id=7e37f07e972f8c6f1b40a031bb6da223; __tasessionId=vyaf0l09q{}'
            .format(parse.quote('武汉'), str(int(time.time() * 1000))),
            'referer':
            'https://www.toutiao.com/search/?keyword={}'.format(
                parse.quote(productName)),
            'user-agent':
            'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36'
        }
        r = sess.get(searchUrl, params=param)
        temp_ls = r.json()['data']
        for item in temp_ls:
            try:
                articleUrl = 'https://www.toutiao.com/a%s/' % item['item_id']
            except KeyError as e:
                print(e)
                print('field missing, possibly malformed; skipping')
                continue

            if articleUrl in BeCrawledUrlList: continue
            temp_dict = {'tags': ['toutiao'], 'score': 0, 'uid': UID()}
            title = item['title']
            temp_dict['title'] = title.strip()
            temp_dict['articleFrom'] = 'toutiao'
            temp_dict['url'] = articleUrl.strip()
            publicTime = parseTouTiaoArtPubTime(item['publish_time'])
            temp_dict['publicTime'] = publicTime.strip()
            # Article body
            content = parseTouTiaoArtContent(articleUrl, sess)
            temp_dict['content'] = content
            # Determine the futures product and sector the article belongs to
            n = parseContentToName(title + content)
            if n:
                print(SPIDERNAME, '   ', title, "    ", n)
                temp_dict['product_name'] = n
                temp_dict['group'] = ProductToGroup[n]
            else:
                print("………………………………未找到品种名称,可能异常")
                temp_dict['product_name'] = ''
                temp_dict['group'] = ''

            temp_article_ls.append(temp_dict)

    # Note: this call sits outside the productName loop (indentation matters)
    HandleTmpList(temp_article_ls, articleCol, '今日头条')
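
None of the examples show how the spiders are driven. Below is a minimal runner sketch, assuming storage in MongoDB via pymongo and that BeCrawledUrlList is just the set of urls already stored; the database and collection names are placeholders:

from pymongo import MongoClient

def run_all_spiders():
    client = MongoClient('localhost', 27017)
    articleCol = client['futures_news']['articles']  # placeholder names
    # URLs stored on earlier runs; each spider stops a list scan on the first hit
    BeCrawledUrlList = set(doc['url'] for doc in articleCol.find({}, {'url': 1}))
    for spider in (getQHRBArticleList, getMysteelArticleList,
                   getJinrongjieArticleList, getSMMArticleList,
                   getEastMoneyArticleList, getSinaArticleList,
                   getJinTouArticleLs, getAskCiArticleList,
                   getChinaGrainArticleList, getYunkenArticleList,
                   getHexunArticleList, getTouTiaoArticleLs):
        try:
            spider(articleCol, BeCrawledUrlList)
        except Exception as e:
            # One failing site should not abort the whole run
            print(spider.__name__, 'failed:', e)

if __name__ == '__main__':
    run_all_spiders()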