def getQHRBArticleList(articleCol, BeCrawledUrlList):
    """Crawl article lists from qhrb.com.cn (期货日报) section pages.

    For each of four section pages, collects every not-yet-crawled
    article's url/title/publish-time/body, tags it with a futures product
    name and group, and flushes the batch per section via HandleTmpList.

    :param articleCol: storage collection handle forwarded to HandleTmpList
    :param BeCrawledUrlList: urls already crawled; list pages are assumed
        newest-first, so hitting a known url stops the current section scan
    """
    url1 = 'http://www.qhrb.com.cn/farm/'          # agricultural products
    url2 = 'http://www.qhrb.com.cn/metal/'         # metals
    url3 = 'http://www.qhrb.com.cn/energy/'        # energy & chemicals
    url4 = 'http://www.qhrb.com.cn/comment/scbg/'  # market reports
    for url in (url1, url2, url3, url4):
        r = requests.get(url)
        r.encoding = 'utf8'  # site serves utf-8
        selector = etree.HTML(r.text)
        # several years of items sit on one page; cap the scan
        eleList = selector.cssselect(".list-point li.item")[:600]
        temp_article_ls = []
        for ele in eleList:
            articleUrl = ele.xpath("./a/@href")[0]
            # already crawled -> everything older was crawled too; stop
            if articleUrl in BeCrawledUrlList:
                break
            title = ele.xpath('./a/text()')[0]
            try:
                publicTime = parseQHRBArtPubTime(ele.xpath('./span/text()')[0])
            except Exception:
                # FIX: original bare except fell through with `publicTime`
                # unbound (NameError) or stale from the previous item —
                # skip articles whose timestamp cannot be parsed.
                print(url, ' ', title, ' 找不到时间字符串')
                continue
            temp_dict = {'tags': ['期货日报'], 'score': 0, 'uid': UID()}
            temp_dict['title'] = title.strip()
            temp_dict['articleFrom'] = '期货日报'
            temp_dict['url'] = articleUrl.strip()
            temp_dict['publicTime'] = publicTime.strip()
            # fetch the full article body
            content = parseQHRBArtContent(articleUrl)
            temp_dict['content'] = content
            # map the article to a futures product name and its group
            n = parseContentToName(title + content)
            if n:
                temp_dict['product_name'] = n
                print(SPIDERNAME, ' ', title, " ", n)
                temp_dict['group'] = ProductToGroup[n]
            else:
                print("………………………………未找到品种名称,可能异常")
                temp_dict['product_name'] = ''
                temp_dict['group'] = ''
            temp_article_ls.append(temp_dict)
        # per-section flush (temp_article_ls is re-initialized per section)
        HandleTmpList(temp_article_ls, articleCol, SPIDERNAME)
def getMysteelArticleList(articleCol, BeCrawledUrlList):
    """Crawl article lists from news.mysteel.com (我的钢铁网).

    Scans four section list pages (black metals, farming, nonferrous,
    energy & chemicals), collects each new article's url/title/publish
    time/body, classifies it by futures product, and hands the combined
    batch to HandleTmpList for storage.

    :param articleCol: storage collection handle forwarded to HandleTmpList
    :param BeCrawledUrlList: urls already crawled; hitting one stops the
        current section scan (list pages are newest-first)
    """
    mysteelFarmingUrl = 'https://news.mysteel.com/article/p-3816-------------1.html'
    # nonferrous metals
    mysteelNonferrousUrl = 'https://news.mysteel.com/article/p-2480-------------1.html'
    mysteelBlackmetalUrl = 'https://news.mysteel.com/article/p-3822-------------1.html'
    # energy & chemicals
    EnergyAndChemical = 'https://news.mysteel.com/article/p-3823-------------1.html'
    temp_article_ls = []
    for url in (mysteelBlackmetalUrl, mysteelFarmingUrl, mysteelNonferrousUrl,
                EnergyAndChemical):
        r = requests.get(url)
        # requests' default encoding detection is correct for this site
        selector = etree.HTML(r.text)
        eleList = selector.xpath("//ul[@id='news']/li")
        for ele in eleList:
            # hrefs are protocol-relative on this site
            articleUrl = 'https:' + ele.xpath('./h3/a/@href')[0]
            # already crawled -> everything older was crawled too; stop
            if articleUrl in BeCrawledUrlList:
                break
            title = ele.xpath('./h3/a/text()')[0]
            try:
                publicTime = parseMysteelArtPubTime(ele.xpath('./p/text()')[0])
            except Exception:
                # FIX: original bare except fell through with `publicTime`
                # unbound (NameError) or stale from the previous item —
                # skip articles whose timestamp cannot be parsed.
                print(url, ' ', title, ' 找不到时间字符串')
                continue
            temp_dict = {'tags': ['mysteel'], 'score': 0, 'uid': UID()}
            temp_dict['title'] = title.strip()
            temp_dict['articleFrom'] = 'mysteel'
            temp_dict['url'] = articleUrl.strip()
            temp_dict['publicTime'] = publicTime.strip()
            # fetch the full article body
            content = parseMystellArtContent(articleUrl)
            temp_dict['content'] = content
            # map the article to a futures product name and its group
            n = parseContentToName(title + content)
            if n:
                temp_dict['product_name'] = n
                # NOTE(review): `SPIERNAME` looks like a typo of the
                # `SPIDERNAME` used by sibling spiders — confirm which
                # module-level constant actually exists before renaming.
                print(SPIERNAME, ' ', title, ' ', n)
                temp_dict['group'] = ProductToGroup[n]
            else:
                print("………………………………未找到品种名称,可能异常")
                temp_dict['product_name'] = ''
                temp_dict['group'] = ''
            temp_article_ls.append(temp_dict)
    HandleTmpList(temp_article_ls, articleCol, '我的钢铁网')
def getJinrongjieArticleList(articleCol, BeCrawledUrlList):
    """Crawl futures news lists from futures.jrj.com.cn (金融界).

    Walks the metals, energy/chemicals and agricultural section pages,
    builds one record per not-yet-crawled article (url, title, publish
    time, body, product classification) and stores the combined batch
    through HandleTmpList.
    """
    section_urls = (
        'http://futures.jrj.com.cn/list/jszx.shtml',    # metals
        'http://futures.jrj.com.cn/list/nyhgzx.shtml',  # energy & chemicals
        'http://futures.jrj.com.cn/list/ncpzx.shtml',   # agricultural
    )
    collected = []
    for page_url in section_urls:
        resp = requests.get(page_url)
        resp.encoding = 'gbk'  # site serves gbk
        page = etree.HTML(resp.text)
        for node in page.xpath("//ul[@class='jrj-l1 tab-ts jrj-f14']/li"):
            try:
                link = node.xpath('./label/a/@href')[0]
            except IndexError:
                # separator/empty rows carry no anchor
                print('跳过空行')
                continue
            # a known url means the rest of the page is old news
            if link in BeCrawledUrlList:
                break
            headline = node.xpath('./label/a/@title')[0]
            when = parseJinrongjiePubTime(node.xpath('./label/i/text()')[0])
            body = parseJinrongjieContent(link)
            record = {
                'tags': ['Jinrongjie'],
                'score': 0,
                'uid': UID(),
                'title': headline.strip(),
                'articleFrom': 'Jinrongjie',
                'url': link.strip(),
                'publicTime': when.strip(),
                'content': body,
            }
            # classify by futures product (raw headline, as stored text)
            product = parseContentToName(headline + body)
            if product:
                print(SPIDERNAME, ' ', headline, " ", product)
                record['product_name'] = product
                record['group'] = ProductToGroup[product]
            else:
                print("………………………………未找到品种名称,可能异常")
                record['product_name'] = ''
                record['group'] = ''
            collected.append(record)
    HandleTmpList(collected, articleCol, SPIDERNAME)
def getSMMArticleList(articleCol, BeCrawledUrlList):
    """Crawl the headline news list from news.smm.cn (SMM).

    Builds one record per not-yet-crawled article (url, title, publish
    time, body, product classification) and stores the batch through
    HandleTmpList.
    """
    base = 'https://news.smm.cn'
    collected = []
    resp = requests.get(base)
    resp.encoding = 'utf8'
    page = etree.HTML(resp.text)
    for node in page.cssselect(".news-main-list>ul>li"):
        # hrefs are site-relative; prefix the host
        link = base + node.xpath('./div/a/@href')[0]
        # a known url means the rest of the list is old news
        if link in BeCrawledUrlList:
            break
        headline = node.xpath('./div/a/h3/@title')[0]
        when = parseSMMArtPubTime(node.xpath(
            './div/div[@class="news-list-content-label"]/p/label[@class="news-list-time-label"]/text()'
        )[0])
        body = parseSMMArtContent(link)
        record = {
            'tags': ['SMM'],
            'score': 0,
            'uid': UID(),
            'title': headline.strip(),
            'articleFrom': 'SMM',
            'url': link.strip(),
            'publicTime': when.strip(),
            'content': body,
        }
        # classify by futures product (raw headline, as stored text)
        product = parseContentToName(headline + body)
        if product:
            print(SPIDERNAME, ' ', headline, " ", product, ' ', when)
            record['product_name'] = product
            record['group'] = ProductToGroup[product]
        else:
            print("………………………………未找到品种名称,可能异常")
            record['product_name'] = ''
            record['group'] = ''
        collected.append(record)
    HandleTmpList(collected, articleCol, SPIDERNAME)
def getEastMoneyArticleList(articleCol, BeCrawledUrlList):
    """Crawl futures news lists from futures.eastmoney.com (东方财富).

    Walks five section pages, builds one record per not-yet-crawled
    article (url, title, publish time, body, product classification) and
    stores the combined batch through HandleTmpList.
    """
    section_urls = (
        'http://futures.eastmoney.com/a/cqhdd.html',     # 期货导读
        'http://futures.eastmoney.com/news/cjdgc.html',  # 焦点观察
        'http://futures.eastmoney.com/news/cqspl.html',  # 内盘评论
        'http://futures.eastmoney.com/news/cwpsd.html',  # 外盘速递
        'http://futures.eastmoney.com/news/cqsyw.html',  # 期市聚焦
    )
    collected = []
    for page_url in section_urls:
        page = etree.HTML(requests.get(page_url).text)
        for node in page.xpath("//ul[@id='newsListContent']/li/div[@class='text']"):
            link = node.xpath("./p[@class='title']/a/@href")[0]
            # a known url means the rest of the page is old news
            if link in BeCrawledUrlList:
                break
            headline = node.xpath("./p[@class='title']/a/text()")[0].strip()
            when = parseEastMoneyPubTime(node.xpath("./p[@class='time']/text()")[0])
            body = parseEastMoneyContent(link)
            record = {
                'tags': ['eastmoney'],
                'score': 0,
                'uid': UID(),
                'title': headline,
                'articleFrom': 'eastmoney',
                'url': link.strip(),
                'publicTime': when.strip(),
                'content': body,
            }
            # classify by futures product (headline stripped above)
            product = parseContentToName(headline + body)
            if product:
                print(SPIERNAME, ' ', headline, ' ', product)
                record['product_name'] = product
                record['group'] = ProductToGroup[product]
            else:
                print("………………………………未找到品种名称,可能异常")
                record['product_name'] = ''
                record['group'] = ''
            collected.append(record)
    HandleTmpList(collected, articleCol, '东方财富')
def getSinaArticleList(articleCol, BeCrawledUrlList):
    """Crawl futures news roll pages from finance.sina.com.cn (新浪期货).

    Walks the energy, farming and industry roll pages, builds one record
    per not-yet-crawled article and stores the combined batch through
    HandleTmpList.
    """
    roll_pages = (
        'http://finance.sina.com.cn/roll/index.d.html?lid=1007',  # energy
        'http://finance.sina.com.cn/roll/index.d.html?lid=1006',  # farming
        'http://finance.sina.com.cn/roll/index.d.html?lid=1005',  # industry
    )
    collected = []
    for page_url in roll_pages:
        resp = requests.get(page_url)
        resp.encoding = 'utf8'
        page = etree.HTML(resp.text)
        for node in page.xpath("//ul[@class='list_009']/li"):
            link = node.xpath('./a/@href')[0]
            # a known url means the rest of the page is old news
            if link in BeCrawledUrlList:
                break
            headline = node.xpath('./a/text()')[0]
            when = parseSinaArtPubTime(node.xpath('./span/text()')[0])
            body = parseSinaArtContent(link)
            record = {
                'tags': ['sina'],
                'score': 0,
                'uid': UID(),
                'title': headline.strip(),
                'articleFrom': 'sina',
                'url': link.strip(),
                'publicTime': when.strip(),
                'content': body,
            }
            # classify by futures product (raw headline, as stored text)
            product = parseContentToName(headline + body)
            if product:
                print(SPIERNAME, ' ', headline, ' ', product)
                record['product_name'] = product
                record['group'] = ProductToGroup[product]
            else:
                print("………………………………未找到品种名称,可能异常")
                record['product_name'] = ''
                record['group'] = ''
            collected.append(record)
    HandleTmpList(collected, articleCol, '新浪期货')
def getJinTouArticleLs(articleCol, BeCrawledUrlList):
    """Crawl the combined news list from futures.cngold.org (金投网).

    Builds one record per not-yet-crawled article (url, title, publish
    time, body, product classification) and stores the batch through
    HandleTmpList.
    """
    list_url = 'https://futures.cngold.org/zhzx/'
    collected = []
    resp = requests.get(list_url)
    resp.encoding = 'utf-8'
    page = etree.HTML(resp.text)
    # one entry per list item
    for node in page.cssselect(".list_article ul li"):
        link = node.xpath("./div[@class='tit']/a/@href")[0]
        # a known url means the rest of the list is old news
        if link in BeCrawledUrlList:
            break
        headline = node.xpath("./div[@class='tit']/a/text()")[0]
        when = parseJinTouTimeStr(node.xpath(
            "./div[@class='btm clearfix']/span[@class='pubtime']/text()")[0])
        body = parseCnGolgContent(link)
        record = {
            'tags': ['cngold'],
            'score': 0,
            'uid': UID(),
            'title': headline.strip(),
            'articleFrom': 'cngold',
            'url': link.strip(),
            'publicTime': when.strip(),
            'content': body,
        }
        # classify by futures product (raw headline, as stored text)
        product = parseContentToName(headline + body)
        if product:
            print(SPIDERNAME, ' ', headline, " ", product)
            record['product_name'] = product
            # product name maps to its futures group
            record['group'] = ProductToGroup[product]
        else:
            print("………………………………未找到品种名称,可能异常")
            record['product_name'] = ''
            record['group'] = ''
        collected.append(record)
    HandleTmpList(collected, articleCol, '金投网')
def getAskCiArticleList(articleCol, BeCrawledUrlList):
    """Crawl the industry news list from askci.com (中商产业研究院).

    Unlike sibling spiders, articles that cannot be mapped to a futures
    product are dropped entirely rather than stored with empty fields.
    """
    collected = []
    resp = requests.get('http://www.askci.com/news/chanye/')
    resp.encoding = 'utf8'
    page = etree.HTML(resp.text)
    for node in page.cssselect(".list_box1 ul li"):
        link = node.xpath('./a/@href')[0]
        # a known url means the rest of the list is old news
        if link in BeCrawledUrlList:
            break
        headline = node.xpath('./a/@title')[0]
        when = node.xpath('./div/div/div[@class="list_box1_time"]/text()')[0]
        body = parseAskCiContent(link)
        record = {
            'tags': ['AskCi'],
            'score': 0,
            'uid': UID(),
            'title': headline.strip(),
            'articleFrom': 'AskCi',
            'url': link.strip(),
            'publicTime': when.strip(),
            'content': body,
        }
        # classify by futures product (raw headline, as stored text)
        product = parseContentToName(headline + body)
        if product:
            print(SPIDERNAME, ' ', headline, " ", product)
            record['product_name'] = product
            record['group'] = ProductToGroup[product]
            # keep only articles with a recognized product
            collected.append(record)
        else:
            print("………………………………未找到品种名称,可能异常")
    HandleTmpList(collected, articleCol, SPIDERNAME)
def getChinaGrainArticleList(articleCol, BeCrawledUrlList):
    """Crawl the analytics article list from chinagrain.cn (中国粮油信息网).

    Builds one record per not-yet-crawled article (url, title, publish
    time, body, product classification) and stores the batch through
    HandleTmpList.
    """
    collected = []
    resp = requests.get('http://www.chinagrain.cn/analytics/')
    # requests' default encoding detection is correct for this site
    page = etree.HTML(resp.text)
    for node in page.xpath("//ul[@id='list']/li"):
        link = node.xpath('./a/@href')[0]
        # a known url means the rest of the list is old news
        if link in BeCrawledUrlList:
            break
        headline = node.xpath('./a/h2/text()')[0]
        when = parseChinaGrainPubTime(node.xpath('./span[2]/text()')[0])
        body = parseChinaGrainContent(link)
        record = {
            'tags': ['chinagrain'],
            'score': 0,
            'uid': UID(),
            'title': headline.strip(),
            'articleFrom': 'chinagrain',
            'url': link.strip(),
            'publicTime': when.strip(),
            'content': body,
        }
        # classify by futures product (raw headline, as stored text)
        product = parseContentToName(headline + body)
        if product:
            print(SPIERNAME, ' ', headline, ' ', product)
            record['product_name'] = product
            record['group'] = ProductToGroup[product]
        else:
            print("………………………………未找到品种名称,可能异常")
            record['product_name'] = ''
            record['group'] = ''
        collected.append(record)
    HandleTmpList(collected, articleCol, '中国粮油信息网')
def getYunkenArticleList(articleCol, BeCrawledUrlList):
    """Crawl the rubber news category from yunken.com (天然橡胶网).

    Builds one record per not-yet-crawled article (url, title, publish
    time, body, product classification) and stores the batch through
    HandleTmpList.
    """
    collected = []
    resp = requests.get('https://www.yunken.com/?cat=7')
    resp.encoding = 'utf8'
    page = etree.HTML(resp.text)
    for node in page.xpath("//section[2]//article/header"):
        link = node.xpath('./h3/a/@href')[0]
        # a known url means the rest of the list is old news
        if link in BeCrawledUrlList:
            break
        headline = node.xpath('./h3/a/text()')[0]
        # publish time comes from the <time datetime=...> attribute
        when = parseYunkenArtPubTime(node.xpath('./div/time/@datetime')[0])
        body = parseYunkenArtContent(link)
        record = {
            'tags': ['天然橡胶网', '橡胶'],
            'score': 0,
            'uid': UID(),
            'title': headline.strip(),
            'articleFrom': 'yunken',
            'url': link.strip(),
            'publicTime': when.strip(),
            'content': body,
        }
        # classify by futures product (raw headline, as stored text)
        product = parseContentToName(headline + body)
        if product:
            print(SPIERNAME, ' ', headline, ' ', product)
            record['product_name'] = product
            record['group'] = ProductToGroup[product]
        else:
            print("………………………………未找到品种名称,可能异常")
            record['product_name'] = ''
            record['group'] = ''
        collected.append(record)
    HandleTmpList(collected, articleCol, '天然橡胶网')
def getHexunArticleList(articleCol, BeCrawledUrlList):
    """Fetch futures news via hexun.com's JSONP list API (和讯财经).

    Queries four channel ids (farm & sideline, nonferrous, energy,
    chemicals), strips the JSONP callback wrapper with a regex, then
    processes each entry like the HTML-based spiders and stores the
    combined batch through HandleTmpList.
    """
    channel_ids = (
        '101065616',  # farm & sideline products
        '101065619',  # nonferrous metals
        '130519488',  # energy
        '130518597',  # chemicals
    )
    api_url = 'http://open.tool.hexun.com/MongodbNewsService/newsListPageByJson.jsp'
    headers = {
        'Referer':
        'http://futures.hexun.com/agriculturenews/',
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    }
    collected = []
    for channel in channel_ids:
        params = {
            'id': channel,
            's': '30',
            'cp': '1',
            'priority': '0',
            # millisecond timestamp mimics the site's jsonp callback name
            'callback': 'hx_json1%d' % (int(time.time() * 1000))
        }
        resp = requests.get(api_url, params=params, headers=headers)
        resp.encoding = 'gb2312'
        # response is JSONP: peel the json object out of the wrapper
        entries = json.loads(re.search('{.+}', resp.text).group())['result']
        for entry in entries:
            link = entry['entityurl']
            # a known url means the rest of the channel is old news
            if link in BeCrawledUrlList:
                break
            headline = entry['title']
            when = parseHexunArtPubTime(entry['entitytime'])
            body = parseHexunContent(link)
            record = {
                'tags': ['hexun'],
                'score': 0,
                'uid': UID(),
                'title': headline.strip(),
                'articleFrom': 'hexun',
                'url': link.strip(),
                'publicTime': when.strip(),
                'content': body,
            }
            # classify by futures product (raw headline, as stored text)
            product = parseContentToName(headline + body)
            if product:
                print(SPIERNAME, ' ', headline, ' ', product)
                record['product_name'] = product
                record['group'] = ProductToGroup[product]
            else:
                print("………………………………未找到品种名称,可能异常")
                record['product_name'] = ''
                record['group'] = ''
            collected.append(record)
    HandleTmpList(collected, articleCol, '和讯财经')
def getTouTiaoArticleLs(articleCol, BeCrawledUrlList):
    """Search toutiao.com for each futures product name and collect results.

    For every name in ProductNameTuple, queries the Toutiao search API
    with per-request cookie/referer headers, builds one record per hit,
    and stores the combined batch through HandleTmpList.

    :param articleCol: storage collection handle forwarded to HandleTmpList
    :param BeCrawledUrlList: urls already crawled; such hits are skipped
        individually (search results are not ordered by recency, so no break)
    """
    sess = requests.session()
    searchUrl = 'https://www.toutiao.com/api/search/content/'
    temp_article_ls = []
    for productName in ProductNameTuple:
        param = {
            'aid': '24',
            'app_name': 'web_search',
            'offset': '20',
            'format': 'json',
            'keyword': productName,
            'autoload': 'true',
            'count': '20',
            'en_qc': '1',
            'cur_tab': '1',
            'from': 'search_tab',
            'pd': 'synthesis',
            'timestamp': str(int(time.time() * 1000))
        }
        # fixed session cookie plus a fresh pseudo-session id per request
        sess.headers = {
            'cookie':
            'tt_webid=6767227851205821960; WEATHER_CITY={}; tt_webid=6767227851205821960; csrftoken=a36cb44b8e05ea4ad645dff6911d86cd; s_v_web_id=7e37f07e972f8c6f1b40a031bb6da223; __tasessionId=vyaf0l09q{}'
            .format(parse.quote('武汉'), str(int(time.time() * 1000))),
            'referer':
            'https://www.toutiao.com/search/?keyword={}'.format(
                parse.quote(productName)),
            'user-agent':
            'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36'
        }
        r = sess.get(searchUrl, params=param)
        temp_ls = r.json()['data']
        for item in temp_ls:
            try:
                articleUrl = 'https://www.toutiao.com/a%s/' % item['item_id']
            except KeyError as e:
                # non-article hits (ads, galleries) lack item_id; skip them
                print(e)
                print('取不到字段,可能有错,跳过')
                continue
            if articleUrl in BeCrawledUrlList:
                continue
            temp_dict = {'tags': ['toutiao'], 'score': 0, 'uid': UID()}
            title = item['title']
            # FIX: original computed the title but never stored it
            temp_dict['title'] = title.strip()
            temp_dict['articleFrom'] = 'toutiao'
            temp_dict['url'] = articleUrl.strip()
            publicTime = parseTouTiaoArtPubTime(item['publish_time'])
            temp_dict['publicTime'] = publicTime.strip()
            # fetch the full article body
            content = parseTouTiaoArtContent(articleUrl, sess)
            # FIX: original computed the body but never stored it
            temp_dict['content'] = content
            # map the article to a futures product name and its group
            n = parseContentToName(title + content)
            if n:
                print(SPIDERNAME, ' ', title, " ", n)
                temp_dict['product_name'] = n
                temp_dict['group'] = ProductToGroup[n]
            else:
                print("………………………………未找到品种名称,可能异常")
                temp_dict['product_name'] = ''
                temp_dict['group'] = ''
            temp_article_ls.append(temp_dict)
    HandleTmpList(temp_article_ls, articleCol, '今日头条')