def parse_item(self, response):
    """Turn a 互动易 (cninfo) JSON response into question/answer items.

    The response body is wrapped in ``[`` ``]`` before decoding and the
    records are read from ``data_list[0]["results"]``. One
    ``QuestionsAnswersItem`` is yielded per record; on any failure a single
    ``ErrorItem`` is yielded instead.

    Error codes written to ``ErrorItem['code']``:
        800 -- the body could not be decoded as JSON.
        902 -- the decoded data did not have the expected keys/shape.
        HTTP status -- the response status was not 200.

    The caught exception is stored as ``str(e)`` so the item carries plain
    text rather than a live exception object.
    """

    def error_item(code, source, desc, exception=''):
        # Field order: code, url, source, desc, exception.
        item = ErrorItem()
        item['code'] = code
        # Page on which the error occurred.
        item['url'] = response.url
        # Label of the site the error belongs to.
        item['source'] = source
        # Human-readable description of the error.
        item['desc'] = desc
        # Text of the exception raised by the code ('' when there is none).
        item['exception'] = exception
        return item

    if response.status != 200:
        yield error_item(response.status, "互动易", '响应错误')
        return

    try:
        data_list = json.loads('[' + response.body.decode() + ']')
    except Exception as e:
        print(e)
        yield error_item(800, "互动易", '响应的json数据错误', str(e))
        return

    try:
        for data in data_list[0]["results"]:
            item = QuestionsAnswersItem()
            item['source'] = 'cninfo'
            item['stockcode'] = data['stockCode']
            item['pubDate'] = data['pubDate']
            item['question'] = data['mainContent']
            item['answer'] = data['attachedContent']
            yield item
    except Exception as e:
        # A malformed record aborts the remaining records of this response.
        yield error_item(902, "cninfo", '解析json数据错误', str(e))
def parse_item(self, response):
    """Turn an httpbin JSON response into ``NewsItem`` objects.

    The body is wrapped in ``[`` ``]`` before decoding, so a single JSON
    object becomes a one-element list. Each element yields a ``NewsItem``
    whose ``content`` is the element's ``origin`` value.

    Error codes written to ``ErrorItem['code']``:
        901 -- the body could not be decoded as JSON.
        902 -- an element did not have the expected keys/shape.
        HTTP status -- the response status was not 200.

    Every error is *yielded*: this function is a generator, so a
    ``return item`` would discard the item instead of emitting it.
    """

    def error_item(code, desc, exception=''):
        item = ErrorItem()
        item['code'] = code
        # Page on which the error occurred.
        item['url'] = response.url
        # Label of the site the error belongs to.
        item['site'] = "httpbin"
        # Human-readable description of the error.
        item['desc'] = desc
        # Text of the exception raised by the code ('' when there is none).
        item['exception'] = exception
        return item

    if response.status != 200:
        yield error_item(response.status, '响应错误')
        return

    try:
        records = json.loads('[' + response.body.decode() + ']')
    except Exception as e:
        print(e)
        yield error_item(901, '响应的json数据错误', str(e))
        return

    try:
        for record in records:
            news = NewsItem()
            news['source'] = "httpbin"
            news['pubDate'] = ""
            news['title'] = ""
            news['content'] = record['origin']
            yield news
    except Exception as e:
        # A malformed record aborts the remaining records of this response.
        yield error_item(902, '解析json数据错误', str(e))
def parse_item(self, response):
    """Turn an e公司 JSON response into ``NewsItem`` objects.

    The decoded body is expected to hold its records under the ``data``
    key. Each record yields one ``NewsItem``; missing record fields fall
    back to ``""`` (or ``0`` for ``isRed``).

    Error codes written to ``ErrorItem['code']``:
        901 -- the body could not be decoded as JSON.
        902 -- the decoded data did not have the expected shape.
        HTTP status -- the response status was not 200.

    Every error is *yielded*: this function is a generator, so a
    ``return item`` would discard the item instead of emitting it.
    """

    def error_item(code, desc, exception=''):
        item = ErrorItem()
        item['code'] = code
        # Page on which the error occurred.
        item['url'] = response.url
        # Label of the site the error belongs to.
        item['site'] = "e公司"
        # Human-readable description of the error.
        item['desc'] = desc
        # Text of the exception raised by the code ('' when there is none).
        item['exception'] = exception
        return item

    if response.status != 200:
        yield error_item(response.status, '响应错误')
        return

    try:
        data = json.loads(response.body.decode())
    except Exception as e:
        print(e)
        yield error_item(901, '响应的json数据错误', str(e))
        return

    try:
        for record in data['data']:
            news = NewsItem()
            news['source'] = "egs"
            news['pubDate'] = record.get('pageTime', "")
            news['title'] = record.get('title', "")
            news['content'] = record.get('content', "")
            news['isRed'] = record.get('isRed', 0)
            yield news
    except Exception as e:
        # A malformed record aborts the remaining records of this response.
        yield error_item(902, '解析json数据错误', str(e))
def parse_page(self, response):
    """Parse a 中证网 list page and schedule one request per entry.

    For every ``<li>`` under ``/html/body/div/div/ul`` a partially filled
    ``NewsItem`` (source, empty pubDate, title) is attached to a
    ``scrapy.Request`` for the article URL; ``self.parse_item`` receives it
    through ``response.meta['item']``.

    Error codes written to ``ErrorItem['code']``:
        801 -- no list entries were found on the page.
        802 -- an entry could not be parsed (e.g. it has no ``href``).
        HTTP status -- the response status was not 200.

    Every error is *yielded*: this function is a generator, so a
    ``return item`` would discard the item instead of emitting it.
    """
    if response.status != 200:
        item = ErrorItem()
        item['code'] = response.status
        # Page on which the error occurred.
        item['url'] = response.url
        # Label of the site the error belongs to.
        item['site'] = "中证网"
        # Human-readable description of the error.
        item['desc'] = '响应错误'
        item['exception'] = ''
        yield item
        return

    lis = response.xpath('/html/body/div/div/ul/li')
    if not lis:
        item = ErrorItem()
        item['code'] = 801
        item['url'] = response.url
        # NOTE(review): this is the only error item that sets 'date';
        # confirm that ErrorItem declares this field.
        item['date'] = time.time()
        item['site'] = "中证网"
        item['desc'] = '未找到html元素'
        item['exception'] = ''
        yield item
        return

    try:
        for li in lis:
            item = NewsItem()
            item['source'] = "cs"
            # Publication time is not parsed from the list page; left empty.
            item['pubDate'] = ''
            item['title'] = li.xpath('./a/text()').get()
            # A missing href makes .get() return None; the resulting
            # TypeError is reported through the 802 error below.
            url = r'http://www.cs.com.cn/sylm/jsbd/' + li.xpath('./a/@href').get()
            yield scrapy.Request(url=url,
                                 meta={'item': item},
                                 callback=self.parse_item,
                                 dont_filter=True)
    except Exception as e:
        # A malformed entry aborts the remaining entries of this page.
        item = ErrorItem()
        item['code'] = 802
        item['url'] = response.url
        item['site'] = "中证网"
        item['desc'] = '解析html元素错误'
        item['exception'] = str(e)
        yield item
def parse_item(self, response):
    """Complete the 中证网 item carried in ``response.meta['item']``.

    On a 200 response the item's ``content`` is set to the first text node
    matched by ``//div[@class="article-t hidden"]/p/text()`` (``None`` when
    nothing matches), ``isRed`` is set to 0, and the item is yielded.

    Yields an ``ErrorItem`` instead when:
        * the status is not 200 (``code`` is the HTTP status), or
        * filling in the item raises (``code`` is 802).
    """

    def build_error(code, desc, exception):
        err = ErrorItem()
        err['code'] = code
        # Page on which the error occurred.
        err['url'] = response.url
        # Label of the site the error belongs to.
        err['site'] = "中证网"
        # Human-readable description of the error.
        err['desc'] = desc
        # Text of the exception raised by the code ('' when there is none).
        err['exception'] = exception
        return err

    if response.status != 200:
        yield build_error(response.status, '响应错误', '')
        return

    try:
        news = response.meta['item']
        news['content'] = response.xpath(
            '//div[@class="article-t hidden"]/p/text()').get()
        # TODO
        news['isRed'] = 0
        yield news
    except Exception as exc:
        yield build_error(802, '解析html元素错误', str(exc))
def parse_item(self, response):
    """Parse a 上证快讯 (cnstock) news-flash page into ``NewsItem`` objects.

    Each ``<li>`` under ``//ul[@class="nf-list"]`` carries one text of the
    form ``【title】content``. The title is the part inside the brackets,
    the content is what follows the closing bracket on the same line.

    Error codes written to ``ErrorItem['code']``:
        801 -- no list entries were found on the page.
        802 -- an entry could not be parsed (e.g. no ``【…】`` in its text).
        HTTP status -- the response status was not 200.

    Every error is *yielded*: this function is a generator, so a
    ``return item`` would discard the item instead of emitting it.
    """

    def error_item(code, desc, exception=''):
        item = ErrorItem()
        item['code'] = code
        # Page on which the error occurred.
        item['url'] = response.url
        # Label of the site the error belongs to.
        item['site'] = "上证快讯"
        # Human-readable description of the error.
        item['desc'] = desc
        # Text of the exception raised by the code ('' when there is none).
        item['exception'] = exception
        return item

    if response.status != 200:
        yield error_item(response.status, '响应错误')
        return

    lis = response.xpath('//ul[@class="nf-list"]/li')
    if not lis:
        yield error_item(801, '未找到html元素')
        return

    try:
        for li in lis:
            item = NewsItem()
            item['source'] = "cnstock"
            # Publication time is not parsed from the page; left empty.
            item['pubDate'] = ''
            # Example text:
            #   【压垮乐视网的最后一根稻草竟然是它!】15日,进入暂停上市状态
            #   第三天的乐视网披露,…
            title_content = li.xpath('./p[2]/a/text()').get()
            # Both patterns are greedy and '.' does not match newlines;
            # a text without brackets raises IndexError (reported as 802).
            item['title'] = (re.findall('【.*】', title_content)[0]
                             .replace('【', '').replace('】', ''))
            item['content'] = re.findall('】.*', title_content)[0].replace('】', '')
            item['isRed'] = 0
            yield item
    except Exception as e:
        # A malformed entry aborts the remaining entries of this page.
        yield error_item(802, '解析html标签错误', str(e))
def parse_item(self, response):
    """Turn a 选股宝 (xuangubao) JSON response into ``NewsItem`` objects.

    The body is wrapped in ``[`` ``]`` before decoding (the original author
    notes it is not well-formed JSON otherwise); messages are read from
    ``payload[0]["NewMsgs"]``.

    Yields an ``ErrorItem`` instead when:
        * the status is not 200 (``code`` is the HTTP status),
        * the body cannot be decoded (``code`` 901), or
        * a message lacks an expected key (``code`` 902).
    """

    def make_error(code, desc, exception):
        err = ErrorItem()
        err['code'] = code
        # Page on which the error occurred.
        err['url'] = response.url
        # Label of the site the error belongs to.
        err['site'] = "选股宝"
        # Human-readable description of the error.
        err['desc'] = desc
        # Text of the exception raised by the code ('' when there is none).
        err['exception'] = exception
        return err

    if response.status != 200:
        yield make_error(response.status, '响应错误', '')
        return

    try:
        payload = json.loads('[' + response.body.decode() + ']')
    except Exception as exc:
        yield make_error(901, '响应的json数据错误', str(exc))
        return

    try:
        for msg in payload[0]["NewMsgs"]:
            news = NewsItem()
            news['source'] = 'xuangubao'
            news['pubDate'] = msg['UpdatedAtInSec']
            news['title'] = msg['Title']
            news['content'] = msg['Summary']
            # TODO
            news['isRed'] = msg['Impact']
            yield news
    except Exception as exc:
        yield make_error(902, '解析json数据错误', str(exc))
def parse_item(self, response):
    """Parse a 每经网 (nbd) live page into ``NewsItem`` objects.

    The page date is the first ``//p[@class="live"]/span`` text containing
    "年" (e.g. ``2019年05月20日``). Each ``<li>`` under
    ``//ul[@class="live-list"]`` supplies a clock time (e.g. ``17:44:42``)
    and the news text. Date and time are joined and converted with
    ``time.mktime``; if that fails for an entry its ``pubDate`` is ``""``
    and the entry is still yielded.

    Error codes written to ``ErrorItem['code']``:
        801 -- no list entries were found on the page.
        802 -- parsing the page raised outside the per-entry time handling.
        HTTP status -- the response status was not 200.

    Every error is *yielded*: this function is a generator, so a
    ``return item`` would discard the item instead of emitting it.
    """

    def error_item(code, desc, exception=''):
        item = ErrorItem()
        item['code'] = code
        # Page on which the error occurred.
        item['url'] = response.url
        # Label of the site the error belongs to.
        item['site'] = "每经网"
        # Human-readable description of the error.
        item['desc'] = desc
        # Text of the exception raised by the code ('' when there is none).
        item['exception'] = exception
        return item

    def clean(text):
        # Drop every line-break character, then surrounding whitespace.
        return text.replace("\n", "").replace("\r", "").strip()

    if response.status != 200:
        yield error_item(response.status, '响应错误')
        return

    lis = response.xpath('//ul[@class="live-list"]/li')
    if not lis:
        yield error_item(801, '未找到html元素')
        return

    try:
        spans = response.xpath('//p[@class="live"]/span/text()').getall()
        # First span that looks like a date; '' when none does.
        date = next((clean(text) for text in spans if "年" in text), '')

        for li in lis:
            news = NewsItem()
            news['source'] = "nbd"
            try:
                clock = clean(
                    li.xpath('./div[@class="li-title"]/p/span/text()').get())
                parsed = datetime.datetime.strptime(
                    date + clock, "%Y年%m月%d日%H:%M:%S")
                time_stamp = time.mktime(parsed.timetuple())
            except Exception as e:
                # Missing or unparsable time: keep the entry, blank the date.
                print(e)
                news['pubDate'] = ""
            else:
                news['pubDate'] = time_stamp
            news['title'] = ""
            news['content'] = li.xpath(
                './div[@class="li-text"]/a/text()').get()
            # TODO
            news['isRed'] = 0
            yield news
    except Exception as e:
        yield error_item(802, '解析html标签错误', str(e))
def parse_item(self, response):
    """Turn a 第一财经 (yicai) JSON response into ``NewsItem`` objects.

    The decoded body is iterated directly. For each entry, ``datekey`` and
    ``hm`` (e.g. ``2019.05.16`` and ``20:43``) are combined into an integer
    ``time.mktime`` timestamp, and ``newcontent`` (``【title】content``) is
    split into title and content.

    Yields an ``ErrorItem`` instead when:
        * the status is not 200 (``code`` is the HTTP status),
        * the body cannot be decoded (``code`` 800, with a ``timestamp``), or
        * an entry cannot be parsed (``code`` 902).
    """

    def make_error(code, desc, exception, stamp=False):
        err = ErrorItem()
        err['code'] = code
        # Page on which the error occurred.
        err['url'] = response.url
        if stamp:
            # Time at which the error occurred.
            err['timestamp'] = time.time()
        # Label of the site the error belongs to.
        err['site'] = "第一财经"
        # Human-readable description of the error.
        err['desc'] = desc
        # Text of the exception raised by the code ('' when there is none).
        err['exception'] = exception
        return err

    if response.status != 200:
        yield make_error(response.status, '响应错误', '')
        return

    try:
        entries = json.loads(response.body.decode())
    except Exception as exc:
        print(exc)
        yield make_error(800, '响应的json数据错误', str(exc), stamp=True)
        return

    try:
        for entry in entries:
            news = NewsItem()
            news['source'] = 'yicai'
            # e.g. "2019.05.16 20:43"
            when = entry['datekey'] + " " + entry['hm']
            parsed = datetime.datetime.strptime(when, "%Y.%m.%d %H:%M")
            news['pubDate'] = int(time.mktime(parsed.timetuple()))
            # e.g. "【传化智联:非公开发行股票方案到期失效】 传化智联5月16日晚间公告,…"
            text = entry['newcontent']
            bracketed = re.findall('【.*】', text)[0]
            news['title'] = bracketed.replace('【', '').replace('】', '')
            tail = re.findall('】.*', text)[0]
            news['content'] = tail.replace('】', '')
            # TODO
            news['isRed'] = 0
            yield news
    except Exception as exc:
        yield make_error(902, '解析json数据错误', str(exc))
def parse_item(self, response):
    """Parse the 财联社 (cls) telegraph page into ``NewsItem`` objects.

    The page embeds its state as ``__NEXT_DATA__ = {...}``. The JSON object
    that follows the marker is decoded in place with
    ``JSONDecoder.raw_decode``, which stops at the end of the object. This
    avoids cutting the text at a later "module" token and deleting every
    "module" substring from the payload before decoding.

    News entries are read from
    ``props.initialState.telegraph.dataList``. When an entry's content has
    a ``【…】`` prefix, only the text after the closing bracket is kept.

    Error codes written to ``ErrorItem['code']``:
        800 -- the embedded JSON could not be located or decoded.
        902 -- the decoded data did not have the expected keys/shape.
        HTTP status -- the response status was not 200.
    """

    def error_item(code, desc, exception=''):
        item = ErrorItem()
        item['code'] = code
        # Page on which the error occurred.
        item['url'] = response.url
        # Label of the site the error belongs to.
        item['site'] = "财联社"
        # Human-readable description of the error.
        item['desc'] = desc
        # Text of the exception raised by the code ('' when there is none).
        item['exception'] = exception
        return item

    if response.status != 200:
        yield error_item(response.status, '响应错误')
        return

    try:
        page = response.body.decode()
        # Consume the whitespace after '=' as well: raw_decode does not
        # skip leading whitespace at the start index.
        marker = re.search(r'__NEXT_DATA__\s*=\s*', page)
        if marker is None:
            raise ValueError('__NEXT_DATA__ assignment not found in page')
        state, _ = json.JSONDecoder().raw_decode(page, marker.end())
    except Exception as e:
        print(e)
        yield error_item(800, '响应的json数据错误', str(e))
        return

    try:
        for data in state["props"]['initialState']['telegraph']['dataList']:
            item = NewsItem()
            item['source'] = 'cls'
            item['pubDate'] = data['modified_time']
            item['title'] = data['title']
            content = data['content']
            if '【' in content and '】' in content:
                # Keep what follows the first '】' on that line.
                item['content'] = re.findall('】.*', content)[0].replace("】", '')
            else:
                item['content'] = content
            # TODO
            item['isRed'] = 0
            yield item
    except Exception as e:
        # A malformed entry aborts the remaining entries of this page.
        yield error_item(902, '解析json数据错误', str(e))
def parse_item(self, response):
    """Parse a 上证e互动 (sseinfo) feed page into question/answer items.

    Each ``//div[@class="m_feed_item"]`` block yields one
    ``QuestionsAnswersItem``. The stock code is the first six-digit run in
    the link text (e.g. ``中国化学(601117)``). ``pubDate`` is left empty.

    Error codes written to ``ErrorItem['code']``:
        801 -- no feed blocks were found on the page.
        802 -- a feed block could not be parsed.
        HTTP status -- the response status was not 200.

    Every error is *yielded*: this function is a generator, so a
    ``return item`` would discard the item instead of emitting it.
    """

    def error_item(code, desc, exception=''):
        item = ErrorItem()
        item['code'] = code
        # Page on which the error occurred.
        item['url'] = response.url
        # Label of the site the error belongs to.
        item['site'] = "上证e互动"
        # Human-readable description of the error.
        item['desc'] = desc
        # Text of the exception raised by the code ('' when there is none).
        item['exception'] = exception
        return item

    if response.status != 200:
        yield error_item(response.status, '响应错误')
        return

    divs = response.xpath('//div[@class="m_feed_item"]')
    if not divs:
        yield error_item(801, '未找到html元素')
        return

    try:
        for div in divs:
            # Per the original author's note, this list holds the question,
            # the answer and one '' entry. Indices 1 and 2 are used, so the
            # blank entry presumably comes first -- TODO confirm on the page.
            q_a = div.xpath('.//div[@class="m_feed_txt"]/text()').getall()
            item = QuestionsAnswersItem()
            item['source'] = 'sseinfo'
            # e.g. ":中国化学(601117)"
            label = div.xpath('.//div[@class="m_feed_txt"]/a/text()').get()
            item['stockcode'] = re.findall(r'\d{6}', label)[0]
            # The page shows relative times ("16分钟前", "1小时前"); they are
            # not converted, so the publication time is left empty.
            item['pubDate'] = ''
            item['question'] = q_a[1]
            item['answer'] = q_a[2]
            yield item
    except Exception as e:
        # A malformed block aborts the remaining blocks of this page.
        yield error_item(802, '解析html标签错误', str(e))