Ejemplo n.º 1
0
    def parse_item(self, response):
        """Turn a JSON question/answer response into items.

        Yields one ``QuestionsAnswersItem`` per entry under ``results`` in the
        decoded payload. A single ``ErrorItem`` is yielded instead when the
        status is not 200 (code = HTTP status), when the body cannot be
        decoded as JSON (code 800), or when reading the entries fails
        (code 902).

        Args:
            response: Response passed to this callback; its ``status``,
                ``body`` and ``url`` attributes are read.
        """

        def make_error(code, source, desc, exception):
            # Build an ErrorItem describing a failure on the current page.
            failure = ErrorItem()
            failure['code'] = code
            # Page on which the error occurred.
            failure['url'] = response.url
            # NOTE(review): the other ErrorItem producers visible in this file
            # use the key 'site' here; confirm ErrorItem declares 'source'.
            failure['source'] = source
            # Human-readable description of the error.
            failure['desc'] = desc
            # Text of the exception raised by the code ('' when there is none).
            failure['exception'] = exception
            return failure

        if response.status != 200:
            yield make_error(response.status, "互动易", '响应错误', '')
            return

        try:
            # The body is wrapped in brackets before decoding, so the payload
            # becomes the first element of the resulting list.
            data_list = json.loads('[' + response.body.decode() + ']')
        except Exception as e:
            print(e)
            # Store the message, not the exception object, so the item holds
            # plain data.
            yield make_error(800, "互动易", '响应的json数据错误', str(e))
            return

        try:
            for data in data_list[0]["results"]:
                item = QuestionsAnswersItem()
                item['source'] = 'cninfo'
                item['stockcode'] = data['stockCode']
                item['pubDate'] = data['pubDate']
                item['question'] = data['mainContent']
                item['answer'] = data['attachedContent']

                yield item
        except Exception as e:
            yield make_error(902, "cninfo", '解析json数据错误', str(e))
Ejemplo n.º 2
0
    def parse_item(self, response):
        """Turn a JSON response into ``NewsItem`` objects.

        Yields one ``NewsItem`` per element of the decoded payload, with the
        element's ``origin`` value as content. A single ``ErrorItem`` is
        yielded instead when the status is not 200 (code = HTTP status), when
        the body cannot be decoded as JSON (code 901), or when reading the
        elements fails (code 902).

        Args:
            response: Response passed to this callback; its ``status``,
                ``body`` and ``url`` attributes are read.
        """

        def make_error(code, desc, exception):
            # Build an ErrorItem describing a failure on the current page.
            failure = ErrorItem()
            failure['code'] = code
            # Page on which the error occurred.
            failure['url'] = response.url
            # Site the error belongs to.
            failure['site'] = "httpbin"
            # Human-readable description of the error.
            failure['desc'] = desc
            # Text of the exception raised by the code ('' when there is none).
            failure['exception'] = exception
            return failure

        if response.status != 200:
            yield make_error(response.status, '响应错误', '')
            return

        try:
            # The body is wrapped in brackets so the decoded value is a list.
            data = json.loads('[' + response.body.decode() + ']')
            print(data)
        except Exception as e:
            print(e)
            # This function is a generator: the error item has to be yielded,
            # because a value passed to ``return`` is never seen by a caller
            # that iterates the result.
            yield make_error(901, '响应的json数据错误', str(e))
            return

        try:
            for entry in data:
                news = NewsItem()
                news['source'] = "httpbin"
                news['pubDate'] = ""
                news['title'] = ""
                news['content'] = entry['origin']
                yield news
        except Exception as e:
            yield make_error(902, '解析json数据错误', str(e))
Ejemplo n.º 3
0
 def parse_item(self, response):
     """Turn a JSON response into ``NewsItem`` objects.

     Yields one ``NewsItem`` per element of the payload's ``data`` list. A
     single ``ErrorItem`` is yielded instead when the status is not 200
     (code = HTTP status), when the body cannot be decoded as JSON
     (code 901), or when reading the elements fails (code 902).

     Args:
         response: Response passed to this callback; its ``status``,
             ``body`` and ``url`` attributes are read.
     """

     def make_error(code, desc, exception):
         # Build an ErrorItem describing a failure on the current page.
         failure = ErrorItem()
         failure['code'] = code
         # Page on which the error occurred.
         failure['url'] = response.url
         # Site the error belongs to.
         failure['site'] = "e公司"
         # Human-readable description of the error.
         failure['desc'] = desc
         # Text of the exception raised by the code ('' when there is none).
         failure['exception'] = exception
         return failure

     if response.status != 200:
         yield make_error(response.status, '响应错误', '')
         return

     try:
         data = json.loads(response.body.decode())
     except Exception as e:
         print(e)
         # This function is a generator: the error item has to be yielded,
         # because a value passed to ``return`` is never seen by a caller
         # that iterates the result.
         yield make_error(901, '响应的json数据错误', str(e))
         return

     try:
         for entry in data['data']:
             news = NewsItem()
             news['source'] = "egs"
             # Missing keys fall back to an empty string (0 for isRed).
             news['pubDate'] = entry.get('pageTime', "")
             news['title'] = entry.get('title', "")
             news['content'] = entry.get('content', "")
             news['isRed'] = entry.get('isRed', 0)
             yield news
     except Exception as e:
         yield make_error(902, '解析json数据错误', str(e))
Ejemplo n.º 4
0
    def parse_page(self, response):
        """Parse a news list page and request each linked article.

        For every ``li`` matched on the page, a ``NewsItem`` is pre-filled
        with source and title and attached (via ``meta``) to a
        ``scrapy.Request`` whose callback is ``self.parse_item``. An
        ``ErrorItem`` is yielded when no list elements are found (code 801)
        or when reading them fails (code 802). Responses whose status is not
        200 produce nothing.

        Args:
            response: Response passed to this callback; its ``status`` and
                ``url`` attributes and ``xpath`` method are used.
        """

        def make_error(code, desc, exception, with_date=False):
            # Build an ErrorItem describing a failure on the current page.
            failure = ErrorItem()
            failure['code'] = code
            # Page on which the error occurred.
            failure['url'] = response.url
            if with_date:
                # Time at which the error was recorded.
                failure['date'] = time.time()
            # Site the error belongs to.
            failure['site'] = "中证网"
            # Human-readable description of the error.
            failure['desc'] = desc
            # Text of the exception raised by the code ('' when there is none).
            failure['exception'] = exception
            return failure

        if response.status != 200:
            return

        lis = response.xpath('/html/body/div/div/ul/li')
        if not lis:
            # This function is a generator: the error item has to be yielded,
            # because a value passed to ``return`` is never seen by a caller
            # that iterates the result.
            yield make_error(801, '未找到html元素', '', with_date=True)
            return

        try:
            for li in lis:
                item = NewsItem()
                item['source'] = "cs"
                # Publication-date parsing is currently disabled; the field is
                # left empty.
                item['pubDate'] = ''
                item['title'] = li.xpath('./a/text()').get()
                url = r'http://www.cs.com.cn/sylm/jsbd/' + li.xpath('./a/@href').get()

                yield scrapy.Request(url=url, meta={'item': item}, callback=self.parse_item, dont_filter=True)
        except Exception as e:
            yield make_error(802, '解析html元素错误', str(e))
Ejemplo n.º 5
0
    def parse_item(self, response):
        """Complete the item carried in ``response.meta`` with article text.

        On a 200 response the item stored under ``meta['item']`` receives its
        ``content`` (first text node matched by the article XPath) and
        ``isRed`` set to 0, and is yielded. If that fails, an ``ErrorItem``
        with code 802 is yielded; on any other status an ``ErrorItem``
        carrying the HTTP status is yielded.

        Args:
            response: Response passed to this callback; its ``status``,
                ``meta`` and ``url`` attributes and ``xpath`` method are used.
        """

        def make_error(code, desc, exception):
            # Build an ErrorItem describing a failure on the current page.
            failure = ErrorItem()
            failure['code'] = code
            # Page on which the error occurred.
            failure['url'] = response.url
            # Site the error belongs to.
            failure['site'] = "中证网"
            # Human-readable description of the error.
            failure['desc'] = desc
            # Text of the exception raised by the code ('' when there is none).
            failure['exception'] = exception
            return failure

        if response.status != 200:
            yield make_error(response.status, '响应错误', '')
            return

        try:
            news = response.meta['item']
            body_text = response.xpath('//div[@class="article-t hidden"]/p/text()').get()
            news['content'] = body_text
            # TODO
            news['isRed'] = 0
            yield news
        except Exception as err:
            yield make_error(802, '解析html元素错误', str(err))
Ejemplo n.º 6
0
    def parse_item(self, response):
        """Parse a news-flash list page into ``NewsItem`` objects.

        Yields one ``NewsItem`` per ``li`` of the ``nf-list`` list, splitting
        the link text into a title (the part inside 【】) and content (the
        part after 】). An ``ErrorItem`` is yielded when the status is not 200
        (code = HTTP status), when no list elements are found (code 801), or
        when reading them fails (code 802).

        Args:
            response: Response passed to this callback; its ``status`` and
                ``url`` attributes and ``xpath`` method are used.
        """

        def make_error(code, desc, exception):
            # Build an ErrorItem describing a failure on the current page.
            failure = ErrorItem()
            failure['code'] = code
            # Page on which the error occurred.
            failure['url'] = response.url
            # Site the error belongs to.
            failure['site'] = "上证快讯"
            # Human-readable description of the error.
            failure['desc'] = desc
            # Text of the exception raised by the code ('' when there is none).
            failure['exception'] = exception
            return failure

        if response.status != 200:
            yield make_error(response.status, '响应错误', '')
            return

        lis = response.xpath('//ul[@class="nf-list"]/li')
        if not lis:
            # This function is a generator: the error item has to be yielded,
            # because a value passed to ``return`` is never seen by a caller
            # that iterates the result.
            yield make_error(801, '未找到html元素', '')
            return

        try:
            for li in lis:
                item = NewsItem()
                item['source'] = "cnstock"
                # Publication-date parsing is currently disabled; the field is
                # left empty.
                item['pubDate'] = ''
                title_content = li.xpath('./p[2]/a/text()').get()

                # Expected text shape: 【headline】body text.
                item['title'] = (re.findall('【.*】', title_content)[0]).replace('【', '').replace('】', '')
                item['content'] = re.findall('】.*', title_content)[0].replace('】', '')

                item['isRed'] = 0
                yield item
        except Exception as e:
            yield make_error(802, '解析html标签错误', str(e))
Ejemplo n.º 7
0
    def parse_item(self, response):
        """Turn a JSON message feed response into ``NewsItem`` objects.

        Yields one ``NewsItem`` per entry under ``NewMsgs`` in the decoded
        payload. A single ``ErrorItem`` is yielded instead when the status is
        not 200 (code = HTTP status), when the body cannot be decoded
        (code 901), or when reading the entries fails (code 902).

        Args:
            response: Response passed to this callback; its ``status``,
                ``body`` and ``url`` attributes are read.
        """

        def make_error(code, desc, exception):
            # Build an ErrorItem describing a failure on the current page.
            failure = ErrorItem()
            failure['code'] = code
            # Page on which the error occurred.
            failure['url'] = response.url
            # Site the error belongs to.
            failure['site'] = "选股宝"
            # Human-readable description of the error.
            failure['desc'] = desc
            # Text of the exception raised by the code ('' when there is none).
            failure['exception'] = exception
            return failure

        if response.status != 200:
            yield make_error(response.status, '响应错误', '')
            return

        try:
            # Per the original author's note the body is not well-formed JSON
            # by itself, so it is wrapped in '[' and ']' before decoding.
            wrapped = '[' + response.body.decode() + ']'
            payload = json.loads(wrapped)
        except Exception as err:
            yield make_error(901, '响应的json数据错误', str(err))
            return

        try:
            for entry in payload[0]["NewMsgs"]:
                news = NewsItem()
                news['source'] = 'xuangubao'
                news['pubDate'] = entry['UpdatedAtInSec']
                news['title'] = entry['Title']
                news['content'] = entry['Summary']
                # TODO
                news['isRed'] = entry['Impact']
                yield news
        except Exception as err:
            yield make_error(902, '解析json数据错误', str(err))
Ejemplo n.º 8
0
    def parse_item(self, response):
        """Parse a live-news list page into ``NewsItem`` objects.

        Yields one ``NewsItem`` per ``li`` of the ``live-list`` list. The
        publication time is built from the page's date text plus the entry's
        time text; when that cannot be parsed, ``pubDate`` is left as an
        empty string. An ``ErrorItem`` is yielded when the status is not 200
        (code = HTTP status), when no list elements are found (code 801), or
        when reading them fails (code 802).

        Args:
            response: Response passed to this callback; its ``status`` and
                ``url`` attributes and ``xpath`` method are used.
        """

        def make_error(code, desc, exception):
            # Build an ErrorItem describing a failure on the current page.
            failure = ErrorItem()
            failure['code'] = code
            # Page on which the error occurred.
            failure['url'] = response.url
            # Site the error belongs to.
            failure['site'] = "每经网"
            # Human-readable description of the error.
            failure['desc'] = desc
            # Text of the exception raised by the code ('' when there is none).
            failure['exception'] = exception
            return failure

        if response.status != 200:
            yield make_error(response.status, '响应错误', '')
            return

        lis = response.xpath('//ul[@class="live-list"]/li')
        if not lis:
            # This function is a generator: the error item has to be yielded,
            # because a value passed to ``return`` is never seen by a caller
            # that iterates the result.
            yield make_error(801, '未找到html元素', '')
            return

        try:
            # Date text nodes, e.g. 2019年05月20日; the first one containing
            # the year marker is used.
            riqi = response.xpath('//p[@class="live"]/span/text()').getall()
            date = ''
            for temp in riqi:
                if "年" in temp:
                    # Drop every line-break character, then trim.
                    date = temp.replace("\n", "").replace("\r", "").strip()
                    break

            for li in lis:
                i = NewsItem()
                i['source'] = "nbd"

                try:
                    # Time of day, e.g. 17:44:42.
                    temp = li.xpath(
                        './div[@class="li-title"]/p/span/text()').get()
                    temp = temp.replace("\n", "").replace("\r", "").strip()

                    # Date and time are concatenated without a separator.
                    d = datetime.datetime.strptime(date + temp,
                                                   "%Y年%m月%d日%H:%M:%S")
                    timeStamp = time.mktime(d.timetuple())
                except Exception as e:
                    # A bad timestamp does not discard the entry.
                    print(e)
                    i['pubDate'] = ""
                else:
                    i['pubDate'] = timeStamp

                i['title'] = ""
                i['content'] = li.xpath(
                    './div[@class="li-text"]/a/text()').get()
                # TODO
                i['isRed'] = 0
                yield i
        except Exception as e:
            yield make_error(802, '解析html标签错误', str(e))
Ejemplo n.º 9
0
    def parse_item(self, response):
        """Turn a JSON news-list response into ``NewsItem`` objects.

        Yields one ``NewsItem`` per element of the decoded list, with the
        publication time computed from ``datekey`` and ``hm`` and the text in
        ``newcontent`` split into a title (inside 【】) and content (after 】).
        A single ``ErrorItem`` is yielded instead when the status is not 200
        (code = HTTP status), when the body cannot be decoded (code 800), or
        when reading the elements fails (code 902).

        Args:
            response: Response passed to this callback; its ``status``,
                ``body`` and ``url`` attributes are read.
        """

        def make_error(code, desc, exception, with_timestamp=False):
            # Build an ErrorItem describing a failure on the current page.
            failure = ErrorItem()
            failure['code'] = code
            # Page on which the error occurred.
            failure['url'] = response.url
            if with_timestamp:
                # Time at which the error was recorded.
                failure['timestamp'] = time.time()
            # Site the error belongs to.
            failure['site'] = "第一财经"
            # Human-readable description of the error.
            failure['desc'] = desc
            # Text of the exception raised by the code ('' when there is none).
            failure['exception'] = exception
            return failure

        if response.status != 200:
            yield make_error(response.status, '响应错误', '')
            return

        try:
            entries = json.loads(response.body.decode())
        except Exception as err:
            print(err)
            yield make_error(800, '响应的json数据错误', str(err),
                             with_timestamp=True)
            return

        try:
            for entry in entries:
                news = NewsItem()
                news['source'] = 'yicai'

                # e.g. 2019.05.16 20:43
                stamp_text = entry['datekey'] + " " + entry['hm']
                parsed = datetime.datetime.strptime(stamp_text,
                                                    "%Y.%m.%d %H:%M")
                news['pubDate'] = int(time.mktime(parsed.timetuple()))

                # Expected text shape: 【headline】 body text.
                text = entry['newcontent']
                bracketed = re.findall('【.*】', text)[0]
                news['title'] = bracketed.replace('【', '').replace('】', '')
                tail = re.findall('】.*', text)[0]
                news['content'] = tail.replace('】', '')

                # TODO
                news['isRed'] = 0
                yield news
        except Exception as err:
            yield make_error(902, '解析json数据错误', str(err))
Ejemplo n.º 10
0
    def parse_item(self, response):
        """Extract the embedded ``__NEXT_DATA__`` JSON and emit ``NewsItem``s.

        The text between ``__NEXT_DATA__`` and ``module`` is cut out of the
        page body, stripped of those markers and decoded as JSON. One
        ``NewsItem`` is yielded per entry of
        ``props.initialState.telegraph.dataList``. A single ``ErrorItem`` is
        yielded instead when the status is not 200 (code = HTTP status), when
        extraction or decoding fails (code 800), or when reading the entries
        fails (code 902).

        Args:
            response: Response passed to this callback; its ``status``,
                ``body`` and ``url`` attributes are read.
        """

        def make_error(code, desc, exception):
            # Build an ErrorItem describing a failure on the current page.
            failure = ErrorItem()
            failure['code'] = code
            # Page on which the error occurred.
            failure['url'] = response.url
            # Site the error belongs to.
            failure['site'] = "财联社"
            # Human-readable description of the error.
            failure['desc'] = desc
            # Text of the exception raised by the code ('' when there is none).
            failure['exception'] = exception
            return failure

        if response.status != 200:
            yield make_error(response.status, '响应错误', '')
            return

        try:
            page_text = response.body.decode()
            # re.S lets '.' span line breaks; the match is greedy.
            raw = re.findall('__NEXT_DATA__.*module', page_text, re.S)[0]
            # Remove the markers, leaving only the JSON text.
            raw = raw.replace('__NEXT_DATA__ =', '')
            raw = raw.replace('__NEXT_DATA__', '')
            raw = raw.replace('module', '')
            payload = json.loads('[' + raw.strip() + ']')
        except Exception as err:
            print(err)
            yield make_error(800, '响应的json数据错误', str(err))
            return

        try:
            entries = payload[0]["props"]['initialState']['telegraph'][
                'dataList']
            for entry in entries:
                news = NewsItem()
                news['source'] = 'cls'

                news['pubDate'] = entry['modified_time']
                news['title'] = entry['title']

                text = entry['content']
                if '【' in text and '】' in text:
                    # Keep only what follows the first 】.
                    news['content'] = re.findall('】.*', text)[0].replace(
                        "】", '')
                else:
                    news['content'] = text

                # TODO
                news['isRed'] = 0
                yield news
        except Exception as err:
            yield make_error(902, '解析json数据错误', str(err))
Ejemplo n.º 11
0
    def parse_item(self, response):
        """Parse a question/answer feed page into ``QuestionsAnswersItem``s.

        Yields one ``QuestionsAnswersItem`` per ``m_feed_item`` div, with the
        six-digit stock code taken from the link text and the question and
        answer taken from the div's text nodes. An ``ErrorItem`` is yielded
        when the status is not 200 (code = HTTP status), when no feed divs
        are found (code 801), or when reading them fails (code 802).

        Args:
            response: Response passed to this callback; its ``status`` and
                ``url`` attributes and ``xpath`` method are used.
        """

        def make_error(code, desc, exception):
            # Build an ErrorItem describing a failure on the current page.
            failure = ErrorItem()
            failure['code'] = code
            # Page on which the error occurred.
            failure['url'] = response.url
            # Site the error belongs to.
            failure['site'] = "上证e互动"
            # Human-readable description of the error.
            failure['desc'] = desc
            # Text of the exception raised by the code ('' when there is none).
            failure['exception'] = exception
            return failure

        if response.status != 200:
            yield make_error(response.status, '响应错误', '')
            return

        divs = response.xpath('//div[@class="m_feed_item"]')
        if not divs:
            # This function is a generator: the error item has to be yielded,
            # because a value passed to ``return`` is never seen by a caller
            # that iterates the result.
            yield make_error(801, '未找到html元素', '')
            return

        try:
            for div in divs:
                # Per the original author's note, these text nodes hold the
                # question and the answer plus an empty string; indices 1 and
                # 2 are read below.
                q_a = div.xpath(
                    './/div[@class="m_feed_txt"]/text()').getall()

                item = QuestionsAnswersItem()
                item['source'] = 'sseinfo'

                # Link text, e.g. :中国化学(601117)
                temp = div.xpath('.//div[@class="m_feed_txt"]/a/text()'
                                 ).get()
                # Raw string: '\d' in a plain literal is an invalid escape.
                item['stockcode'] = re.findall(r'\d{6}', temp)[0]

                # Publication-time parsing (relative "N minutes/hours ago"
                # labels) is currently disabled; the field is left empty.
                item['pubDate'] = ''
                item['question'] = q_a[1]
                item['answer'] = q_a[2]
                yield item
        except Exception as e:
            yield make_error(802, '解析html标签错误', str(e))