Ejemplo n.º 1
0
    def parse(self, response):
        """Parse a cnyes news-search JSON response into FinanceCrawlItem objects.

        Extracts one item per news entry, pulling the highlighted company
        name out of the ``<mark>...</mark>`` span in each entry's content
        (falling back to ``response.meta['company']`` when absent).

        Returns:
            list: one populated FinanceCrawlItem per entry in items.data.
        """
        all_item = []
        print('-----------------------------------')
        print(response.url)
        print('-----------------------------------')
        json_datas = json.loads(response.text)
        datas = json_datas['items']['data']
        # The search API highlights the matched company name with <mark> tags.
        pattern = re.compile("<mark>.*</mark>")
        for data in datas:
            item = FinanceCrawlItem()
            time_num = int(data['publishAt'])
            date = datetime.fromtimestamp(time_num).strftime("%Y%m%d")
            match = pattern.search(data['content'])
            # Bug fix: the original called .group() on the search result
            # before the None-check, so a content with no <mark> tag raised
            # AttributeError and the meta fallback branch was unreachable.
            if match:
                item['company'] = match.group().replace('<mark>', '').replace(
                    '</mark>', '')
            else:
                item['company'] = response.meta['company']
            item['date'] = date
            item['title'] = data['title']
            item['link'] = 'https://news.cnyes.com/news/id/' + str(
                data['newsId'])

            item['due_date'] = response.meta['date']
            all_item.append(item)
        return all_item
Ejemplo n.º 2
0
 def parse_product(self, response):
     """Scrape title, publish time and body text from one article page."""
     soup = BeautifulSoup(response.text, 'lxml')
     item = FinanceCrawlItem()
     item['title'] = soup.select_one('h1.margin_b20').text
     item['time'] = soup.select_one('div.icon_time').text
     # Strip newlines and the page's fixed run of tabs/spaces from the body.
     raw_body = soup.select_one('div#news_detail_div').text
     item['text'] = raw_body.replace('\n', '').replace('\t\t\t\t\t   \t\t', '')
     item['link'] = response.url
     return item
Ejemplo n.º 3
0
 def parse_product(self, response):
     """Scrape an article page: title, publish time, first non-empty paragraph."""
     page = BeautifulSoup(response.text, 'lxml')
     item = FinanceCrawlItem()
     item['title'] = page.select_one('h1').text
     item['time'] = page.select_one('div.ndArticle_creat').text.replace(
         '出版時間:', '')
     paragraphs = page.select('div.ndArticle_margin p')
     body = paragraphs[0].text.replace('\xa0', '').replace('\t', '')
     # First paragraph is sometimes empty after cleanup; fall back to the second.
     if not body:
         body = paragraphs[1].text.replace('\xa0', '').replace('\t', '')
     item['text'] = body
     item['link'] = response.url
     return item
Ejemplo n.º 4
0
 def parse_product(self, response):
     """Build an item from meta (title/link) plus time and body scraped here."""
     item = FinanceCrawlItem()
     page = BeautifulSoup(response.text, 'lxml')
     # The last four <p> tags are dropped — presumably footer/related-link
     # boilerplate; confirm against the site layout.
     body = ''.join(p.text for p in page.select('div.text p')[:-4])
     item['time'] = page.select_one('span.time').text
     item['title'] = response.meta['title']
     item['link'] = response.meta['link']
     item['text'] = body
     return item
Ejemplo n.º 5
0
 def parse_product(self, response):
     """Build an item from meta info plus time/body parsed from the page."""
     item = FinanceCrawlItem()
     page = BeautifulSoup(response.text, 'lxml')
     item['title'] = response.meta['title']
     item['link'] = response.meta['link']
     item['time'] = page.select_one('time.entry-date').text
     # NOTE(review): `'figure' not in t` tests membership among the tag's
     # children, not substring of its markup — looks suspicious; confirm
     # the intent before changing it. Behavior preserved as-is here.
     # The first paragraph is skipped deliberately ([1:]).
     pieces = [t.text
               for t in page.select('div.td-post-content p')[1:]
               if 'figure' not in t]
     item['text'] = ''.join(pieces)
     return item
Ejemplo n.º 6
0
    def parse(self, response):
        """Parse a SETN news-list page into one item per news card.

        Reads the target company and due date from ``response.meta`` and
        pairs them with each headline card found on the page.

        Bug fix: the original created a single item before the loop and
        ``return item`` sat inside the loop body, so only the first news
        card was ever emitted and the rest of the loop was dead code. A
        fresh item is now built per card and all of them are returned,
        matching the list-returning style of the other parse() in this
        file (Scrapy accepts an iterable of items from a callback).

        Returns:
            list: one populated FinanceCrawlItem per news card.
        """
        due_date = response.meta['date']
        company = response.meta['company']
        source = BeautifulSoup(response.text, 'lxml')
        news_tags = source.select('div.newsimg-area-item-2 ')
        all_items = []
        for tag in news_tags:
            url = tag.select_one('a.gt').get('href')
            title = tag.select_one('div.newsimg-area-text-2 ').text
            # [:11] trims the date label — presumably "YYYY/MM/DD "-style;
            # confirm against the live markup.
            news_date = tag.select_one('div.label-area div.newsimg-date').text[:11]
            print(title, url, news_date, due_date)
            item = FinanceCrawlItem()
            item['company'] = company
            item['date'] = news_date
            item['due_date'] = due_date
            item['title'] = title
            item['link'] = 'https://www.setn.com/' + url
            all_items.append(item)
        return all_items
Ejemplo n.º 7
0
    def parse_product(self, response):
        """Scrape publish time and body text; title/link come from meta."""
        item = FinanceCrawlItem()
        page = BeautifulSoup(response.text, 'lxml')
        # Two page layouts exist: prefer <time class="date">, otherwise
        # fall back to <time class="news-time">.
        stamp_tag = page.select_one('time.date')
        if stamp_tag:
            stamp = stamp_tag.get('datetime')
        else:
            stamp = page.select_one('time.news-time').get('datetime')

        body = ''.join(p.text for p in page.select('div.story p'))
        # [:-9] drops the last nine characters of the datetime attribute —
        # presumably the time-of-day suffix; confirm against the markup.
        item['time'] = stamp[:-9]
        # \u3000 is the ideographic (full-width) space used as title padding.
        item['title'] = response.meta['title'].replace('\u3000', '')
        item['link'] = response.meta['link']
        item['text'] = body
        return item