Exemple #1
0
    def parse(self, response):
        """Build an ``AntfinItem`` from an article page.

        The body is assembled from the ``<p>`` tags under the first element
        looked up with class ``markdown-content content`` (the last three
        paragraphs are dropped); the title is the text of the first element
        looked up with class ``article-title``.

        Args:
            response: Page response. ``response.text`` is parsed as HTML and
                ``response.meta['url']`` is copied onto the item.

        Returns:
            The populated ``AntfinItem``; ``abstract`` is the first 60
            characters of the cleaned body.
        """
        soup = BeautifulSoup(response.text, "lxml")
        container = soup.find(attrs={'class': 'markdown-content content'})
        paragraphs = container.find_all('p')[:-3]
        title = soup.find(attrs={'class': 'article-title'}).text

        # Append each paragraph once: empty paragraphs and text that already
        # occurs in the accumulated body are skipped.
        content = ''
        for paragraph in paragraphs:
            text = paragraph.text
            if text == '' or text in content:
                continue
            content += text

        # Delete newlines, carriage returns, spaces, tabs, no-break spaces
        # and ideographic spaces in a single pass.
        content = content.translate(
            str.maketrans('', '', '\n\r \t\xa0\u3000'))

        item = AntfinItem()
        item['type'] = 2
        item['url'] = response.meta['url']
        item['title'] = title
        item['abstract'] = content[:60]
        item['content'] = content
        item['vender'] = ''
        return item
Exemple #2
0
 def parse_2(self, response):
     """Build an ``AntfinItem`` from an article page.

     The body is the concatenated text of the ``<p>``/``<em>`` tags (all
     but the last) inside the third element with class ``top30``; the
     title is the text of the second ``<h1>`` on the page.

     Args:
         response: Page response. ``response.text`` is parsed as HTML and
             ``response.meta['url']`` is copied onto the item.

     Returns:
         The populated ``AntfinItem``; ``abstract`` is the first 60
         characters of the cleaned body.

     Raises:
         IndexError: If the page has fewer than three ``top30`` elements
             or fewer than two ``<h1>`` tags.
     """
     soup = BeautifulSoup(response.text, "lxml")
     blocks = soup.find_all(attrs={'class': 'top30'})
     content = ''.join(
         node.text for node in blocks[2].find_all(['p', 'em'])[:-1])
     title = soup.find_all('h1')[1].text

     # Delete newlines, carriage returns, spaces, tabs and no-break spaces.
     content = content.translate(str.maketrans('', '', '\n\r \t\xa0'))
     # Drop the site tagline. Whitespace is already gone at this point, so
     # the tagline is matched in its whitespace-free form; the previous
     # pattern contained a space and therefore could never match.
     tagline = '|区块链爱好者_区块链技术_区块链开发_区块链是什么'
     content = content.replace(tagline, '')

     item = AntfinItem()
     item['type'] = 2
     item['url'] = response.meta['url']
     # The title additionally loses ideographic spaces (U+3000).
     item['title'] = title.translate(
         str.maketrans('', '', '\n\r \t\xa0\u3000'))
     item['abstract'] = content[:60]
     item['content'] = content
     item['vender'] = ''
     return item
Exemple #3
0
 def parse_3(self, response):
     """Build an ``AntfinItem`` from the ``<div class="para">`` blocks.

     Args:
         response: Page response. ``response.text`` is parsed as HTML;
             ``response.meta['url']`` and ``response.meta['key']`` become
             the item's ``url`` and ``title``.

     Returns:
         The populated ``AntfinItem``.
     """
     soup = BeautifulSoup(response.text, "lxml")
     raw_text = ''.join(
         div.text for div in soup.find_all(name='div', attrs={'class': 'para'}))

     # The abstract is cut from the raw text first and cleaned afterwards,
     # so it can end up shorter than 40 characters; no-break spaces are
     # removed from the full content only.
     abstract = raw_text[:40].translate(str.maketrans('', '', '\n\r '))
     cleaned = raw_text.translate(str.maketrans('', '', '\n\r \xa0'))

     item = AntfinItem()
     item['type'] = 2
     item['url'] = response.meta['url']
     item['title'] = response.meta['key']
     item['abstract'] = abstract
     item['content'] = cleaned
     item['vender'] = ''
     return item
Exemple #4
0
 def parse(self, response):
     """Build an ``AntfinItem`` from a page with a ``storyBody`` element.

     Args:
         response: Page response. ``response.text`` is parsed as HTML and
             ``response.meta['url']`` is copied onto the item.

     Returns:
         The populated ``AntfinItem``; ``summary`` is the first 60
         characters of the cleaned body.
     """
     soup = BeautifulSoup(response.text, "lxml")
     body_text = soup.find(attrs={'class': 'storyBody'}).text
     title = soup.find(['h1']).text

     # Delete newlines, carriage returns, tabs, no-break spaces and
     # ideographic spaces. Ordinary spaces are kept.
     content = body_text.translate(str.maketrans('', '', '\n\r\t\xa0\u3000'))

     item = AntfinItem()
     item['type'] = 2
     item['url'] = response.meta['url']
     item['title'] = title
     item['summary'] = content[:60]
     item['content'] = content
     item['vender'] = ''
     return item
Exemple #5
0
 def parse_2(self,response):
     """Build an ``AntfinItem`` from every ``<p>`` tag on the page.

     The title is the text of the first ``<h3>`` tag.

     Args:
         response: Page response. ``response.text`` is parsed as HTML and
             ``response.meta['url']`` is copied onto the item.

     Returns:
         The populated ``AntfinItem``; ``abstract`` is the first 40
         characters of the cleaned body.

     Raises:
         IndexError: If the page contains no ``<h3>`` tag.
     """
     soup = BeautifulSoup(response.text, "lxml")
     content = ''.join(paragraph.text for paragraph in soup.find_all('p'))
     title = soup.find_all('h3')[0].text

     # Only line breaks are removed from the body; spaces are kept.
     content = content.translate(str.maketrans('', '', '\n\r'))
     # The debug print of the full title/body was removed: it wrote every
     # scraped page to stdout and could fail on consoles that cannot encode
     # the text.

     item = AntfinItem()
     item['type'] = 2
     item['url'] = response.meta['url']
     item['title'] = title.translate(str.maketrans('', '', '\n\r '))
     item['abstract'] = content[:40]
     item['content'] = content
     item['vender'] = ''
     return item
Exemple #6
0
 def parse_2(self,response):
     """Build an ``AntfinItem`` from the ``maintext`` section of a page.

     The body is the concatenated text of the ``<p>`` tags inside the
     ``maintext`` element; the title is the ``<h1>`` inside the
     ``sctopbannercon`` element.

     Args:
         response: Page response. ``response.text`` is parsed as HTML and
             ``response.meta['url']`` is copied onto the item.

     Returns:
         The populated ``AntfinItem``; ``summary`` is the first 40
         characters of the cleaned body.
     """
     soup = BeautifulSoup(response.text, "lxml")
     main_section = soup.find(attrs={'class': 'maintext'})
     content = ''.join(
         paragraph.text for paragraph in main_section.find_all('p'))
     title = soup.find(attrs={'class': 'sctopbannercon'}).h1.text

     # Newlines, carriage returns, spaces and tabs are deleted from both
     # the body and the title.
     whitespace = str.maketrans('', '', '\n\r \t')
     content = content.translate(whitespace)

     item = AntfinItem()
     item['type'] = 2
     item['url'] = response.meta['url']
     item['title'] = title.translate(whitespace)
     item['summary'] = content[:40]
     item['content'] = content
     # NOTE(review): the key is spelled 'vendor' here; confirm it matches
     # the field declared on AntfinItem.
     item['vendor'] = ''
     return item
Exemple #7
0
    def parse(self, response):
        """Yield one ``AntfinItem`` per search hit, then paginate.

        The response body is decoded as JSON; hits are read from
        ``data.hits.hits`` and the overall hit count from
        ``data.hits.total``.

        Args:
            response: JSON search response. ``response.meta['query']`` (if
                present) is the search term used to build follow-up URLs.

        Yields:
            ``AntfinItem`` objects for every hit, followed by one
            ``scrapy.Request`` per result offset 10, 20, ... below the
            total hit count.
        """
        payload = json.loads(response.text)
        hits = payload['data']['hits']
        for hit in hits['hits']:
            source = hit['_source']
            item = AntfinItem()
            item['title'] = source['title']
            item['abstract'] = hit['highlight']['content']
            item['type'] = source['type']
            item['content'] = source['content']
            yield item

        total = hits['total']
        query = response.meta.get('query')
        # range() is empty when total <= 10, so no explicit guard is needed.
        for start in range(10, total, 10):
            yield scrapy.Request(
                self.url_format.format(query=query, start=start),
                callback=self.parse,
                # Carry the query forward: without it the follow-up
                # responses see no 'query' in meta and would format the
                # literal "None" into their own pagination URLs.
                meta={'query': query},
            )
Exemple #8
0
 def parse_2(self,response):
     """Build an ``AntfinItem`` from a page with an ``article-content`` element.

     The body is the full text of the ``article-content`` element and the
     title is the text of the first ``<h3>`` tag.

     Args:
         response: Page response. ``response.text`` is parsed as HTML and
             ``response.meta['url']`` is copied onto the item.

     Returns:
         The populated ``AntfinItem``; ``abstract`` is the first 60
         characters of the cleaned body.
     """
     soup = BeautifulSoup(response.text, "lxml")
     body_text = soup.find(attrs={'class': 'article-content'}).text
     title = soup.find('h3').text

     # Newlines, carriage returns, spaces, tabs, no-break spaces and
     # ideographic spaces are deleted from both the body and the title.
     whitespace = str.maketrans('', '', '\n\r \t\xa0\u3000')
     content = body_text.translate(whitespace)

     item = AntfinItem()
     item['type'] = 2
     item['url'] = response.meta['url']
     item['title'] = title.translate(whitespace)
     item['abstract'] = content[:60]
     item['content'] = content
     item['vender'] = ''
     return item
Exemple #9
0
 def parse(self, response):
     """Build an ``AntfinItem`` from a page with an ``a-content`` element.

     The body is the concatenated text of the ``<p>`` and ``<b>`` tags
     inside the ``a-content`` element; the title is the text of the first
     ``<h1>`` tag, stored as-is.

     Args:
         response: Page response. ``response.text`` is parsed as HTML and
             ``response.meta['url']`` is copied onto the item.

     Returns:
         The populated ``AntfinItem``; ``abstract`` is the first 60
         characters of the cleaned body.
     """
     soup = BeautifulSoup(response.text, "lxml")
     article = soup.find(attrs={'class': 'a-content'})
     content = ''.join(node.text for node in article.find_all(['p', 'b']))
     title = soup.find(['h1']).text

     # Delete newlines, carriage returns, spaces, tabs, no-break spaces
     # and ideographic spaces in a single pass.
     content = content.translate(str.maketrans('', '', '\n\r \t\xa0\u3000'))

     item = AntfinItem()
     item['type'] = 2
     item['url'] = response.meta['url']
     item['title'] = title
     item['abstract'] = content[:60]
     item['content'] = content
     item['vender'] = ''
     return item
Exemple #10
0
 def parse(self, response):
     """Build an ``AntfinItem`` from an article page.

     The body is the full text of the ``article-content`` element; the
     title is the first ``<h1>`` inside the ``article-page`` element,
     stored as-is.

     Args:
         response: Page response. ``response.text`` is parsed as HTML and
             ``response.meta['url']`` is copied onto the item.

     Returns:
         The populated ``AntfinItem``; ``abstract`` is the first 60
         characters of the cleaned body.
     """
     soup = BeautifulSoup(response.text, "lxml")
     body_text = soup.find(attrs={'class': 'article-content'}).text
     page = soup.find(attrs={'class': 'article-page'})
     title = page.find('h1').text

     # Delete newlines, carriage returns, spaces, tabs, no-break spaces
     # and ideographic spaces in a single pass.
     content = body_text.translate(str.maketrans('', '', '\n\r \t\xa0\u3000'))

     item = AntfinItem()
     item['type'] = 2
     item['url'] = response.meta['url']
     item['title'] = title
     item['abstract'] = content[:60]
     item['content'] = content
     item['vender'] = ''
     return item
Exemple #11
0
 def parse_2(self, response):
     """Build an ``AntfinItem`` from a solution page.

     The body is assembled from a fixed list of ``solution-*-v5`` sections,
     each passed through the externally defined ``is_ustr`` helper;
     sections missing from the page are skipped. The summary comes from
     the ``solution-banner-text`` element when present, otherwise it is
     the first 40 characters of the cleaned body.

     Args:
         response: Page response. ``response.text`` is parsed as HTML and
             ``response.meta['url']`` is copied onto the item.

     Returns:
         The populated ``AntfinItem``.
     """
     soup = BeautifulSoup(response.text, "lxml")
     section_classes = [
         'solution-trends-new-v5', 'solution-scene-v5',
         'solution-framework-v5', 'solution-trends-v5',
         'solution-challenge-v5', 'solution-architecturel-v5',
         'solution-advantage-v5', 'solution-service-v5',
     ]
     content = ''
     for css_class in section_classes:
         section = soup.find(attrs={'class': css_class})
         # Identity test: find() returns None when nothing matches.
         if section is not None:
             content = f'{content}{is_ustr(section.text)}'

     # Title and summary lose newlines, carriage returns, spaces and tabs;
     # the body additionally loses no-break and ideographic spaces.
     basic_whitespace = str.maketrans('', '', '\n\r \t')
     title = soup.find('h1').text.translate(basic_whitespace)
     content = content.translate(str.maketrans('', '', '\n\r \t\xa0\u3000'))

     banner = soup.find(attrs={'class': 'solution-banner-text'})
     if banner is not None:
         summary = banner.text.translate(basic_whitespace)
     else:
         summary = content[:40]

     item = AntfinItem()
     item['type'] = 1
     item['url'] = response.meta['url']
     item['title'] = title
     item['summary'] = summary
     item['content'] = content
     item['vender'] = '华为云'
     return item
Exemple #12
0
 def parse_2(self, response):
     """Build an ``AntfinItem`` from a product page.

     The body is assembled from a fixed list of ``product-*`` sections,
     each passed through the externally defined ``is_ustr`` helper;
     sections missing from the page are skipped. The summary comes from
     the ``product-banner-paragraph`` element when present, otherwise it
     is the first 40 characters of the cleaned body.

     Args:
         response: Page response. ``response.text`` is parsed as HTML and
             ``response.meta['url']`` is copied onto the item.

     Returns:
         The populated ``AntfinItem``.
     """
     soup = BeautifulSoup(response.text, "lxml")
     section_classes = [
         'product-advantage-v5', 'product-scene-common-v5',
         'product-function-layer', 'product-function2-v5',
         'product-guide-v5', 'product-newfeature-v5',
     ]
     content = ''
     for css_class in section_classes:
         section = soup.find(attrs={'class': css_class})
         # Identity test: find() returns None when nothing matches.
         if section is not None:
             content = f'{content}{is_ustr(section.text)}'

     # Title and summary lose newlines, carriage returns, spaces and tabs;
     # the body additionally loses no-break and ideographic spaces.
     basic_whitespace = str.maketrans('', '', '\n\r \t')
     title = soup.find('h1').text.translate(basic_whitespace)
     content = content.translate(str.maketrans('', '', '\n\r \t\xa0\u3000'))

     banner = soup.find(attrs={'class': 'product-banner-paragraph'})
     if banner is not None:
         summary = banner.text.translate(basic_whitespace)
     else:
         summary = content[:40]

     item = AntfinItem()
     item['type'] = 1
     item['url'] = response.meta['url']
     item['title'] = title
     item['summary'] = summary
     item['content'] = content
     item['vender'] = '华为云'
     return item