def parse(self, response):
    """Turn an article page into an ``AntfinItem``.

    The title is read from the element with class ``article-title``. The
    body is assembled from the ``<p>`` tags inside the element with class
    ``markdown-content content``, leaving out the last three paragraphs and
    any paragraph that is empty or whose text already occurs in the body
    collected so far. Whitespace characters are then removed from the body,
    and its first 60 characters are used as the abstract.
    """
    soup = BeautifulSoup(response.text, "lxml")
    article = soup.find(attrs={'class': 'markdown-content content'})
    paragraphs = article.find_all('p')[:-3]
    title = soup.find(attrs={'class': 'article-title'}).text

    content = ''
    for paragraph in paragraphs:
        text = paragraph.text
        # Skip blank paragraphs and text that is already part of the body.
        if text == '' or text in content:
            continue
        content += text

    for junk in ("\n", "\r", " ", "\t", "\xa0", '\u3000'):
        content = content.replace(junk, "")

    item = AntfinItem()
    item['type'] = 2
    item['url'] = response.meta['url']
    item['title'] = title
    item['abstract'] = content[:60]
    item['content'] = content
    item['vender'] = ''
    return item
def parse_2(self, response):
    """Turn an article page laid out with ``top30`` blocks into an ``AntfinItem``.

    The body is the concatenated text of the ``<p>``/``<em>`` tags (all but
    the last) found in the third element with class ``top30``; the title is
    the second ``<h1>`` on the page. Whitespace characters are removed from
    both, and the site tagline is removed from the body. The abstract is the
    first 60 characters of the cleaned body.

    Raises:
        IndexError: if the page has fewer than three ``top30`` elements or
            fewer than two ``<h1>`` tags.
        KeyError: if ``response.meta`` has no ``'url'`` entry.
    """
    soup = BeautifulSoup(response.text, "lxml")
    blocks = soup.find_all(attrs={'class': 'top30'})
    content = ''.join(
        node.text for node in blocks[2].find_all(['p', 'em'])[:-1]
    )
    title = soup.find_all('h1')[1].text

    tagline = '| 区块链爱好者_区块链技术_区块链开发_区块链是什么'
    for junk in ("\n", "\r", " ", "\t", "\xa0"):
        content = content.replace(junk, "")
        # Strip the tagline the same way: once spaces are gone from the
        # body, the literal (which contains a space) could never match.
        tagline = tagline.replace(junk, "")
    content = content.replace(tagline, '')

    for junk in ("\n", "\r", " ", "\t", "\xa0", '\u3000'):
        title = title.replace(junk, "")

    item = AntfinItem()
    item['type'] = 2
    item['url'] = response.meta['url']
    item['title'] = title
    item['abstract'] = content[:60]
    item['content'] = content
    item['vender'] = ''
    return item
def parse_3(self, response):
    """Turn a page made of ``<div class="para">`` blocks into an ``AntfinItem``.

    The body is the concatenated text of every ``div`` with class ``para``,
    with newlines, carriage returns, spaces and non-breaking spaces removed.
    The title is taken from ``response.meta['key']``.

    The abstract is the first 40 characters of the *cleaned* body. Slicing
    the raw text first and cleaning afterwards produced abstracts shorter
    than 40 characters (sometimes empty) that could still contain
    non-breaking spaces.

    Raises:
        KeyError: if ``response.meta`` lacks ``'url'`` or ``'key'``.
    """
    soup = BeautifulSoup(response.text, "lxml")
    paragraphs = soup.find_all(name='div', attrs={'class': 'para'})
    content = ''.join(paragraph.text for paragraph in paragraphs)

    for junk in ('\n', "\r", " ", '\xa0'):
        content = content.replace(junk, '')

    item = AntfinItem()
    item['type'] = 2
    item['url'] = response.meta['url']
    item['title'] = response.meta['key']
    item['abstract'] = content[:40]
    item['content'] = content
    item['vender'] = ''
    return item
def parse(self, response):
    """Turn a story page into an ``AntfinItem``.

    The body is the text of the element with class ``storyBody`` and the
    title is the text of the first ``<h1>``. Newlines, carriage returns,
    tabs, non-breaking spaces and ideographic spaces are removed from the
    body (ordinary spaces are kept). The first 60 characters of the cleaned
    body are stored under ``'summary'``.
    """
    soup = BeautifulSoup(response.text, "lxml")
    story = soup.find(attrs={'class': 'storyBody'})
    content = story.text
    title = soup.find(['h1']).text

    for junk in ("\n", "\r", "\t", "\xa0", '\u3000'):
        content = content.replace(junk, "")

    item = AntfinItem()
    item['type'] = 2
    item['url'] = response.meta['url']
    item['title'] = title
    item['summary'] = content[:60]
    item['content'] = content
    item['vender'] = ''
    return item
def parse_2(self, response):
    """Turn a page into an ``AntfinItem`` using every ``<p>`` and the first ``<h3>``.

    The body is the concatenated text of all ``<p>`` tags with newlines and
    carriage returns removed; the title is the first ``<h3>`` with newlines,
    carriage returns and spaces removed. The raw title and cleaned body are
    printed to stdout before the item is built. The abstract is the first
    40 characters of the body.
    """
    soup = BeautifulSoup(response.text, "lxml")
    content = ''.join(paragraph.text for paragraph in soup.find_all('p'))
    title = soup.find_all('h3')[0].text

    for line_break in ("\n", "\r"):
        content = content.replace(line_break, "")
    print ('标题:'+title+'内容:'+content)

    clean_title = title
    for junk in ('\n', "\r", " "):
        clean_title = clean_title.replace(junk, "")

    item = AntfinItem()
    item['type'] = 2
    item['url'] = response.meta['url']
    item['title'] = clean_title
    item['abstract'] = content[:40]
    item['content'] = content
    item['vender'] = ''
    return item
def parse_2(self, response):
    """Turn a page with a ``maintext`` body into an ``AntfinItem``.

    The body is the concatenated text of the ``<p>`` tags inside the element
    with class ``maintext``; the title is the ``<h1>`` inside the element
    with class ``sctopbannercon``. Newlines, carriage returns, spaces and
    tabs are removed from both. The first 40 characters of the body are
    stored under ``'summary'``.
    """
    def squash(text):
        # Drop line breaks, spaces and tabs.
        for junk in ("\n", "\r", " ", "\t"):
            text = text.replace(junk, "")
        return text

    soup = BeautifulSoup(response.text, "lxml")
    main_text = soup.find(attrs={'class': 'maintext'})
    paragraphs = main_text.find_all('p')
    content = ''.join(paragraph.text for paragraph in paragraphs)
    title = soup.find(attrs={'class': 'sctopbannercon'}).h1.text
    content = squash(content)

    item = AntfinItem()
    item['type'] = 2
    item['url'] = response.meta['url']
    item['title'] = squash(title)
    item['summary'] = content[:40]
    item['content'] = content
    # NOTE(review): this callback writes 'vendor', while the other callbacks
    # in this file write 'vender' -- confirm which field AntfinItem declares.
    item['vendor'] = ''
    return item
def parse(self, response):
    """Yield items for one page of JSON search results and schedule the rest.

    The response body is JSON whose ``data.hits.hits`` list holds the
    results; each hit supplies ``_source.title``, ``_source.type``,
    ``_source.content`` and ``highlight.content``. ``data.hits.total`` is
    the overall result count, and further pages are requested in steps of
    10 via ``self.url_format``.

    Follow-up requests carry ``query`` in their ``meta`` (request meta is
    not inherited from the response that spawned the request) together with
    a ``paginated`` flag, so that only the first page fans out. Without
    this, every later page saw ``query`` as ``None`` and re-issued the whole
    pagination with "None" formatted into the URL.

    Yields:
        AntfinItem: one per hit on this page.
        scrapy.Request: one per additional result page (first page only).
    """
    payload = json.loads(response.text)
    hits = payload['data']['hits']

    for hit in hits['hits']:
        source = hit['_source']
        item = AntfinItem()
        item['title'] = source['title']
        item['abstract'] = hit['highlight']['content']
        item['type'] = source['type']
        item['content'] = source['content']
        yield item

    # Pages scheduled below must not schedule the same pages again.
    if response.meta.get('paginated'):
        return

    total = hits['total']
    query = response.meta.get('query')
    # range() is empty when total <= 10, so no separate guard is needed.
    for start in range(10, total, 10):
        yield scrapy.Request(
            self.url_format.format(query=query, start=start),
            callback=self.parse,
            meta={'query': query, 'paginated': True},
        )
def parse_2(self, response):
    """Turn an article page into an ``AntfinItem``.

    The body is the full text of the element with class ``article-content``
    and the title is the first ``<h3>``. Newlines, carriage returns, spaces,
    tabs, non-breaking spaces and ideographic spaces are removed from both.
    The abstract is the first 60 characters of the cleaned body.
    """
    def squash(text):
        # Remove every whitespace-like character this parser strips.
        for junk in ("\n", "\r", " ", "\t", "\xa0", '\u3000'):
            text = text.replace(junk, "")
        return text

    soup = BeautifulSoup(response.text, "lxml")
    article = soup.find(attrs={'class': 'article-content'})
    content = article.text
    title = soup.find('h3').text
    content = squash(content)

    item = AntfinItem()
    item['type'] = 2
    item['url'] = response.meta['url']
    item['title'] = squash(title)
    item['abstract'] = content[:60]
    item['content'] = content
    item['vender'] = ''
    return item
def parse(self, response):
    """Turn an article page with an ``a-content`` body into an ``AntfinItem``.

    The body is the concatenated text of the ``<p>`` and ``<b>`` tags inside
    the element with class ``a-content``, with newlines, carriage returns,
    spaces, tabs, non-breaking spaces and ideographic spaces removed. The
    title is the text of the first ``<h1>``, stored unmodified. The abstract
    is the first 60 characters of the cleaned body.
    """
    soup = BeautifulSoup(response.text, "lxml")
    container = soup.find(attrs={'class': 'a-content'})
    fragments = container.find_all(['p', 'b'])
    content = ''.join(fragment.text for fragment in fragments)
    title = soup.find(['h1']).text

    for junk in ("\n", "\r", " ", "\t", "\xa0", '\u3000'):
        content = content.replace(junk, "")

    item = AntfinItem()
    item['type'] = 2
    item['url'] = response.meta['url']
    item['title'] = title
    item['abstract'] = content[:60]
    item['content'] = content
    item['vender'] = ''
    return item
def parse(self, response):
    """Turn an article page into an ``AntfinItem``.

    The body is the full text of the element with class ``article-content``
    with newlines, carriage returns, spaces, tabs, non-breaking spaces and
    ideographic spaces removed. The title is the ``<h1>`` inside the element
    with class ``article-page``, stored unmodified. The abstract is the
    first 60 characters of the cleaned body.
    """
    soup = BeautifulSoup(response.text, "lxml")
    content = soup.find(attrs={'class': 'article-content'}).text
    page = soup.find(attrs={'class': 'article-page'})
    title = page.find('h1').text

    for junk in ("\n", "\r", " ", "\t", "\xa0", '\u3000'):
        content = content.replace(junk, "")

    item = AntfinItem()
    item['type'] = 2
    item['url'] = response.meta['url']
    item['title'] = title
    item['abstract'] = content[:60]
    item['content'] = content
    item['vender'] = ''
    return item
def parse_2(self, response):
    """Turn a solution page into an ``AntfinItem``.

    The body is assembled from the first element found for each of a fixed
    list of ``solution-*-v5`` section classes; sections missing from the
    page are skipped, and each section's text is passed through ``is_ustr``
    before being appended. The title is the first ``<h1>``. The summary is
    the text of the element with class ``solution-banner-text`` when
    present, otherwise the first 40 characters of the cleaned body.
    Whitespace characters are removed from title, body and summary.
    """
    section_classes = [
        'solution-trends-new-v5',
        'solution-scene-v5',
        'solution-framework-v5',
        'solution-trends-v5',
        'solution-challenge-v5',
        'solution-architecturel-v5',
        'solution-advantage-v5',
        'solution-service-v5',
    ]

    def squash(text, extra=()):
        # Remove line breaks, spaces and tabs, plus any extra characters.
        for junk in ("\n", "\r", " ", "\t") + tuple(extra):
            text = text.replace(junk, "")
        return text

    soup = BeautifulSoup(response.text, "lxml")

    parts = []
    for css_class in section_classes:
        section = soup.find(attrs={'class': css_class})
        if section is not None:
            parts.append(f'{is_ustr(section.text)}')
    content = ''.join(parts)

    title = squash(soup.find('h1').text)
    content = squash(content, extra=("\xa0", '\u3000'))

    banner = soup.find(attrs={'class': 'solution-banner-text'})
    if banner is not None:
        summary = squash(banner.text)
    else:
        summary = content[:40]

    item = AntfinItem()
    item['type'] = 1
    item['url'] = response.meta['url']
    item['title'] = title
    item['summary'] = summary
    item['content'] = content
    item['vender'] = '华为云'
    return item
def parse_2(self, response):
    """Turn a product page into an ``AntfinItem``.

    The body is assembled from the first element found for each of a fixed
    list of ``product-*`` section classes; sections missing from the page
    are skipped, and each section's text is passed through ``is_ustr``
    before being appended. The title is the first ``<h1>``. The summary is
    the text of the element with class ``product-banner-paragraph`` when
    present, otherwise the first 40 characters of the cleaned body.
    Whitespace characters are removed from title, body and summary.
    """
    section_classes = [
        'product-advantage-v5',
        'product-scene-common-v5',
        'product-function-layer',
        'product-function2-v5',
        'product-guide-v5',
        'product-newfeature-v5',
    ]

    def squash(text, extra=()):
        # Remove line breaks, spaces and tabs, plus any extra characters.
        for junk in ("\n", "\r", " ", "\t") + tuple(extra):
            text = text.replace(junk, "")
        return text

    soup = BeautifulSoup(response.text, "lxml")

    parts = []
    for css_class in section_classes:
        section = soup.find(attrs={'class': css_class})
        if section is not None:
            parts.append(f'{is_ustr(section.text)}')
    content = ''.join(parts)

    title = squash(soup.find('h1').text)
    content = squash(content, extra=("\xa0", '\u3000'))

    banner = soup.find(attrs={'class': 'product-banner-paragraph'})
    if banner is not None:
        summary = squash(banner.text)
    else:
        summary = content[:40]

    item = AntfinItem()
    item['type'] = 1
    item['url'] = response.meta['url']
    item['title'] = title
    item['summary'] = summary
    item['content'] = content
    item['vender'] = '华为云'
    return item