def parse_news(self, response):
    soup = bs(response.text, "html.parser")
    item = DemoItem()
    item["category1"] = response.meta["category1"]
    item["category2"] = response.meta["category2"]
    time_tag = soup.find("time", "entry-date updated td-module-date")
    pub_time = time_tag.text.strip() if time_tag else "0000-00-00 00:00:00"
    item["pub_time"] = Util.format_time2(pub_time)
    title_tag = soup.find("h1", "entry-title")
    item["title"] = title_tag.text.strip() if title_tag else None
    div = soup.find("div", "td-post-content tagdiv-type")
    images = [img.get("src") for img in div.find_all("img")] if div.find_all("img") else None
    abstract = div.find("p").text.strip() if div.find("p") else None
    body = [p.text.strip() for p in div.find_all("p")] if div.find_all("p") else None
    if abstract:
        body = "\n".join(body)
    else:
        # Fall back to h4 headings when the article has no <p> paragraphs.
        abstract = div.find("h4").text.strip()
        body = [h.text.strip() for h in div.find_all("h4")] if div.find_all("h4") else []
        body = "\n".join(body)
    item["images"] = images
    item["abstract"] = abstract
    item["body"] = body
    self.logger.info(item)
    yield item
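# A minimal helper sketch (an addition, not part of the original spiders):
# many callbacks in this file repeat the "find the tag, guard against None,
# then .text.strip()" ternary. A hypothetical safe_text helper collapses it.
def safe_text(tag, default=None):
    """Return tag.text.strip() when tag exists, otherwise default."""
    return tag.text.strip() if tag is not None else default

# Usage sketch: item["title"] = safe_text(soup.find("h1", "entry-title"))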
def parse_news(self, response):
    item = DemoItem()
    soup = bs(response.text, "html.parser")
    item["category1"] = response.meta["category1"]
    item["category2"] = response.meta["category2"]
    item["title"] = soup.find(class_="post-title entry-title").text.strip()
    item["pub_time"] = Util.format_time2(soup.find(class_="published timeago").text.strip())
    content = soup.find(class_="post-body entry-content")
    images = [img.get("src") for img in content.find_all("img")] if content.find_all("img") else []
    item["images"] = images
    # Prefer the dir="ltr" blocks; fall back to the whole content text.
    body1 = ''
    for div in content.find_all(dir="ltr"):
        body1 += div.text.strip() + '\n'
    if body1 == '':
        body1 = content.text
    # Drop empty lines.
    body = ''
    for b in body1.split("\n"):
        if b != '':
            body += b + '\n'
    item["body"] = body
    item["abstract"] = body.split("\n")[0]
    yield item
def parse_detail(self, response):
    item = DemoItem()
    soup = BeautifulSoup(response.text, features="lxml")
    item['title'] = soup.select_one(".o-article .entry-content h1").text.strip()
    item['pub_time'] = khulasaa_time_switch(soup.select_one(".author-disc .date .author span").text)
    images = []
    for img in soup.select(".content-section .featured-box img"):
        images.append(img.get("src"))
    item['images'] = images
    abstract = ""
    for a in soup.select(".post-content ul li h3"):
        abstract += a.text.strip()
        abstract += "\n"
    item['abstract'] = abstract
    body = ""
    for b in soup.select(".post-content p"):
        body += b.text.strip()
    item['body'] = body
    item['category1'] = soup.select_one(".breadcrumb span span span a").text.strip()
    item['category2'] = None
    item['request_url'] = response.request.url
    item['response_url'] = response.url
    item['website_id'] = self.website_id
    item['language_id'] = self.language_id
    item['cole_time'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(time.time())))
    yield item
def parse_item(self, response):
    soup = BeautifulSoup(response.text, 'html.parser')
    item = DemoItem()
    # Category segments come from the URL path, split on '_'.
    category = response.url.split('/')[-3].split('_')
    if len(category) == 3:
        item['category1'] = category[1]
        item['category2'] = category[2]
    else:
        item['category1'] = category[0]
        item['category2'] = category[1]
    item['title'] = soup.select_one('h1.entry-title').text
    item['pub_time'] = Util.format_time2(soup.select('span.td-post-date > time')[0].text)
    item['images'] = [i.get('data-src') for i in soup.select('div.td-post-content img')]
    item['abstract'] = soup.select('div.td-post-content > p')[0].text
    ss = ''
    for i in soup.select('div.td-post-content > p'):
        ss += i.text + '\n'  # '\n' is a real newline; r'\n' would be a literal backslash-n
    item['body'] = ss
    return item
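# A quick standalone check of the raw-string distinction relied on above:
# r'\n' is two characters (backslash and 'n'), while '\n' is one newline,
# so only the latter actually separates paragraphs in the body text.
assert len('\n') == 1 and len(r'\n') == 2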
def parse(self, response):
    '''
    :param response:
    :return: first-level category links
    '''
    soup = bs(response.text, "html.parser")
    for li in soup.select("#menu-main-menu-1 > li.menu-item")[1:-2]:
        a = li.select_one("a")
        category1_url = a.get("href")
        if li.find("ul", class_="sub-menu"):
            for sub_a in li.find("ul", class_="sub-menu").select("a"):
                # Create a fresh item per request: a single mutable item
                # shared through meta would be overwritten by later
                # iterations before the callbacks run.
                item = DemoItem()
                item['category1'] = a.text
                item['category2'] = sub_a.text
                # Data is passed between levels via the meta parameter.
                yield scrapy.Request(sub_a.get("href"),
                                     callback=self.get_next_page,
                                     meta={"item": item})
        else:
            item = DemoItem()
            item['category1'] = a.text
            item['category2'] = None
            # Data is passed between levels via the meta parameter.
            yield scrapy.Request(category1_url,
                                 callback=self.get_next_page,
                                 meta={"item": item})
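# A hedged alternative sketch: since Scrapy 1.7, per-request data can travel
# in cb_kwargs instead of meta, and the callback receives it as keyword
# arguments, so no mutable item is shared between requests. This variant
# assumes get_next_page accepts category1/category2 parameters.
def parse_via_cb_kwargs(self, response):
    soup = bs(response.text, "html.parser")
    for li in soup.select("#menu-main-menu-1 > li.menu-item")[1:-2]:
        a = li.select_one("a")
        yield scrapy.Request(a.get("href"),
                             callback=self.get_next_page,
                             cb_kwargs={"category1": a.text, "category2": None})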
def parse_news(self, response):
    soup = bs(response.text, "html.parser")
    item = DemoItem()
    item["pub_time"] = Util.format_time2(soup.select('.post-meta > span')[1].text)
    title_tag = soup.find("h1", class_="post-title entry-title")
    item["title"] = title_tag.text.strip() if title_tag else None
    thumb = soup.find("div", class_="single-post-thumb")
    item["images"] = [thumb.find("img").get("src")] if thumb else None
    item["category1"] = soup.select_one(
        "#main-content > div > article > div > p > span:nth-child(3) > a").text.strip()
    item["category2"] = None
    entry = soup.find("div", class_="entry")
    # Guard both lookups: an unguarded .find("p").text would crash when
    # the entry div or its first paragraph is missing.
    item["abstract"] = entry.find("p").text.strip() if entry and entry.find("p") else None
    body = [p.text.strip() for p in entry.find_all("p")] if entry else []
    item["body"] = "\n".join(body)
    yield item
def parse_details(self, response):
    item = DemoItem()
    soup = BeautifulSoup(response.text, 'lxml')
    item['category1'] = response.meta['category1']
    item['category2'] = response.meta['category2']
    title_tag = soup.find('h1', class_='post-title entry-title')
    item['title'] = title_tag.text.strip() if title_tag else None
    item['body'] = ''  # don't forget to initialize
    item['abstract'] = ''
    # This selector extracts several different tags in a single pass.
    body_list = soup.select('.entry-content p,.entry-content h3')
    if body_list:
        for body in body_list:
            item['body'] += body.text.strip() + '\n'
        item['abstract'] = body_list[0].text.strip()
    item['images'] = []
    image_list = soup.select('.entry-content p>img,.single-featured-image>img')
    for image in image_list:
        item['images'].append(image.get('src'))
    pub_tag = soup.find('span', class_='date meta-item tie-icon')
    pub = pub_tag.text.strip() if pub_tag else None
    if pub:
        pub = Util.format_time2(pub)
    item['pub_time'] = pub
    yield item
def parse_item(self, response):
    soup = BeautifulSoup(response.text, 'html.parser')
    item = DemoItem()
    item['title'] = response.meta['title']
    item['category1'] = response.meta['category1']
    item['abstract'] = response.meta['abstract']
    item['images'] = response.meta['images']
    item['category2'] = response.meta['category2']
    if re.findall('headline', response.url):  # regular news
        ss = ''
        for i in soup.select('.dit > p > b'):
            ss += i.text + '\n'
        try:
            ss += soup.select_one('.dit > p > span').text
        except AttributeError:  # no trailing span on this page
            pass
        item['body'] = ss
        # e.g. ['Wednesday', '6', 'January', '2021', '02:12:12', 'PM']
        tt = soup.select_one('.colort').text.split()
        # reassembled as "January 6 2021 02:12:12 PM"
        tt = tt[2] + ' ' + tt[1] + ' ' + tt[3] + ' ' + tt[4] + ' ' + tt[5]
        item['pub_time'] = Util.format_time2(tt)
    elif re.findall('watchvid', response.url):  # video news
        item['body'] = soup.select_one('.dit > p').text
        item['pub_time'] = soup.select_one('.colort').text
    else:  # photo news
        item['body'] = soup.select_one('.news_saa > p').text
        item['pub_time'] = Util.format_time(0)
    return item
def parse_item(self, response):
    soup = bs(response.text, 'html.parser')
    item = DemoItem()
    item['category1'] = response.meta['category1']
    # Had to fall back to the number in the URL for the second-level
    # category; solved by passing it through the Request() meta parameter.
    item['category2'] = None
    item['title'] = soup.select('div.page-header h1')[0].text
    # The date string looks like: 'Published October 22, 2020, 4:32 PM'
    ts = soup.select('span.date ')[0].text
    month = Util.month2[ts.split(',')[0].split(' ')[1]]
    date = ts.split(',')[1] + '-' + month + '-' + ts.split(',')[0].split(' ')[2]
    date = date.strip()  # str.strip() returns a new string; assign it back
    ttt = ts.split(',')[-1].split(' ')  # e.g. ttt = ['', '4:32', 'PM']
    # %12 maps 12 AM to 0 and keeps 12 PM at 12 after the +12 below.
    shi = int(ttt[-2].split(':')[0]) % 12
    if ttt[-1] == 'PM':
        shi += 12
    time_str = str(shi) + ":" + ttt[-2].split(':')[1] + ":" + '00'
    item['pub_time'] = date + ' ' + time_str
    try:
        item['images'] = [i.get('src') for i in soup.select('div.page-content img')]
    except AttributeError:
        pass
    item['abstract'] = soup.select('div.page-content > p')[0].text
    ss = ''
    for i in soup.select('div.page-content > p'):
        ss += i.text + '\n'  # '\n', not r'\n' (literal backslash-n)
    item['body'] = ss
    yield item
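# A standard-library sketch for the same date string, assuming the source
# text really is shaped like 'Published October 22, 2020, 4:32 PM':
# datetime.strptime handles month names and the 12-hour clock (including
# the 12 AM/PM edge cases) in one call.
from datetime import datetime

def parse_pub_time(ts):
    dt = datetime.strptime(ts.strip(), 'Published %B %d, %Y, %I:%M %p')
    return dt.strftime('%Y-%m-%d %H:%M:%S')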
def parse(self, response):
    for tr in response.xpath('//table[@class="Tab"]//tr[position()>1]'):
        itemRow = {
            'id': tr.xpath('td[2]/a/@id').extract_first(default='').strip(),
            'registryNo': tr.xpath('td[2]/a/text()').re_first(r'\w+'),
            'status': tr.xpath('td[3]/a/text()').extract_first(),
            'drugName': tr.xpath('td[4]/text()').extract_first(),
            'shutZ': tr.xpath('td[5]/text()').extract_first(),
            'title': tr.xpath('td[6]/text()').extract_first(),
        }
        self.logger.info(itemRow)
        yield itemRow
    # Follow pagination when a next link exists.
    next_page = response.css('li.next a::attr(href)').get()
    if next_page is not None:
        yield response.follow(next_page, callback=self.parse)
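# A minimal ItemLoader sketch showing the loader pattern the function above
# gestures at; it assumes DemoItem defines registryNo/drugName fields, and
# imports from itemloaders.processors (scrapy.loader.processors in older
# Scrapy releases).
from scrapy.loader import ItemLoader
from itemloaders.processors import TakeFirst

class RowLoader(ItemLoader):
    default_output_processor = TakeFirst()

def parse_row(self, tr):
    l = RowLoader(item=DemoItem(), selector=tr)
    l.add_xpath('registryNo', 'td[2]/a/text()')
    l.add_xpath('drugName', 'td[4]/text()')
    return l.load_item()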
def parse(self, response):
    # News list page that already contains the full articles.
    soup = BeautifulSoup(response.text, 'html.parser')
    flag = True
    time_sel = ('#page-content-wrapper > div.container.section-1 > div > '
                'div.col-lg-9.col-md-9.col-sm-9.col-xs-12 > div > div h5')
    last_pub_time = Util.format_time2(soup.select(time_sel)[-1].text)
    if self.time is None or Util.format_time3(last_pub_time) >= int(self.time):
        all_pub_time = [Util.format_time2(i.text) for i in soup.select(time_sel)]
        all_title = [i.text.strip() for i in soup.select('.lk-tle')]
        all_images = ['https://www.myanmarisis.org' + i.get('src')
                      for i in soup.select('.img-responsive.lk-img')]
        all_body = [i.text.strip() for i in soup.select(
            '#page-content-wrapper > div.container.section-1 > div > '
            'div.col-lg-9.col-md-9.col-sm-9.col-xs-12 > div p')]
        for i in range(len(all_title)):  # one entry per article on the page
            item = DemoItem()
            item['pub_time'] = all_pub_time[i]
            item['images'] = [all_images[i]]
            item['title'] = all_title[i]
            item['body'] = all_body[i]
            item['category1'] = 'event'
            item['category2'] = None
            item['abstract'] = all_body[i].split('\n')[0]
            yield item
    else:
        self.logger.info('Time cutoff reached!')
        flag = False
    if flag:
        try:
            nextPage = soup.select_one('.active ~ li a').get('href')
            yield Request(url=nextPage)
        except AttributeError:
            self.logger.info("No more pages.")
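# A drop-in sketch for the index loop above: zip() pairs the parallel lists
# and stops at the shortest one, so no fixed per-page count is assumed.
def build_items(all_pub_time, all_title, all_images, all_body):
    for pub_time, title, image, body in zip(all_pub_time, all_title, all_images, all_body):
        item = DemoItem()
        item['pub_time'] = pub_time
        item['images'] = [image]
        item['title'] = title
        item['body'] = body
        item['category1'] = 'event'
        item['category2'] = None
        item['abstract'] = body.split('\n')[0]
        yield item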
def parse_detail(self, response):
    item = DemoItem()
    soup = BeautifulSoup(response.text, 'html.parser')
    pub_tag = soup.select_one('abbr.published')
    temp_time = pub_tag.text if pub_tag else None
    item['pub_time'] = time_adjustment(temp_time)
    image_list = []
    for img in soup.select('.post-article img'):
        if img.get('src'):
            image_list.append(img.get('src'))
    item['images'] = image_list
    item['abstract'] = response.meta['abstract']
    body_tag = soup.find('div', class_="post-body entry-content")
    item['body'] = body_tag.text if body_tag else None
    label_tag = soup.find('div', class_="label-head Label")
    news_categories = label_tag.select('a') if label_tag else None
    item['category1'] = news_categories[0].text if news_categories else None
    item['category2'] = news_categories[1].text if news_categories and len(news_categories) >= 2 else None
    title_tag = soup.find('h1', class_="post-title entry-title")
    item['title'] = title_tag.text if title_tag else None
    yield item
def parse_news(self, response):
    item = DemoItem()
    soup = bs(response.text, "html.parser")
    item["category1"] = soup.select_one("#m-bread2 > a").text
    item["category2"] = None
    item["title"] = soup.select_one("#landing-headline > h1").text
    pub_time = soup.select("#m-pd2 > span")[-1].text
    item["pub_time"] = Util.format_time2(pub_time)
    captions = soup.find_all(class_="wp-caption aligncenter")
    item["images"] = [img.find("img").get("src") for img in captions] if captions else []
    content = soup.find(id="article-content")
    item["abstract"] = content.find("p").text.strip() if content.find("p") else None
    body = ''
    for p in content.find_all("p"):
        body += p.text.strip() + '\n'
    item["body"] = body
    yield item
def parse_detail(self, response):
    item = DemoItem()
    soup = BeautifulSoup(response.text, features='lxml')
    date_tag = soup.select_one('span.post_info_date')
    item['pub_time'] = time_adjustment(date_tag.text.strip() if date_tag else None)
    image_list = []
    img_box = soup.find('div', class_="post_img static")
    imgs = img_box.select('img') if img_box else None
    if imgs:
        for img in imgs:
            # Skip inline data-URI placeholders.
            if not re.findall(r'data:image/gif', img.get('src')):
                image_list.append(img.get('src'))
    item['images'] = image_list
    header = soup.find('div', class_="post_header single")
    all_p = header.select('p')
    if all_p:
        p_list = [paragraph.text for paragraph in all_p]
        item['abstract'] = p_list[0]
        item['body'] = '\n'.join(p_list)
    else:
        h1 = header.select_one('h1')
        item['abstract'] = h1.text if h1 else None
        h2s = header.select('h2')
        item['body'] = h2s[-1].text if h2s else None
    crumbs = soup.select_one('div.breadcrumb').select('a')
    item['category1'] = crumbs[-1].text if crumbs else None
    title_tag = soup.select_one('div.post_header_title h1')
    item['title'] = title_tag.text if title_tag else None
    yield item
def parse_details(self, response):
    item = DemoItem()
    soup = BeautifulSoup(response.text, 'lxml')
    item['category1'] = response.meta['category1']
    item['category2'] = response.meta['category2']
    title_tag = soup.find('h1', class_='entry-title')
    item['title'] = title_tag.text.strip() if title_tag else None
    item['body'] = ''  # don't forget to initialize
    entry = soup.find('div', class_='entry clearfix')
    body_list = entry.select('p') if entry else []
    for body in body_list:
        item['body'] += body.text.strip() + '\n'
    item['abstract'] = body_list[0].text.strip() if body_list else None
    item['images'] = []
    image_list = entry.select('p>img') if entry else []
    for image in image_list:
        item['images'].append(image.get('src'))
    pub_tag = soup.find('span', class_='updated')
    pub = pub_tag.text.strip() if pub_tag else None
    if pub:
        pub = Util.format_time2(pub)
    item['pub_time'] = pub
    yield item
def parse_news(self, response): item = DemoItem() soup = bs(response.text,"html.parser") category1 = soup.find("div","breadcrumbs header5").find("a").text.strip() if soup.find("div","breadcrumbs header5") else None item["category1"] = category1 category2 = soup.find("div","breadcrumbs header5").find_all("a")[-1].text.strip() if soup.find("div","breadcrumbs header5") else None if category2 == category1: category2 = None item["category2"] = category2 item["pub_time"] = response.meta["pub_time"] # title = soup.find("h1","gtm-articleContent title mb-2 header1").text.strip() if soup.find("h1","gtm-articleContent title mb-2 header1") else None # item["title"] = title item["title"] = response.meta["title"] images = [img.get("src") for img in soup.find("section","article-content data-artcl-cnt").find_all("img")] if soup.find("section","article-content data-artcl-cnt").find_all("img") else None item["images"] = images if soup.find("p", "blurb mb-2 header6"): abstract = soup.find("p","blurb mb-2 header6").text.strip() else: abstract = soup.find("section","article-content data-artcl-cnt").find("p").text.strip() if soup.find("section","article-content data-artcl-cnt") else None item["abstract"] = abstract body = '' if soup.find("section", "article-content data-artcl-cnt"): for p in soup.find("section", "article-content data-artcl-cnt").find_all("p"): body += p.text.strip()+'\n' else: body = None item["body"] = body self.logger.info(item) self.logger.info('\n')
def parse_top(self, response):
    rows = response.xpath(
        '//div[@id="react-app"]/div[1]/div/div/div[2]/div[1]/div/div/div[2]'
        '/div/div[2]/div[3]/div[1]/div[2]/article'
        '//div[@class="srp-list-item-description column"]/a')
    for row in rows:
        Model = row.xpath('.//span[@class="srp-list-item-basic-info-model"]/text()').extract_first()
        Price = row.xpath('.//div[@class="price-flag"]/span/text()').extract_first()
        Mileage = row.xpath('.//span[@class="srp-list-item-basic-info-mileage"]/text()').extract_first()
        featuresList = row.xpath('.//span[@class="srp-list-item-special-features-value"]/text()').extract()
        if len(featuresList) != 3:  # guard the three-way unpack below
            continue
        Body, Color, Engine = featuresList
        item = DemoItem()
        item['Model'] = Model
        item['Price'] = Price
        item['Mileage'] = Mileage
        item['Body'] = Body
        item['Color'] = Color
        item['Engine'] = Engine
        yield item
def parse3(self, response):
    html = BeautifulSoup(response.text, 'html.parser')
    item = DemoItem()
    url_parts = response.url.split('/')  # avoid shadowing the builtin `list`
    item['title'] = html.select('.news-title')[0].text
    item['category1'] = url_parts[3]
    if re.findall(r'\d+', url_parts[4]) == []:
        item['category2'] = url_parts[4]
    item['body'] = ''
    for i in html.select('.article-content > p'):
        item['body'] += i.text + '\n'
    if html.select('.article-content > p') != []:
        item['abstract'] = html.select('.article-content > p')[0].text
    dates = html.select('.timestamp-entry > .date-posted')
    if dates:
        self.logger.info(dates[0].text)
        item['pub_time'] = Util.format_time2(dates[0].text)
    else:
        item['pub_time'] = Util.format_time()
    if html.select('.article-content > .embed-wrap img') != []:
        item['images'] = [html.select('.article-content > .embed-wrap img')[0].attrs['src']]
    yield item
def parse_detail(self, response):
    item = DemoItem()
    html = BeautifulSoup(response.text, 'html.parser')
    item['category1'] = response.meta['category1']
    item['category2'] = response.meta['category2']
    if html.select_one("div.container h1") is not None:
        item['title'] = html.select_one("div.container h1").text
    item['body'] = ''
    if html.select("div.col-24 p"):
        bodies = html.select("div.col-24 p")
        b_list = [b.text for b in bodies]
        item['body'] = '\n'.join(b_list)
        item['abstract'] = bodies[0].text
    item['images'] = []
    if html.select("div.col-24 figure img"):
        images = html.select("div.col-24 figure img")
        for i in images:
            item['images'].append(i['src'])
    if html.select_one("p.byline span.date") is not None:
        ex = 'Published on (.*)'
        pub_time = html.select_one("p.byline span.date").text
        pub_time = re.findall(ex, pub_time)
        if pub_time:
            item['pub_time'] = Util.format_time2(pub_time[0])
        else:
            item['pub_time'] = Util.format_time()
    else:
        item['pub_time'] = Util.format_time()
    yield item
def parse_news(self, response): item = DemoItem() soup = bs(response.text, "html.parser") item["category1"] = response.meta["category1"] item["category2"] = response.meta["category2"] pub_time = soup.find( "span", "entry-meta-date updated").find("a").text.strip() if soup.find( "span", "entry-meta-date updated") else "0000-00-00 00:00:00" if pub_time: item["pub_time"] = Util.format_time2(pub_time) div = soup.find("div", class_="entry-content clearfix") images = [img.get("src") for img in div.find_all("img") ] if div.find_all("img") else None item["images"] = images title = soup.find("h1", class_="entry-title").text.strip() item["title"] = title abstract1 = [a.text.strip() for a in div.find_all("li") ] if div.find_all("li") else div.find("p").text.strip() abstract = '' for a in abstract1: abstract += a item["abstract"] = abstract body = [p.text.strip() for p in div.find_all("p")] if div.find_all("p") else None body = "\n".join(body) item["body"] = body yield item
def parse(self, response):
    for el in response.xpath('//ul/li'):
        i = DemoItem()
        i['title'] = el.xpath('a/text()').extract()
        i['link'] = el.xpath('a/@href').extract()
        i['desc'] = el.xpath('text()').extract()
        yield i
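# The same callback written with .get()/.getall(), the modern aliases for
# extract_first()/extract(); title and link then come back as single
# strings rather than one-element lists.
def parse_first(self, response):
    for el in response.xpath('//ul/li'):
        i = DemoItem()
        i['title'] = el.xpath('a/text()').get()
        i['link'] = el.xpath('a/@href').get()
        i['desc'] = el.xpath('text()').getall()
        yield i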
def parse_news(self, response): item = DemoItem() soup = bs(response.text, "html.parser") item["category1"] = response.meta["category1"] item["category2"] = response.meta["category2"] title = soup.find(class_="tdb-title-text").text.strip() item["title"] = title pub_time = soup.find( class_="entry-date updated td-module-date").text.strip() item["pub_time"] = Util.format_time2(pub_time) images = [ soup.find("div", "tdb-block-inner td-fix-index").find("img").get("src") ] if soup.find("div", "tdb-block-inner td-fix-index") else None if soup.find_all("div", "wp-block-image"): for img in soup.find_all("div", "wp-block-image"): images.append(img.find("img").get("src")) item["images"] = images abstract = soup.select_one( "div.wpb_wrapper > div > div > p").text.strip() if soup.select_one( "div.wpb_wrapper > div > div > p") else None item["abstract"] = abstract body = soup.find( class_="tdb-caption-text").text.strip() + '\n' if soup.find( class_="tdb-caption-text") else '' for p in soup.select("div.wpb_wrapper > div > div > p"): body += (p.text.strip() + '\n') item["body"] = body self.logger.info(item) self.logger.info('\n') yield item
def parse_2(self, response, **kwargs):
    item = DemoItem()
    new_soup = BeautifulSoup(response.text, 'lxml')
    item['title'] = new_soup.find('div', class_='topHeading', id='12').h1.string
    # Article body
    bodys = new_soup.select('article .articleBody > p')
    all_body = ''
    for body_1 in bodys:
        all_body += body_1.text
    item['body'] = all_body
    item['pub_time'] = time_font(new_soup.select('.articleHd .dateInfo .fl')[0].text).strip("\t")
    # Image URLs: collect each matched image's data-src.
    image_list = []
    for image in new_soup.find_all('img', id='jagran_image_id'):
        image_list.append(image.get('data-src'))
    item['images'] = image_list
    item['category1'] = new_soup.select('aside.breadcrum li.first > a > span')[0].string
    item['category2'] = new_soup.select('aside.breadcrum li:nth-of-type(3) > a > span')[0].text
    item['abstract'] = new_soup.select('aside.breadcrum li:nth-of-type(4) > span')[0].text
    yield item
def parse_detail(self, response):
    item = DemoItem()
    soup = BeautifulSoup(response.text, features='lxml')
    date_tag = soup.select_one('div.date_and_author_container span')
    if date_tag:
        temp_time = date_tag.text.split(" ")[1]
    else:
        temp_time = soup.select_one('td.miscinfo').text.split(" ")[1]
    item['pub_time'] = time_adjustment(temp_time)
    item['images'] = [img.get('src') for img in soup.select('div[align="center"] img')]
    # Prefer the newscontent paragraphs; fall back to justified divs.
    all_p = soup.select('div.newscontent p') or soup.select('div[align="justify"]')
    p_list = [paragraph.text for paragraph in all_p]
    item['abstract'] = p_list[0] if p_list else None
    item['body'] = '\n'.join(p_list)
    item['category1'] = response.meta['category']
    heading = soup.select_one('div.heading_container')
    item['title'] = heading.text if heading else None
    yield item
def parse_detail(self, response):
    item = DemoItem()
    soup = BeautifulSoup(response.text, features="lxml")
    detail = soup.select_one(".detailBody")
    item['title'] = detail.find("div", class_="LeftCont content").find("h1").text.strip() if detail else None
    images = []
    img_box = soup.select_one(".ArticleBodyCont .articleImg")
    for img in (img_box.find_all("img") if img_box else []):
        images.append(img.get("data-src"))
    item['images'] = images
    left = soup.find("div", class_="LeftCont content")
    pub_time = left.find("ul", class_="AuthorInfo").find_all("li")[-1].text.strip() if left else None
    item['pub_time'] = tv9hindi_time_switch2(pub_time)
    item['abstract'] = left.find_all("p")[1].text.strip() if left else None
    body_cont = soup.find("div", class_="ArticleBodyCont")
    body_content = body_cont.find_all("p") if body_cont else []
    body = ""
    mx = '<p><span style="color: #0000ff;">'  # filter out styled promo paragraphs
    for p in body_content:
        if re.match(mx, str(p)) is None:
            body += p.text.strip() + "\n"
    crumb = soup.find("div", class_="breadcrum")
    category = crumb.select_one("#breadcrumbs").find_all("a")[-2:] if crumb else None
    item['category1'] = category[0].text.strip() if category else None
    item['category2'] = category[1].text.strip() if category and len(category) > 1 else None
    item['body'] = body
    item['request_url'] = response.request.url
    item['response_url'] = response.url
    item['website_id'] = self.website_id
    item['language_id'] = self.language_id
    item['cole_time'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(time.time())))
    yield item
def parse(self, response):
    html = BeautifulSoup(response.text, 'html.parser')
    if response.url == 'https://www.sunstar.com.ph/Philippines':
        for i in html.select('.tablecenter > a')[0:8]:
            yield Request(i.attrs['href'])
    elif re.findall(r'https://www.sunstar.com.ph/article/\d+/\S+?/\S+?/\S+?', response.url) != []:
        item = DemoItem()
        url_parts = response.url.split('/')  # avoid shadowing the builtin `list`
        item['title'] = html.select('.titleArticle > h1')[0].text
        item['category1'] = url_parts[5]
        if re.findall(r'\d+', url_parts[6]) == []:
            item['category2'] = url_parts[6]
        item['body'] = html.select('.col-sm-11 p')[0].text
        item['abstract'] = html.select('.col-sm-11 p')[0].text
        item['pub_time'] = Util.format_time2(html.select('.articleDate')[0].text)
        if html.select('.imgArticle > img') != []:
            item['images'] = [html.select('.imgArticle > img')[0].attrs['src']]
        yield item
    else:
        for i in html.select('.sectionTopWidget > div > div .ratio'):
            yield Request(i.attrs['href'])
        for i in html.select('.moreSectionWidget > div > div a[class="title-C20 title blu-hover"]'):
            yield Request(i.attrs['href'])
def parse_detail(self, response):
    item = DemoItem()
    soup = BeautifulSoup(response.text, features='lxml')
    date_tag = soup.find('span', class_="thetime date updated")
    temp_time = date_tag.text if date_tag else None
    adjusted_time = time_adjustment(temp_time)
    if self.time is None or Util.format_time3(adjusted_time) >= int(self.time):
        content = soup.select_one('div.thecontent')
        item['body'] = content.text.replace('\nAdvertisements\n', '') if content else None
        item['pub_time'] = adjusted_time
        first_p = soup.select_one('div.thecontent p')
        item['abstract'] = first_p.text if first_p else None
        cat_tag = soup.select_one('span.thecategory')
        item['category1'] = cat_tag.text if cat_tag else None
        title_tag = soup.select_one('header h1')
        item['title'] = title_tag.text if title_tag else None
        yield item
    else:
        self.logger.info("Time cutoff reached")
def parse_2(self, response, **kwargs):
    page_soup = BeautifulSoup(response.text, 'lxml')
    category1 = page_soup.select('div.main-title-outer.pull-left div.main-title')[0].text.strip()
    item = DemoItem()
    item['category1'] = category1
    item['category2'] = category1
    for i in page_soup.select('div.col-md-4.col-sm-8.col-xs-16 div.topic.nt_topic a'):
        yield Request(i.attrs['href'], callback=self.parse_3, meta={'item': item})
    next_link = page_soup.select('div.ntdv_pagination li')[-1].find('a')
    if next_link and next_link.attrs.get('href'):
        next_page = response.url + next_link.attrs['href']
        # Peek at the last article on the next page (a blocking requests
        # call) to decide whether the time cutoff has been reached.
        last_news_url = BeautifulSoup(requests.get(next_page).text, 'lxml').select(
            'div.col-md-4.col-sm-8.col-xs-16 div.topic.nt_topic a')[-1].attrs['href']
        last_time = time_font(
            BeautifulSoup(requests.get(last_news_url).text, 'lxml').select('div.time')[0].text)
        if self.time is None or Util.format_time3(last_time) >= int(self.time):
            # Follow the next page.
            yield Request(next_page, callback=self.parse_2)
        else:
            self.logger.info('Time cutoff reached')
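# A non-blocking sketch of the same cutoff peek: rather than calling
# requests.get() inside the callback (which stalls Scrapy's event loop),
# chain two Requests through the scheduler. Both callback names here are
# hypothetical additions.
def peek_next_page(self, response):
    last_news_url = BeautifulSoup(response.text, 'lxml').select(
        'div.col-md-4.col-sm-8.col-xs-16 div.topic.nt_topic a')[-1].attrs['href']
    yield Request(last_news_url, callback=self.check_cutoff,
                  meta={'next_page': response.url})

def check_cutoff(self, response):
    last_time = time_font(BeautifulSoup(response.text, 'lxml').select('div.time')[0].text)
    if self.time is None or Util.format_time3(last_time) >= int(self.time):
        yield Request(response.meta['next_page'], callback=self.parse_2)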
def parse_item(self, response):
    soup = BeautifulSoup(response.text, 'html.parser')
    item = DemoItem()
    category = soup.select('div.breadcrumbs > a')
    if len(category) == 1:
        item['category1'] = category[0].text
        item['category2'] = None
    else:
        item['category1'] = category[0].text
        item['category2'] = category[1].text
    item['title'] = soup.select('div.breadcrumbs > span')[-1].text
    # Split the published date string into day/month/year/time parts.
    ttt = soup.select('dd.published')[0].text.split(',')[1].split(' ')[1:]
    datetime = ttt[2] + '-' + str(Util.month[ttt[1]]) + '-' + ttt[0] + ' ' + ttt[-1][:5] + ':00'
    item['pub_time'] = datetime
    item['images'] = None
    item['abstract'] = soup.select('div.item-page > p')[0].text
    ss = ''
    for i in soup.select('div.item-page > p'):
        ss += i.text + '\n'  # '\n', not r'\n' (literal backslash-n)
    item['body'] = ss
    yield item
def parse2(self, response):
    item = DemoItem()
    html = BeautifulSoup(response.text, 'html.parser')
    url_parts = response.url.split('/')  # avoid shadowing the builtin `list`
    item['title'] = html.select('.title')[0].text
    item['category1'] = url_parts[3]
    if re.findall(r'\d+', url_parts[4]) == []:
        item['category2'] = url_parts[4]
    item['body'] = ''
    flag = False
    # NOTE: this content-body id looks article-specific and may need a
    # more general selector.
    for i in html.select('#content-body-244757-498257 > p'):
        item['body'] += i.text + '\n'
        if i.text != '' and flag == False:
            flag = True
            item['abstract'] = i.text  # first non-empty paragraph
    if html.select('.dateLine > p') != []:
        item['pub_time'] = Util.format_time2(html.select('.dateLine > p')[0].text)
    elif html.select('.dateString') != []:
        item['pub_time'] = Util.format_time2(html.select('.dateString')[0].text)
    if html.select('.margin-bottom-15 img') != []:
        item['images'] = ['https://www.cnnphilippines.com' + html.select('.margin-bottom-15 img')[0].attrs['src']]
    yield item