def run(self):
    try:
        url = 'https://www.countryliving.com/food-drinks/'
        print('[AnyWebsiteScraper] :: fetching data from url:', url)
        r = requests.get(url, headers=get_request_headers())
        if not r.status_code == 200:
            print('[AnyWebsiteScraper] :: Failed to get '
                  'content of url: %s' % url)
            return
        html_doc = r.content
        soup = BeautifulSoup(html_doc, 'html.parser')
        div11 = soup.find('div', class_='feed feed-grid')
        for div33 in div11.find_all('div', class_='simple-item'):
            self.scrap_result_row(div33)
        sleep_scrapper('AnyWebsiteScraper')

        # Infinite scrolling: the site loads further items through this AJAX
        # endpoint, paged via the trailing `page=` parameter.
        base_url = ('https://www.countryliving.com/ajax/infiniteload/'
                    '?id=34aae02d-c035-47e5-95c5-b87ba30c1dd8'
                    '&class=CoreModels%5Csections%5CSectionModel'
                    '&viewset=section&cachebuster=&page=')
        for i in range(2, 100):
            url = base_url + str(i)
            print('[AnyWebsiteScraper] :: fetching data from url:', url)
            r = requests.get(url, headers=get_request_headers())
            if not r.status_code == 200:
                print('[AnyWebsiteScraper] :: Failed to get '
                      'content of url: %s' % url)
                return
            html_doc = r.content
            soup = BeautifulSoup(html_doc, 'html.parser')
            div11 = soup.find('div', class_='feed feed-grid')
            for div33 in div11.find_all('div', class_='simple-item'):
                self.scrap_result_row(div33)
            sleep_scrapper('AnyWebsiteScraper')
    except Exception as exp:
        print('[AnyWebsiteScraper] :: run() :: Got exception: %s' % exp)
        print(traceback.format_exc())
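# Every scraper in this section relies on two shared helpers whose bodies are
# not shown here: get_request_headers() and sleep_scrapper(). A minimal sketch,
# assuming the first only needs to supply browser-like headers and the second
# merely throttles between requests; both bodies are assumptions, not the
# original implementations.
import random
import time


def get_request_headers():
    """Return browser-like headers so requests are less likely to be rejected."""
    return {
        'User-Agent': ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                       '(KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'),
        'Accept-Language': 'en-US,en;q=0.9',
    }


def sleep_scrapper(scraper_name):
    """Pause for a random short interval between page fetches."""
    delay = random.randint(5, 15)
    print('[%s] :: sleeping for %d seconds' % (scraper_name, delay))
    time.sleep(delay)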
def run(self):
    try:
        url = ('https://api.missingkids.org/missingkids'
               '/servlet/PubCaseSearchServlet?'
               'act=usMapSearch&missState=%s&searchLang=en_US&casedata='
               'latest' % self.state)
        print('[MissingKidsScraper] :: fetching data from url: %s' % url)
        r = requests.get(url, headers=get_request_headers())
        if not r.status_code == 200:
            print('[MissingKidsScraper] :: failed to '
                  'get content of url: %s' % url)
            return
        html_doc = r.content
        soup = BeautifulSoup(html_doc, 'html.parser')
        for td in soup.find_all('td', width="40%"):
            self.scrap_result_row(td)
        # sleep_scrapper('MissingKidsScraper')
    except Exception as exp:
        print('[MissingKidsScraper] :: run() :: Got exception: %s' % exp)
        print(traceback.format_exc())
def run(self):
    try:
        url = ('https://www.bedbathandbeyond.com/store/category'
               '/%s/%s/%s/%s/'
               % (self.product_category, self.product_subcategory,
                  self.product_title, self.product_code))
        print('[BedBathAndBeyondScraper] :: fetching data from url:', url)
        r = requests.get(url, headers=get_request_headers())
        if not r.status_code == 200:
            print('[BedBathAndBeyondScraper] :: Failed to get '
                  'content of url: %s' % url)
            return
        html_doc = r.content
        soup = BeautifulSoup(html_doc, 'html.parser')
        for div in soup.find_all('div', class_='productContent ec_listing'):
            self.scrap_result_row(div)
        sleep_scrapper('BedBathAndBeyondScraper')
    except Exception as exp:
        print('[BedBathAndBeyondScraper] :: run() :: Got exception: %s' % exp)
        print(traceback.format_exc())
def run(self):
    try:
        base_url = ('https://www.flipkart.com/search?as=off&as-show='
                    'on&otracker=start&page=')
        suffix = '&q=%s&viewType=list' % self.product
        for i in range(1, 100):
            url = base_url + str(i) + suffix
            print('[FlipkartScraper] :: fetching data from url:', url)
            r = requests.get(url, headers=get_request_headers())
            if not r.status_code == 200:
                print('[FlipkartScraper] :: Failed to get the content '
                      'of url: %s' % url)
                return
            html_doc = r.content
            soup = BeautifulSoup(html_doc, 'html.parser')
            for div in soup.find_all('div', class_='_1-2Iqu row'):
                self.scrap_result_row(div)
            sleep_scrapper('FlipkartScraper')
    except Exception as exp:
        print('[FlipkartScraper] :: run() :: Got exception: %s' % exp)
        print(traceback.format_exc())
def run(self):
    base_url = ('https://www.homedepot.com/b/'
                '%s/N-5yc1vZbm79?Nao=' % self.product)
    suffix = '&Ns=None'
    # Home Depot paginates 12 products per page via the Nao offset.
    for j in range(0, 1000, 12):
        url = ''
        try:
            url = base_url + str(j) + suffix
            print('[HomeDepot] :: fetching data from url:', url)
            r = requests.get(url, headers=get_request_headers())
            if not r.status_code == 200:
                print('[HomeDepot] :: Failed to get '
                      'content of url: %s' % url)
                return
            html_doc = r.content
            soup = BeautifulSoup(html_doc, 'html.parser')
            for div in soup.find_all('div', class_='pod-inner'):
                self.scrap_result_row(div)
            sleep_scrapper('HomeDepot')
        except Exception as exp:
            print('[HomeDepot] :: run() :: Got exception: '
                  '%s while fetching data from url: %s' % (exp, url))
def run(self):
    base_url = ('https://www.indeed.co.in/jobs?q='
                '%s&l=%s&start=' % (self.post, self.location))
    # Indeed paginates 10 results per page via the start offset.
    for j in range(0, 1000, 10):
        url = ''
        try:
            url = base_url + str(j)
            print('[IndeedScraper] :: fetching data from url:', url)
            r = requests.get(url, headers=get_request_headers())
            if not r.status_code == 200:
                print('[IndeedScraper] :: Failed to '
                      'get content of url: %s' % url)
                return
            html_doc = r.content
            soup = BeautifulSoup(html_doc, 'html.parser')
            for div in soup.find_all('div'):
                # skip divs without classes (note: dict.has_key() is
                # Python 2 only; the `in` operator works on both)
                if 'class' not in div.attrs:
                    continue
                cls = div.attrs['class']
                if 'row' in cls and 'result' in cls:
                    self.scrap_result_row(div)
            sleep_scrapper('IndeedScraper')
        except Exception as exp:
            print('[IndeedScraper] :: run() :: Got exception: '
                  '%s while fetching data from url: %s' % (exp, url))
def run(self):
    url = ''
    try:
        base_url = ('https://www.overstock.com/Home-Garden/%s/%s/'
                    % (self.product_category, self.product_code))
        suffix = 'subcat.html?page='
        for j in range(1, 100):
            url = base_url + suffix + str(j)
            print('[OverStockScraper] :: fetching data from url:', url)
            r = requests.get(url, headers=get_request_headers())
            if not r.status_code == 200:
                print('[OverStockScraper] :: Failed to '
                      'get content of url: %s' % url)
                return
            html_doc = r.content
            soup = BeautifulSoup(html_doc, 'html.parser')
            for div in soup.find_all('div', class_='product-tile'):
                self.scrap_result_row(div)
            sleep_scrapper('OverStockScraper')
    except Exception as exp:
        print('[OverStockScraper] :: run() :: Got exception: '
              '%s while fetching data from url: %s' % (exp, url))
        print(traceback.format_exc())
def run(self): base_url = "https://www.yelp.com/search?find_desc=" \ "Dry+Cleaners&find_loc=New+York%2C+NY&start=" for j in range(1, 1000, 10): try: url = base_url + str(j) print '[YelpScraper] :: fetching data from url: ', url r = requests.get(url, headers=get_request_headers()) if not r.status_code == 200: print "Failed to get content of url: %s" % url return html_doc = r.content soup = BeautifulSoup(html_doc, 'html.parser') li_class = "regular-search-result" # parsing html content to fet information about dry cleaners for li in soup.find_all('li', class_=li_class): self.scrap_row_yelp(li) # break # just use it for testing only sleep_scrapper('YelpScraper') except Exception as exp: print '[YelpScraper] :: run() :: Got exceptiion : %s ' % exp print(traceback.format_exc())
def run(self):
    try:
        base_url = 'https://inc42.com/buzz/'
        r = requests.get(base_url, headers=get_request_headers())
        print('[Inc42Scraper] :: fetching data from url:', base_url)
        if not r.status_code == 200:
            print('[Inc42Scraper] :: Failed to get '
                  'content of url: %s' % base_url)
            return
        html_doc = r.content
        soup = BeautifulSoup(html_doc, 'html.parser')
        div1 = soup.find('div', class_='site-content')
        inc_news = div1.find_all('div', {'class': 'card-wrapper horizontal-card'})
        for news in inc_news:
            self.scrape_home(news)
        sleep_scrapper('Inc42Scraper')

        # Next pages: inc42 uses WordPress-style /buzz/page/<n>/ pagination.
        # (The original concatenation produced .../buzzpage2, which is not a
        # valid page url.)
        for i in range(2, 100):
            url = base_url + 'page/' + str(i) + '/'
            r = requests.get(url, headers=get_request_headers())
            print('[Inc42Scraper] :: fetching data from url:', url)
            if not r.status_code == 200:
                print('[Inc42Scraper] :: Failed to get '
                      'content of url: %s' % url)
                return
            html_doc = r.content
            soup = BeautifulSoup(html_doc, 'html.parser')
            div1 = soup.find('div', class_='site-content')
            inc_news = div1.find_all(
                'div', {'class': 'card-wrapper horizontal-card'})
            for news in inc_news:
                self.scrape_home(news)
            sleep_scrapper('Inc42Scraper')
    except Exception as exp:
        print('[Inc42Scraper] :: run() :: Got exception while fetching data '
              'from the Inc42 homepage: %s' % exp)
        print(traceback.format_exc())
def scrap_yellowpages(self, url, scrap_type):
    logging.info("\nScraping YellowPages url: %s\n" % url)
    r = requests.get(url, headers=get_request_headers())
    if not r.status_code == 200:
        logging.error("Failed to get content of url: %s" % url)
        return
    html_doc = r.content
    soup = BeautifulSoup(html_doc, 'html.parser')
    for div in soup.find_all('div', class_='info'):
        try:
            self.scrap_row_yellowpages(div, scrap_type)
        except Exception as exp:
            logging.error("scrap_yellowpages() :: Got exception: %s" % exp)
            logging.error(traceback.format_exc())
def scrap_groupon(self, url, scrap_type):
    logging.info("\n=======> Scraping Groupon url: %s\n" % url)
    r = requests.get(url, headers=get_request_headers())
    if not r.status_code == 200:
        logging.error(
            "scrap_groupon() :: Failed to get content of url: %s" % url)
        return
    html_doc = r.content
    soup = BeautifulSoup(html_doc, 'html.parser')
    div_class = "cui-content c-bdr-gray-clr ch-bdr-gray-md"
    # parse the html content to get information about the listed deals
    for div in soup.find_all('div', class_=div_class):
        try:
            self.scrap_row_groupon(div, scrap_type)
        except Exception as exp:
            logging.error("scrap_groupon() :: Got exception: %s" % exp)
            logging.error(traceback.format_exc())
def scrap_yelp(self, url, scrap_type):
    logging.info("\nScraping Yelp url: %s\n" % url)
    r = requests.get(url, headers=get_request_headers())
    if not r.status_code == 200:
        logging.error("Failed to get content of url: %s" % url)
        return
    html_doc = r.content
    soup = BeautifulSoup(html_doc, 'html.parser')
    # parse the html content to get information about dry cleaners
    for li in soup.find_all('li', class_='regular-search-result'):
        try:
            self.scrap_row_yelp(li, scrap_type)
        except Exception as exp:
            logging.error("scrap_yelp() :: Got exception: %s" % exp)
            logging.error(traceback.format_exc())
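# scrap_yellowpages(), scrap_groupon() and scrap_yelp() above repeat the same
# fetch-check-parse boilerplate. A minimal refactoring sketch; fetch_soup() is
# a hypothetical helper, not part of the original code.
import logging
import requests
from bs4 import BeautifulSoup


def fetch_soup(url):
    """Fetch a url and return parsed HTML, or None on a non-200 response."""
    r = requests.get(url, headers=get_request_headers())
    if r.status_code != 200:
        logging.error("Failed to get content of url: %s" % url)
        return None
    return BeautifulSoup(r.content, 'html.parser')

# Usage sketch inside any of the scrap_* methods:
#     soup = fetch_soup(url)
#     if soup is None:
#         return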
def scrap_python_developer(self, url):
    print("\nScraping python developer jobs: %s\n" % url)
    r = requests.get(url, headers=get_request_headers())
    if not r.status_code == 200:
        print("Failed to get content of url: %s" % url)
        return
    html_doc = r.content
    soup = BeautifulSoup(html_doc, 'html.parser')
    # parse the html content to get information about python developer jobs
    for div in soup.find_all('div'):
        # skip divs without classes (dict.has_key() is Python 2 only)
        if 'class' not in div.attrs:
            continue
        cls = div.attrs['class']
        if 'row' in cls and 'result' in cls:
            self.scrap_result_row(div)
def run(self):
    base_url = ('https://www.yellowpages.com/search?search_terms=software+company'
                '&geo_location_terms=New+York%2C+NY&page=')
    for j in range(27, 100):
        try:
            url = base_url + str(j)
            print('[YellowPagesScraper] :: fetching data from url:', url)
            r = requests.get(url, headers=get_request_headers())
            if not r.status_code == 200:
                print('[YellowPagesScraper] :: Failed to get the content '
                      'of url: %s' % url)
                return
            html_doc = r.content
            soup = BeautifulSoup(html_doc, 'html.parser')
            for div in soup.find_all('div', class_='info'):
                self.scrap_result_row(div)
            sleep_scrapper('YellowPagesScraper')
        except Exception as exp:
            print('[YellowPagesScraper] :: run() :: Got exception: %s' % exp)
            print(traceback.format_exc())
def run(self): base_url = "https://www.yelp.com/search?find_desc=%s&find_loc=%s,+NY&start=" % ( self.product, self.location) for j in range(1, 1000, 10): try: url = base_url + str(j) print '[YelpScraper] :: fetching data from url: ', url r = requests.get(url, headers=get_request_headers()) if not r.status_code == 200: print '[YelpScraper] :: Failed to get content of url: %s' % url return html_doc = r.content soup = BeautifulSoup(html_doc, 'html.parser') for li in soup.find_all('li', class_='regular-search-result'): self.scrap_row_yelp(li) sleep_scrapper('YelpScraper') except Exception as exp: print '[YelpScraper] :: run() :: Got exceptiion : %s ' % exp print(traceback.format_exc())
def run(self):
    for j in range(0, 2):
        try:
            url = ('https://www.samsclub.com/sams/coffee-tea-cocoa/1493.cp'
                   '?xid=cat_sub&navAction=jump')
            print('[Samsclub] :: fetching data from url:', url)
            r = requests.get(url, headers=get_request_headers())
            if not r.status_code == 200:
                print('[Samsclub] :: Failed to get content of url: %s' % url)
                return
            html_doc = r.content
            soup = BeautifulSoup(html_doc, 'html.parser')
            # parse the html content to get information about the products
            for div in soup.find_all('div', class_='products-card'):
                self.scrap_result_row(div)
            sleep_scrapper('Samsclub')
        except Exception as exp:
            print('[Samsclub] :: run() :: Got exception: %s' % exp)
def run(self):
    try:
        url = ('https://news.google.com/news/headlines/section/topic'
               '/NATION.en_in/India?ned=in&hl=en-IN&gl=IN')
        print('[GoogleNewsScraper] :: fetching data from url:', url)
        r = requests.get(url, headers=get_request_headers())
        if not r.status_code == 200:
            print('[GoogleNewsScraper] :: Failed to get '
                  'content of url: %s' % url)
            return
        html_doc = r.content
        soup = BeautifulSoup(html_doc, 'html.parser')
        for div in soup.find_all('div', class_='v4IxVd'):
            self.scrap_result_row(div)
        sleep_scrapper('GoogleNewsScraper')
    except Exception as exp:
        print('[GoogleNewsScraper] :: run() :: Got exception: %s' % exp)
        print(traceback.format_exc())
def scrap_result_row(self, div33):
    try:
        # blog link
        div44 = div33.find('a', {'class': 'simple-item-image item-image'})
        link = div44['href']
        prefix = 'https://www.countryliving.com'
        blog_link = prefix + link

        # create a folder for this blog post
        directory = link.split('/')[3]
        parent_dir = '/home/soumya/Documents/scrappeddata/Food-Drink'
        blog_path = os.path.join(parent_dir, directory)
        os.mkdir(blog_path)
        print('[AnyWebsiteScraper] :: blog folder has been created:', blog_path)
        os.chdir(blog_path)

        # blog posting date, converted to a UTC timestamp string
        div66 = div33.find('div', class_='simple-item-metadata')
        blog_date = div66.find(
            'div', class_='publish-date simple-item-publish-date js-date').text.strip()
        date_object = datetime.datetime.strptime(blog_date, "%b %d, %Y")
        datetime_obj_utc = date_object.replace(tzinfo=timezone('UTC'))
        UTC_blog_date = datetime_obj_utc.strftime("%Y-%m-%d %H:%M:%S %Z%z")
        print('[AnyWebsiteScraper] :: blog_date:', UTC_blog_date)

        # blog picture
        picture = div44.find('span')['data-lqip']
        blog_picture = picture.split('?')[0]
        blog_image_name = blog_picture.split('/').pop()
        raw1_media = requests.get(blog_picture, stream=True)
        with open(blog_image_name, "wb") as f:
            f.write(raw1_media.content)
        print('IMAGE DOWNLOADED:', blog_image_name)
        image_path = os.path.join(os.getcwd(), blog_image_name)
        image_list = [image_path]
        video_list = []
        pdf_list = []
        print('[AnyWebsiteScraper] :: blog_media:', image_path)

        # blog title and short description
        blog_title = div33.find('a', class_='simple-item-title item-title').text.strip()
        print('[AnyWebsiteScraper] :: blog_title:', blog_title)
        blog_subtitle = div33.find('div', class_='simple-item-dek item-dek').text.strip()
        print('[AnyWebsiteScraper] :: blog_short_desc:', blog_subtitle)

        # fetch the full blog post
        print('[AnyWebsiteScraper] :: fetching data from blog_link:', blog_link)
        r = requests.get(blog_link, headers=get_request_headers())
        if not r.status_code == 200:
            print('[AnyWebsiteScraper] :: Failed to get '
                  'content of url: %s' % blog_link)
            return
        soup = BeautifulSoup(r.content, 'html.parser')
        div111 = soup.find('div', class_='site-content')

        # blog author name (some posts carry no byline, hence the fallback;
        # the original re-fetched the whole page in an AttributeError handler,
        # duplicated verbatim, which this consolidates)
        try:
            author = div111.find('div', class_='content-info-metadata')
            auth1 = author.find('div', class_='byline-with-image')
            blog_author = auth1.find('div', class_='byline').text.strip()
        except AttributeError:
            blog_author = ''
        print('[AnyWebsiteScraper] :: blog_author:', blog_author)

        # extraction rule for normal blogs
        blog_description = ''
        div2222 = div111.find('div', class_='content-container standard-container')
        if div2222:
            div333 = div2222.find('div', class_='standard-body')
            div8889 = div333.find(
                'div', class_='article-body-content standard-body-content')
            blog_description = div8889.find('p', class_='body-text').text.strip()
            print('[AnyWebsiteScraper] :: blog_description:',
                  blog_description.encode('utf-8'))

        # extraction rule for slideshow blogs
        try:
            div777 = div111.find('div', class_='slideshow-outer')
            div456 = div777.find('div', class_='slideshow-lede active')
            blog_description = div456.find(
                'div', class_='slideshow-desktop-dek').text.strip()
            print('[AnyWebsiteScraper] :: slideshow blog_description:',
                  blog_description.encode('utf-8'))
        except AttributeError:
            pass

        # dump everything collected about this post to json
        data = {
            "blog_title": blog_title,
            "blog_sub_title": blog_subtitle,
            "blog_author": blog_author,
            "blog_date": UTC_blog_date,
            "blog_url": blog_link,
            "blog_text": blog_description,
            "blog_storage_path": blog_path,
            "blog_image_name": blog_image_name,
            "blog_image_list": image_list,
            "blog_video_list": video_list,
            "blog_pdf_list": pdf_list,
        }
        with open('personal.json', 'w') as json_file:
            json.dump(data, json_file)
            print('json file has been created')
    except Exception as exp:
        print('[AnyWebsiteScraper] :: scrap_result_row() :: '
              'Got exception: %s' % exp)
        print(traceback.format_exc())
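# Both scrap_result_row() above and scrape_home() further below convert a
# display date such as "Jan 24, 2020" into a UTC timestamp string. A minimal
# sketch of that conversion as a standalone helper; to_utc_string() is a
# hypothetical name, and it assumes `timezone` is pytz's timezone(), as the
# surrounding code does.
import datetime
from pytz import timezone


def to_utc_string(date_string, fmt="%b %d, %Y"):
    """Parse a date like 'Jan 24, 2020' and render it as a UTC timestamp."""
    naive = datetime.datetime.strptime(date_string, fmt)
    aware = naive.replace(tzinfo=timezone('UTC'))  # midnight, labelled UTC
    return aware.strftime("%Y-%m-%d %H:%M:%S %Z%z")

# e.g. to_utc_string("Jan 24, 2020") -> "2020-01-24 00:00:00 UTC+0000"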
def scrape_home_sections(self, section_heading_url):
    try:
        print('[ESPNScraper] :: fetching data from section url:', section_heading_url)
        r = requests.get(section_heading_url, headers=get_request_headers())
        if not r.status_code == 200:
            print('[ESPNScraper] :: Failed to get '
                  'content of url: %s' % section_heading_url)
            return
        html_doc = r.content
        soup = BeautifulSoup(html_doc, 'html.parser')
        col2_feed = soup.find('section', class_='col-two contentFeed')
        # Story links appear under both <section> and <article> content items;
        # the original handled the two cases with duplicated blocks.
        for tag in ('section', 'article'):
            for contentfeed in col2_feed.find_all(tag, {"class": "contentItem"}):
                div_content = contentfeed.find_all(
                    'section',
                    class_='contentItem__content contentItem__content--story '
                           'has-image has-video contentItem__content--collection')
                if div_content:
                    for post in div_content:
                        news_1 = post.find(
                            "a",
                            {"class": "contentItem__padding "
                                      "contentItem__padding--border"})
                        news_url = news_1['href']
                        print(news_url)
                        self.scrape_section_posts(news_url)
                else:
                    print('section has video news only')
    except AttributeError as e:
        print('This section was scraped already; cannot get data.')
        print('[ESPNScraper] :: scrape_home_sections() :: Got exception '
              'while fetching data from section url: %s' % e)
        print(traceback.format_exc())
    except TypeError as er:
        print('This section only shows ads.')
        print('[ESPNScraper] :: scrape_home_sections() :: Got exception '
              'while fetching data from section url: %s' % er)
        print(traceback.format_exc())
    except Exception as exp:
        print('[ESPNScraper] :: scrape_home_sections() :: Got exception '
              'while fetching data from section url: %s' % exp)
        print(traceback.format_exc())
def scrape_section_posts(self, news_url):
    try:
        # Relative links need the site origin prepended.
        if not news_url.startswith('http'):
            news_url = 'https://www.espn.in' + news_url
        print('[ESPNScraper] :: fetching data from home post url:', news_url)
        r = requests.get(news_url, headers=get_request_headers())
        if not r.status_code == 200:
            print('[ESPNScraper] :: Failed to get '
                  'content of post url: %s' % news_url)
            return
        html_doc = r.content
        soup = BeautifulSoup(html_doc, 'html.parser')
        sect1 = soup.find('section', id='pane-main')
        sect11 = sect1.find('section', id='main-container')
        sect111 = sect11.find('div', {'class': 'main-content'})
        sect2 = sect111.find('section', id='article-feed')
        div555 = sect2.find('div', class_='container')
        article_header = div555.find('header', class_='article-header').text.strip()
        art__ = div555.find('div', class_='article-body')
        art_ = art__.find('div', class_='article-meta')
        POST_DATE = art_.find('span', class_='timestamp')
        POST_UTC_DATE = POST_DATE['data-date']
        print('[ESPNScraper] :: post UTC timestamp:', POST_UTC_DATE)

        # build a folder name from the timestamp, e.g. "2020-01-24 10 30 00Z"
        utc_date = POST_UTC_DATE.split('T')
        utc_time = utc_date[1].split(':')
        directory = (utc_date[0] + ' ' + utc_time[0] + ' '
                     + utc_time[1] + ' ' + utc_time[2])
        parent_dir = 'C:/Users/lenovo/Desktop/scholarsbook_scrappers_data/ESPN.in/cricket/'
        blog_path = os.path.join(parent_dir, directory)
        os.mkdir(blog_path)
        print('[ESPNScraper] :: post folder has been created:', blog_path)
        os.chdir(blog_path)
        print('[ESPNScraper] :: article header:', article_header)

        # article image (regular posts)
        img1 = div555.find('figure', class_='article-figure dim16x9')
        pic__ = ''
        if img1:
            img2 = img1.find('div', class_='img-wrap')
            pic__ = img2.find('source')['srcset']
            image_name = pic__.split('?').pop().split('2F').pop().split('&')[0]
            print('[ESPNScraper] :: image url:', pic__)
            raw1_media = requests.get(pic__, stream=True)
            with open(image_name, "wb") as f:
                f.write(raw1_media.content)
            print('IMAGE DOWNLOADED:', image_name)
        else:
            # video posts keep their poster image in an inline <aside>
            aside = art__.find('aside', class_='inline inline-photo full')
            if aside:
                videoPostImage = aside.find('figure').find('source')['data-srcset']
                # derive a file name from the url (the original reused an
                # undefined image_name here, which raised NameError)
                image_name = videoPostImage.split('/').pop().split('?')[0]
                raw1_media = requests.get(videoPostImage, stream=True)
                with open(image_name, "wb") as f:
                    f.write(raw1_media.content)
                print('IMAGE DOWNLOADED:', image_name)
                print('[ESPNScraper] :: image url:', videoPostImage)
            else:
                print('This post is a live report; it has no image.')

        POST_LOCAL_TIME = POST_DATE.text.strip()
        print('[ESPNScraper] :: post local time:', POST_LOCAL_TIME)
        post_author = art_.find('ul', class_='authors').text.strip()
        print('[ESPNScraper] :: post author:', post_author)
        po = art__.text.strip()
        print('[ESPNScraper] :: post text:', po)
        self.parse_data_to_json(news_url, article_header, pic__, POST_UTC_DATE,
                                POST_LOCAL_TIME, post_author, po)
    except Exception as exp:
        print('[ESPNScraper] :: scrape_section_posts() :: Got exception '
              'at home news url: %s' % exp)
        print(traceback.format_exc())
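# The image downloads above (and in the other scrapers) buffer the whole
# response in memory via r.content even though stream=True is passed. A
# minimal sketch of a shared helper that actually streams to disk in chunks;
# download_image() is a hypothetical name, not part of the original code.
import requests


def download_image(image_url, file_name):
    """Stream an image to disk in 8 KiB chunks instead of buffering it whole."""
    r = requests.get(image_url, stream=True)
    if r.status_code != 200:
        print('Failed to download image: %s' % image_url)
        return False
    with open(file_name, "wb") as f:
        for chunk in r.iter_content(chunk_size=8192):
            f.write(chunk)
    print('IMAGE DOWNLOADED:', file_name)
    return True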
def scrape_home(self, news):
    try:
        news_image = news.find('figure', class_='card-image')
        news__ = news_image.find('a')
        news_link = news__['href'].split('?')[0]
        folder_name = news_link.split('/')[4]
        parent_dir = 'C:/Users/lenovo/Desktop/scholarsbook_scrappers_data/Inc42/'
        blog_path = os.path.join(parent_dir, folder_name)
        os.mkdir(blog_path)
        print('[Inc42Scraper] :: blog folder has been created:', blog_path)
        os.chdir(blog_path)
        print('[Inc42Scraper] :: news link:', news_link)

        News_Image_url = news__.find('img')['src'].split('?')[0]
        print('[Inc42Scraper] :: news image url:', News_Image_url)
        News_Image = News_Image_url.split('/').pop()
        raw1_media = requests.get(News_Image_url, stream=True)
        with open(News_Image, "wb") as f:
            f.write(raw1_media.content)
        print('IMAGE DOWNLOADED:', News_Image)

        news_title_ = news.find('div', class_='card-content')
        news_title = news_title_.find('h2', class_='entry-title').text.strip()
        print('[Inc42Scraper] :: news title:', news_title)
        news_url = news_link

        # parse the news link
        try:
            r = requests.get(news_url, headers=get_request_headers())
            print('[Inc42Scraper] :: fetching data from url:', news_url)
            if not r.status_code == 200:
                print('[Inc42Scraper] :: Failed to get '
                      'content of url: %s' % news_url)
                return
            html_doc = r.content
            soup = BeautifulSoup(html_doc, 'html.parser')
            div1_ = soup.find('div', class_='site-content')
            div5_ = div1_.find(
                'div',
                class_='meta-wrapper single-meta-wrapper '
                       'single-meta-wrapper-top entry-meta clearfix')

            # Extract the date before the author so that it is still bound
            # when the author byline is missing (AttributeError fallback
            # below; the original read the date after the author and could
            # hit an unbound news_date in the handler).
            news_date = div5_.find('div', class_='date').text.strip()
            print('[Inc42Scraper] :: news date:', news_date)
            date_object = datetime.datetime.strptime(news_date, "%b %d, %Y")
            datetime_obj_utc = date_object.replace(tzinfo=timezone('UTC'))
            UTC_blog_date = datetime_obj_utc.strftime("%Y-%m-%d %H:%M:%S %Z%z")
            print('[Inc42Scraper] :: blog date (UTC):', UTC_blog_date)

            div7_ = div5_.find(
                'div', class_='post-meta large-7 medium-6 small-12 columns')
            news_author = div7_.find(
                'div', class_='author-name large').text.strip()
            print('[Inc42Scraper] :: news author:', news_author)

            news_subtitles = div1_.find(
                'div', class_='single-post-summary').text.strip()
            print('[Inc42Scraper] :: news subtitles:', news_subtitles)
            news_text = div1_.find(
                'div', class_='entry-content clearfix').text.strip()
            print('[Inc42Scraper] :: news description:', news_text)
        except AttributeError:
            # some posts carry no author byline
            print('this post does not have an author')
            news_author = ''
            news_subtitles = div1_.find(
                'div', class_='single-post-summary').text.strip()
            print('[Inc42Scraper] :: news subtitles:', news_subtitles)
            news_text = div1_.find(
                'div', class_='entry-content clearfix').text.strip()
            print('[Inc42Scraper] :: news description:', news_text)
        except Exception as exp:
            print('[Inc42Scraper] :: scrape_home() :: Got exception while '
                  'fetching data from Inc42 post url: %s' % exp)
            print(traceback.format_exc())

        self.parse_to_json(news_link, news_title, news_subtitles,
                           News_Image_url, news_author, news_date,
                           blog_path, news_text)
    except Exception as exp:
        print('[Inc42Scraper] :: scrape_home() :: Got exception while '
              'fetching data from the Inc42 homepage: %s' % exp)
        print(traceback.format_exc())
def run(self):
    try:
        # Team news scraping
        cricketTeams_URLs = [
            'http://www.espn.in/cricket/team/_/id/1/england',
            'http://www.espn.in/cricket/team/_/id/2/australia',
            'http://www.espn.in/cricket/team/_/id/3/south-africa',
            'http://www.espn.in/cricket/team/_/id/4/west-indies',
            'http://www.espn.in/cricket/team/_/id/5/new-zealand',
            'http://www.espn.in/cricket/team/_/id/6/india',
            'http://www.espn.in/cricket/team/_/id/7/pakistan',
            'http://www.espn.in/cricket/team/_/id/8/sri-lanka',
            'http://www.espn.in/cricket/team/_/id/9/zimbabwe',
            'http://www.espn.in/cricket/team/_/id/25/bangladesh/'
        ]
        for url in cricketTeams_URLs:
            print('[ESPNScraper] :: fetching data from team url:', url)
            r = requests.get(url, headers=get_request_headers())
            if not r.status_code == 200:
                print('[ESPNScraper] :: Failed to get '
                      'content of url: %s' % url)
                return
            html_doc = r.content
            soup = BeautifulSoup(html_doc, 'html.parser')
            sect1 = soup.find('section', id='pane-main')
            sect11 = sect1.find('section', id='main-container')
            sect2 = sect11.find('section', id='news-feed')
            news_feed_list = sect2.find('div', class_='container')
            section_news = news_feed_list.find_all(
                "article", {"class": "news-feed-item news-feed-story-package"})
            Headlines = news_feed_list.find_all(
                "article",
                {"class": "news-feed-item news-feed-story-package is-headline"})
            team_parsed_urls = []
            for news in Headlines:
                news_ = news.find("a", {"class": "story-link"})
                if news_:
                    team_parsed_urls.append(news_['data-popup-href'])
            for news in section_news:
                news_ = news.find("a", {"class": "story-link"})
                if news_:
                    news_url = news_['data-popup-href']
                    print('[ESPNScraper] :: section news url:', news_url)
                    team_parsed_urls.append(news_url)

            # Infinite scroll: the team news feed pages through this XHR
            # endpoint. (The original url contained the mojibake "®ion",
            # i.e. "&region" with "&reg" turned into the (R) sign.)
            infinite_scroll_url = (
                'https://secure.espn.com/core/minifeed?render=true'
                '&partial=newsfeed&limit=20&xhr=1&template=clubhouse'
                '&headlinestack=true&site=espn&lang=en&region=in'
                '&sport=cricket&pubkey=cricket-clubhouse&insider=false'
                '&device=desktop&country=in&edition-host=espn.in'
                '&site-type=full&userab=0&offset=')
            team = '&team=' + url.split('/')[7]  # team id from the url
            for i in range(0, 100, 25):
                scroll_url = infinite_scroll_url + str(i) + team
                print('[ESPNScraper] :: fetching data from infinite-scroll url:',
                      scroll_url)
                try:
                    raw_json = requests.get(scroll_url).text
                    data = json.loads(raw_json)
                    for item in data['content']['html']['items']:
                        qw = item['html']
                        try:
                            # the html comes back JSON-escaped, so strip the
                            # escaped quotes after parsing
                            soup = BeautifulSoup(json.dumps(qw), 'html.parser')
                            section_ = soup.find("a")['data-popup-href']
                            sect = section_.replace('\\"', '')
                            if re.search('clip', sect):
                                print('News contains only video (no text, '
                                      'no image); skipping.')
                            else:
                                team_parsed_urls.append(sect)
                                self.scrape_post_content(sect)
                        except Exception as exp:
                            print('[ESPNScraper] :: run() :: Got exception '
                                  'while fetching team data: %s' % exp)
                            print(traceback.format_exc())
                except Exception as exp:
                    print('[ESPNScraper] :: run() :: Got exception '
                          'while fetching team data: %s' % exp)
                    print(traceback.format_exc())
            sleep_scrapper('ESPNScraper')

        # Home page scraping
        base_url = 'https://www.espn.in/cricket/'
        print('[ESPNScraper] :: fetching data from base url:', base_url)
        ree = requests.get(base_url, headers=get_request_headers())
        if not ree.status_code == 200:
            print('[ESPNScraper] :: Failed to get '
                  'content of url: %s' % base_url)
            return
        html_doc = ree.content
        soup = BeautifulSoup(html_doc, 'html.parser')
        col2_feed = soup.find('section', class_='col-two contentFeed')
        for content in col2_feed.find_all("section", {"class": "contentItem"}):
            self.scrape_sports(content, cricketTeams_URLs)
        for contentfeed in col2_feed.find_all("article", {"class": "contentItem"}):
            self.scrape_sports(contentfeed, cricketTeams_URLs)

        # Infinite loading url for the home feed (same "&region" fix as above).
        infinite_url = (
            'https://onefeed.fan.api.espn.com/apis/v3/cached/contentEngine/'
            'oneFeed/leagues/cricket?source=ESPN.com%2B-%2BFAM&showfc=true'
            '&region=in&limit=15&lang=en&authorizedNetworks=espn_free'
            '&editionKey=espnin-en&device=desktop'
            '&pubkey=espncricinfo-en-in-cricket-index&isPremium=true'
            '&locale=in&featureFlags=expandAthlete&featureFlags=mmaGB&offset=')
        for i in range(10, 100, 15):
            scroll_url = infinite_url + str(i)
            print('[ESPNScraper] :: fetching data from infinite-scroll url:',
                  scroll_url)
            try:
                raw_json = requests.get(scroll_url,
                                        headers=get_request_headers()).text
                dataa = json.loads(raw_json)
                for data in dataa['feed']:
                    qw = data['data']['now'][0]
                    try:
                        # pick the "links" entry and follow its web href
                        result = [(key, value) for key, value in sorted(qw.items())
                                  if key.startswith("links")]
                        scroll_news_url = result[0][1]['web']['href']
                        print('FETCHING post from scroll url')
                        self.scrape_post_content(scroll_news_url)
                    except Exception as exp:
                        print('[ESPNScraper] :: run() :: Got exception '
                              'at scroll url: %s' % exp)
                        print(traceback.format_exc())
            except Exception as exp:
                print('[ESPNScraper] :: run() :: Got exception '
                      'at scroll url: %s' % exp)
                print(traceback.format_exc())
            sleep_scrapper('ESPNScraper')
    except Exception as exp:
        print('[ESPNScraper] :: run() :: Got exception: %s' % exp)
        print(traceback.format_exc())
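# The sorted(qw.items()) / startswith("links") dance in run() above is an
# indirect way of reaching item['links']['web']['href']. A minimal equivalent
# sketch; extract_web_href() is a hypothetical helper, and the payload shape is
# inferred only from the key prefix the original code matches, so treat it as
# an assumption about the oneFeed response.
def extract_web_href(item):
    """Return the article's web url from a oneFeed item dict, or None."""
    links = item.get('links')
    if isinstance(links, dict):
        return links.get('web', {}).get('href')
    return None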