def run(self):
    try:
        url = 'https://www.countryliving.com/food-drinks/'
        print('[AnyWebsiteScraper] :: fetching data from url:', url)
        r = requests.get(url, headers=get_request_headers())
        if not r.status_code == 200:
            print('[AnyWebsiteScraper] :: Failed to get '
                  'content of url: %s' % url)
            return
        html_doc = r.content
        soup = BeautifulSoup(html_doc, 'html.parser')
        div11 = soup.find('div', class_='feed feed-grid')
        for div33 in div11.find_all('div', class_='simple-item'):
            self.scrap_result_row(div33)
        sleep_scrapper('AnyWebsiteScraper')

        # Infinite scrolling: the site loads further items through this AJAX
        # endpoint, paged via the trailing `page=` parameter.
        base_url = ('https://www.countryliving.com/ajax/infiniteload/'
                    '?id=34aae02d-c035-47e5-95c5-b87ba30c1dd8'
                    '&class=CoreModels%5Csections%5CSectionModel'
                    '&viewset=section&cachebuster=&page=')
        for i in range(2, 100):
            url = base_url + str(i)
            print('[AnyWebsiteScraper] :: fetching data from url:', url)
            r = requests.get(url, headers=get_request_headers())
            if not r.status_code == 200:
                print('[AnyWebsiteScraper] :: Failed to get '
                      'content of url: %s' % url)
                return
            html_doc = r.content
            soup = BeautifulSoup(html_doc, 'html.parser')
            div11 = soup.find('div', class_='feed feed-grid')
            for div33 in div11.find_all('div', class_='simple-item'):
                self.scrap_result_row(div33)
            sleep_scrapper('AnyWebsiteScraper')
    except Exception as exp:
        print('[AnyWebsiteScraper] :: run() :: Got exception: %s' % exp)
        print(traceback.format_exc())
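# Every scraper in this section relies on two shared helpers whose bodies are
# not shown here: get_request_headers() and sleep_scrapper(). A minimal sketch,
# assuming the first only needs to supply browser-like headers and the second
# merely throttles between requests; both bodies are assumptions, not the
# original implementations.
import random
import time


def get_request_headers():
    """Return browser-like headers so requests are less likely to be rejected."""
    return {
        'User-Agent': ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                       '(KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'),
        'Accept-Language': 'en-US,en;q=0.9',
    }


def sleep_scrapper(scraper_name):
    """Pause for a random short interval between page fetches."""
    delay = random.randint(5, 15)
    print('[%s] :: sleeping for %d seconds' % (scraper_name, delay))
    time.sleep(delay)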
def run(self):
    try:
        url = ('https://api.missingkids.org/missingkids'
               '/servlet/PubCaseSearchServlet?'
               'act=usMapSearch&missState=%s&searchLang=en_US&casedata='
               'latest' % self.state)
        print('[MissingKidsScraper] :: fetching data from url: %s' % url)
        r = requests.get(url, headers=get_request_headers())
        if not r.status_code == 200:
            print('[MissingKidsScraper] :: failed to '
                  'get content of url: %s' % url)
            return
        html_doc = r.content
        soup = BeautifulSoup(html_doc, 'html.parser')
        for td in soup.find_all('td', width="40%"):
            self.scrap_result_row(td)
        # sleep_scrapper('MissingKidsScraper')
    except Exception as exp:
        print('[MissingKidsScraper] :: run() :: Got exception: %s' % exp)
        print(traceback.format_exc())
def run(self):
    try:
        url = ('https://www.bedbathandbeyond.com/store/category'
               '/%s/%s/%s/%s/'
               % (self.product_category, self.product_subcategory,
                  self.product_title, self.product_code))
        print('[BedBathAndBeyondScraper] :: fetching data from url:', url)
        r = requests.get(url, headers=get_request_headers())
        if not r.status_code == 200:
            print('[BedBathAndBeyondScraper] :: Failed to get '
                  'content of url: %s' % url)
            return
        html_doc = r.content
        soup = BeautifulSoup(html_doc, 'html.parser')
        for div in soup.find_all('div', class_='productContent ec_listing'):
            self.scrap_result_row(div)
        sleep_scrapper('BedBathAndBeyondScraper')
    except Exception as exp:
        print('[BedBathAndBeyondScraper] :: run() :: Got exception: %s' % exp)
        print(traceback.format_exc())
def run(self):
    try:
        base_url = ('https://www.flipkart.com/search?as=off&as-show='
                    'on&otracker=start&page=')
        suffix = '&q=%s&viewType=list' % self.product
        for i in range(1, 100):
            url = base_url + str(i) + suffix
            print('[FlipkartScraper] :: fetching data from url:', url)
            r = requests.get(url, headers=get_request_headers())
            if not r.status_code == 200:
                print('[FlipkartScraper] :: Failed to get the content '
                      'of url: %s' % url)
                return
            html_doc = r.content
            soup = BeautifulSoup(html_doc, 'html.parser')
            for div in soup.find_all('div', class_='_1-2Iqu row'):
                self.scrap_result_row(div)
            sleep_scrapper('FlipkartScraper')
    except Exception as exp:
        print('[FlipkartScraper] :: run() :: Got exception: %s' % exp)
        print(traceback.format_exc())
def run(self):
    base_url = ('https://www.homedepot.com/b/'
                '%s/N-5yc1vZbm79?Nao=' % self.product)
    suffix = '&Ns=None'
    # Home Depot paginates 12 products per page via the Nao offset.
    for j in range(0, 1000, 12):
        url = ''
        try:
            url = base_url + str(j) + suffix
            print('[HomeDepot] :: fetching data from url:', url)
            r = requests.get(url, headers=get_request_headers())
            if not r.status_code == 200:
                print('[HomeDepot] :: Failed to get '
                      'content of url: %s' % url)
                return
            html_doc = r.content
            soup = BeautifulSoup(html_doc, 'html.parser')
            for div in soup.find_all('div', class_='pod-inner'):
                self.scrap_result_row(div)
            sleep_scrapper('HomeDepot')
        except Exception as exp:
            print('[HomeDepot] :: run() :: Got exception: '
                  '%s while fetching data from url: %s' % (exp, url))
def run(self):
    base_url = ('https://www.indeed.co.in/jobs?q='
                '%s&l=%s&start=' % (self.post, self.location))
    # Indeed paginates 10 results per page via the start offset.
    for j in range(0, 1000, 10):
        url = ''
        try:
            url = base_url + str(j)
            print('[IndeedScraper] :: fetching data from url:', url)
            r = requests.get(url, headers=get_request_headers())
            if not r.status_code == 200:
                print('[IndeedScraper] :: Failed to '
                      'get content of url: %s' % url)
                return
            html_doc = r.content
            soup = BeautifulSoup(html_doc, 'html.parser')
            for div in soup.find_all('div'):
                # skip divs without classes (note: dict.has_key() is
                # Python 2 only; the `in` operator works on both)
                if 'class' not in div.attrs:
                    continue
                cls = div.attrs['class']
                if 'row' in cls and 'result' in cls:
                    self.scrap_result_row(div)
            sleep_scrapper('IndeedScraper')
        except Exception as exp:
            print('[IndeedScraper] :: run() :: Got exception: '
                  '%s while fetching data from url: %s' % (exp, url))
def run(self):
    url = ''
    try:
        base_url = ('https://www.overstock.com/Home-Garden/%s/%s/'
                    % (self.product_category, self.product_code))
        suffix = 'subcat.html?page='
        for j in range(1, 100):
            url = base_url + suffix + str(j)
            print('[OverStockScraper] :: fetching data from url:', url)
            r = requests.get(url, headers=get_request_headers())
            if not r.status_code == 200:
                print('[OverStockScraper] :: Failed to '
                      'get content of url: %s' % url)
                return
            html_doc = r.content
            soup = BeautifulSoup(html_doc, 'html.parser')
            for div in soup.find_all('div', class_='product-tile'):
                self.scrap_result_row(div)
            sleep_scrapper('OverStockScraper')
    except Exception as exp:
        print('[OverStockScraper] :: run() :: Got exception: '
              '%s while fetching data from url: %s' % (exp, url))
        print(traceback.format_exc())
def run(self): base_url = "https://www.yelp.com/search?find_desc=" \ "Dry+Cleaners&find_loc=New+York%2C+NY&start=" for j in range(1, 1000, 10): try: url = base_url + str(j) print '[YelpScraper] :: fetching data from url: ', url r = requests.get(url, headers=get_request_headers()) if not r.status_code == 200: print "Failed to get content of url: %s" % url return html_doc = r.content soup = BeautifulSoup(html_doc, 'html.parser') li_class = "regular-search-result" # parsing html content to fet information about dry cleaners for li in soup.find_all('li', class_=li_class): self.scrap_row_yelp(li) # break # just use it for testing only sleep_scrapper('YelpScraper') except Exception as exp: print '[YelpScraper] :: run() :: Got exceptiion : %s ' % exp print(traceback.format_exc())
def run(self):
    try:
        base_url = 'https://inc42.com/buzz/'
        r = requests.get(base_url, headers=get_request_headers())
        print('[Inc42Scraper] :: fetching data from url:', base_url)
        if not r.status_code == 200:
            print('[Inc42Scraper] :: Failed to get '
                  'content of url: %s' % base_url)
            return
        html_doc = r.content
        soup = BeautifulSoup(html_doc, 'html.parser')
        div1 = soup.find('div', class_='site-content')
        inc_news = div1.find_all('div', {'class': 'card-wrapper horizontal-card'})
        for news in inc_news:
            self.scrape_home(news)
        sleep_scrapper('Inc42Scraper')

        # Next pages: inc42 uses WordPress-style /buzz/page/<n>/ pagination.
        # (The original concatenation produced .../buzzpage2, which is not a
        # valid page url.)
        for i in range(2, 100):
            url = base_url + 'page/' + str(i) + '/'
            r = requests.get(url, headers=get_request_headers())
            print('[Inc42Scraper] :: fetching data from url:', url)
            if not r.status_code == 200:
                print('[Inc42Scraper] :: Failed to get '
                      'content of url: %s' % url)
                return
            html_doc = r.content
            soup = BeautifulSoup(html_doc, 'html.parser')
            div1 = soup.find('div', class_='site-content')
            inc_news = div1.find_all(
                'div', {'class': 'card-wrapper horizontal-card'})
            for news in inc_news:
                self.scrape_home(news)
            sleep_scrapper('Inc42Scraper')
    except Exception as exp:
        print('[Inc42Scraper] :: run() :: Got exception while fetching data '
              'from the Inc42 homepage: %s' % exp)
        print(traceback.format_exc())
def scrap_yellowpages(self, url, scrap_type):
    logging.info("\nScraping YellowPages url: %s\n" % url)
    r = requests.get(url, headers=get_request_headers())
    if not r.status_code == 200:
        logging.error("Failed to get content of url: %s" % url)
        return
    html_doc = r.content
    soup = BeautifulSoup(html_doc, 'html.parser')
    for div in soup.find_all('div', class_='info'):
        try:
            self.scrap_row_yellowpages(div, scrap_type)
        except Exception as exp:
            logging.error("scrap_yellowpages() :: Got exception: %s" % exp)
            logging.error(traceback.format_exc())
def scrap_groupon(self, url, scrap_type):
    logging.info("\n=======> Scraping Groupon url: %s\n" % url)
    r = requests.get(url, headers=get_request_headers())
    if not r.status_code == 200:
        logging.error(
            "scrap_groupon() :: Failed to get content of url: %s" % url)
        return
    html_doc = r.content
    soup = BeautifulSoup(html_doc, 'html.parser')
    div_class = "cui-content c-bdr-gray-clr ch-bdr-gray-md"
    # parse the html content to get information about the listed deals
    for div in soup.find_all('div', class_=div_class):
        try:
            self.scrap_row_groupon(div, scrap_type)
        except Exception as exp:
            logging.error("scrap_groupon() :: Got exception: %s" % exp)
            logging.error(traceback.format_exc())
def scrap_yelp(self, url, scrap_type):
    logging.info("\nScraping Yelp url: %s\n" % url)
    r = requests.get(url, headers=get_request_headers())
    if not r.status_code == 200:
        logging.error("Failed to get content of url: %s" % url)
        return
    html_doc = r.content
    soup = BeautifulSoup(html_doc, 'html.parser')
    # parse the html content to get information about dry cleaners
    for li in soup.find_all('li', class_='regular-search-result'):
        try:
            self.scrap_row_yelp(li, scrap_type)
        except Exception as exp:
            logging.error("scrap_yelp() :: Got exception: %s" % exp)
            logging.error(traceback.format_exc())
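# scrap_yellowpages(), scrap_groupon() and scrap_yelp() above repeat the same
# fetch-check-parse boilerplate. A minimal refactoring sketch; fetch_soup() is
# a hypothetical helper, not part of the original code.
import logging
import requests
from bs4 import BeautifulSoup


def fetch_soup(url):
    """Fetch a url and return parsed HTML, or None on a non-200 response."""
    r = requests.get(url, headers=get_request_headers())
    if r.status_code != 200:
        logging.error("Failed to get content of url: %s" % url)
        return None
    return BeautifulSoup(r.content, 'html.parser')

# Usage sketch inside any of the scrap_* methods:
#     soup = fetch_soup(url)
#     if soup is None:
#         return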
def scrap_python_developer(self, url):
    print("\nScraping python developer jobs: %s\n" % url)
    r = requests.get(url, headers=get_request_headers())
    if not r.status_code == 200:
        print("Failed to get content of url: %s" % url)
        return
    html_doc = r.content
    soup = BeautifulSoup(html_doc, 'html.parser')
    # parse the html content to get information about python developer jobs
    for div in soup.find_all('div'):
        # skip divs without classes (dict.has_key() is Python 2 only)
        if 'class' not in div.attrs:
            continue
        cls = div.attrs['class']
        if 'row' in cls and 'result' in cls:
            self.scrap_result_row(div)
def run(self):
    base_url = ('https://www.yellowpages.com/search?search_terms=software+company'
                '&geo_location_terms=New+York%2C+NY&page=')
    for j in range(27, 100):
        try:
            url = base_url + str(j)
            print('[YellowPagesScraper] :: fetching data from url:', url)
            r = requests.get(url, headers=get_request_headers())
            if not r.status_code == 200:
                print('[YellowPagesScraper] :: Failed to get the content '
                      'of url: %s' % url)
                return
            html_doc = r.content
            soup = BeautifulSoup(html_doc, 'html.parser')
            for div in soup.find_all('div', class_='info'):
                self.scrap_result_row(div)
            sleep_scrapper('YellowPagesScraper')
        except Exception as exp:
            print('[YellowPagesScraper] :: run() :: Got exception: %s' % exp)
            print(traceback.format_exc())
def run(self): base_url = "https://www.yelp.com/search?find_desc=%s&find_loc=%s,+NY&start=" % ( self.product, self.location) for j in range(1, 1000, 10): try: url = base_url + str(j) print '[YelpScraper] :: fetching data from url: ', url r = requests.get(url, headers=get_request_headers()) if not r.status_code == 200: print '[YelpScraper] :: Failed to get content of url: %s' % url return html_doc = r.content soup = BeautifulSoup(html_doc, 'html.parser') for li in soup.find_all('li', class_='regular-search-result'): self.scrap_row_yelp(li) sleep_scrapper('YelpScraper') except Exception as exp: print '[YelpScraper] :: run() :: Got exceptiion : %s ' % exp print(traceback.format_exc())
def run(self):
    for j in range(0, 2):
        try:
            url = ('https://www.samsclub.com/sams/coffee-tea-cocoa/1493.cp'
                   '?xid=cat_sub&navAction=jump')
            print('[Samsclub] :: fetching data from url:', url)
            r = requests.get(url, headers=get_request_headers())
            if not r.status_code == 200:
                print('[Samsclub] :: Failed to get content of url: %s' % url)
                return
            html_doc = r.content
            soup = BeautifulSoup(html_doc, 'html.parser')
            # parse the html content to get information about the products
            for div in soup.find_all('div', class_='products-card'):
                self.scrap_result_row(div)
            sleep_scrapper('Samsclub')
        except Exception as exp:
            print('[Samsclub] :: run() :: Got exception: %s' % exp)
def run(self):
    try:
        url = ('https://news.google.com/news/headlines/section/topic'
               '/NATION.en_in/India?ned=in&hl=en-IN&gl=IN')
        print('[GoogleNewsScraper] :: fetching data from url:', url)
        r = requests.get(url, headers=get_request_headers())
        if not r.status_code == 200:
            print('[GoogleNewsScraper] :: Failed to get '
                  'content of url: %s' % url)
            return
        html_doc = r.content
        soup = BeautifulSoup(html_doc, 'html.parser')
        for div in soup.find_all('div', class_='v4IxVd'):
            self.scrap_result_row(div)
        sleep_scrapper('GoogleNewsScraper')
    except Exception as exp:
        print('[GoogleNewsScraper] :: run() :: Got exception: %s' % exp)
        print(traceback.format_exc())
def scrap_result_row(self, div33):
    try:
        # blog link
        div44 = div33.find('a', {'class': 'simple-item-image item-image'})
        link = div44['href']
        prefix = 'https://www.countryliving.com'
        blog_link = prefix + link

        # create a folder for this blog post
        directory = link.split('/')[3]
        parent_dir = '/home/soumya/Documents/scrappeddata/Food-Drink'
        blog_path = os.path.join(parent_dir, directory)
        os.mkdir(blog_path)
        print('[AnyWebsiteScraper] :: blog folder has been created:', blog_path)
        os.chdir(blog_path)

        # blog posting date, converted to a UTC timestamp string
        div66 = div33.find('div', class_='simple-item-metadata')
        blog_date = div66.find(
            'div', class_='publish-date simple-item-publish-date js-date').text.strip()
        date_object = datetime.datetime.strptime(blog_date, "%b %d, %Y")
        datetime_obj_utc = date_object.replace(tzinfo=timezone('UTC'))
        UTC_blog_date = datetime_obj_utc.strftime("%Y-%m-%d %H:%M:%S %Z%z")
        print('[AnyWebsiteScraper] :: blog_date:', UTC_blog_date)

        # blog picture
        picture = div44.find('span')['data-lqip']
        blog_picture = picture.split('?')[0]
        blog_image_name = blog_picture.split('/').pop()
        raw1_media = requests.get(blog_picture, stream=True)
        with open(blog_image_name, "wb") as f:
            f.write(raw1_media.content)
        print('IMAGE DOWNLOADED:', blog_image_name)
        image_path = os.path.join(os.getcwd(), blog_image_name)
        image_list = [image_path]
        video_list = []
        pdf_list = []
        print('[AnyWebsiteScraper] :: blog_media:', image_path)

        # blog title and short description
        blog_title = div33.find('a', class_='simple-item-title item-title').text.strip()
        print('[AnyWebsiteScraper] :: blog_title:', blog_title)
        blog_subtitle = div33.find('div', class_='simple-item-dek item-dek').text.strip()
        print('[AnyWebsiteScraper] :: blog_short_desc:', blog_subtitle)

        # fetch the full blog post
        print('[AnyWebsiteScraper] :: fetching data from blog_link:', blog_link)
        r = requests.get(blog_link, headers=get_request_headers())
        if not r.status_code == 200:
            print('[AnyWebsiteScraper] :: Failed to get '
                  'content of url: %s' % blog_link)
            return
        soup = BeautifulSoup(r.content, 'html.parser')
        div111 = soup.find('div', class_='site-content')

        # blog author name (some posts carry no byline, hence the fallback;
        # the original re-fetched the whole page in an AttributeError handler,
        # duplicated verbatim, which this consolidates)
        try:
            author = div111.find('div', class_='content-info-metadata')
            auth1 = author.find('div', class_='byline-with-image')
            blog_author = auth1.find('div', class_='byline').text.strip()
        except AttributeError:
            blog_author = ''
        print('[AnyWebsiteScraper] :: blog_author:', blog_author)

        # extraction rule for normal blogs
        blog_description = ''
        div2222 = div111.find('div', class_='content-container standard-container')
        if div2222:
            div333 = div2222.find('div', class_='standard-body')
            div8889 = div333.find(
                'div', class_='article-body-content standard-body-content')
            blog_description = div8889.find('p', class_='body-text').text.strip()
            print('[AnyWebsiteScraper] :: blog_description:',
                  blog_description.encode('utf-8'))

        # extraction rule for slideshow blogs
        try:
            div777 = div111.find('div', class_='slideshow-outer')
            div456 = div777.find('div', class_='slideshow-lede active')
            blog_description = div456.find(
                'div', class_='slideshow-desktop-dek').text.strip()
            print('[AnyWebsiteScraper] :: slideshow blog_description:',
                  blog_description.encode('utf-8'))
        except AttributeError:
            pass

        # dump everything collected about this post to json
        data = {
            "blog_title": blog_title,
            "blog_sub_title": blog_subtitle,
            "blog_author": blog_author,
            "blog_date": UTC_blog_date,
            "blog_url": blog_link,
            "blog_text": blog_description,
            "blog_storage_path": blog_path,
            "blog_image_name": blog_image_name,
            "blog_image_list": image_list,
            "blog_video_list": video_list,
            "blog_pdf_list": pdf_list,
        }
        with open('personal.json', 'w') as json_file:
            json.dump(data, json_file)
            print('json file has been created')
    except Exception as exp:
        print('[AnyWebsiteScraper] :: scrap_result_row() :: '
              'Got exception: %s' % exp)
        print(traceback.format_exc())
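# Both scrap_result_row() above and scrape_home() further below convert a
# display date such as "Jan 24, 2020" into a UTC timestamp string. A minimal
# sketch of that conversion as a standalone helper; to_utc_string() is a
# hypothetical name, and it assumes `timezone` is pytz's timezone(), as the
# surrounding code does.
import datetime
from pytz import timezone


def to_utc_string(date_string, fmt="%b %d, %Y"):
    """Parse a date like 'Jan 24, 2020' and render it as a UTC timestamp."""
    naive = datetime.datetime.strptime(date_string, fmt)
    aware = naive.replace(tzinfo=timezone('UTC'))  # midnight, labelled UTC
    return aware.strftime("%Y-%m-%d %H:%M:%S %Z%z")

# e.g. to_utc_string("Jan 24, 2020") -> "2020-01-24 00:00:00 UTC+0000"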
def scrape_home_sections(self, section_heading_url):
    try:
        print('[ESPNScraper] :: fetching data from section url:', section_heading_url)
        r = requests.get(section_heading_url, headers=get_request_headers())
        if not r.status_code == 200:
            print('[ESPNScraper] :: Failed to get '
                  'content of url: %s' % section_heading_url)
            return
        html_doc = r.content
        soup = BeautifulSoup(html_doc, 'html.parser')
        col2_feed = soup.find('section', class_='col-two contentFeed')
        # Story links appear under both <section> and <article> content items;
        # the original handled the two cases with duplicated blocks.
        for tag in ('section', 'article'):
            for contentfeed in col2_feed.find_all(tag, {"class": "contentItem"}):
                div_content = contentfeed.find_all(
                    'section',
                    class_='contentItem__content contentItem__content--story '
                           'has-image has-video contentItem__content--collection')
                if div_content:
                    for post in div_content:
                        news_1 = post.find(
                            "a",
                            {"class": "contentItem__padding "
                                      "contentItem__padding--border"})
                        news_url = news_1['href']
                        print(news_url)
                        self.scrape_section_posts(news_url)
                else:
                    print('section has video news only')
    except AttributeError as e:
        print('This section was scraped already; cannot get data.')
        print('[ESPNScraper] :: scrape_home_sections() :: Got exception '
              'while fetching data from section url: %s' % e)
        print(traceback.format_exc())
    except TypeError as er:
        print('This section only shows ads.')
        print('[ESPNScraper] :: scrape_home_sections() :: Got exception '
              'while fetching data from section url: %s' % er)
        print(traceback.format_exc())
    except Exception as exp:
        print('[ESPNScraper] :: scrape_home_sections() :: Got exception '
              'while fetching data from section url: %s' % exp)
        print(traceback.format_exc())
def scrape_section_posts(self, news_url):
    try:
        # Relative links need the site origin prepended.
        if not news_url.startswith('http'):
            news_url = 'https://www.espn.in' + news_url
        print('[ESPNScraper] :: fetching data from home post url:', news_url)
        r = requests.get(news_url, headers=get_request_headers())
        if not r.status_code == 200:
            print('[ESPNScraper] :: Failed to get '
                  'content of post url: %s' % news_url)
            return
        html_doc = r.content
        soup = BeautifulSoup(html_doc, 'html.parser')
        sect1 = soup.find('section', id='pane-main')
        sect11 = sect1.find('section', id='main-container')
        sect111 = sect11.find('div', {'class': 'main-content'})
        sect2 = sect111.find('section', id='article-feed')
        div555 = sect2.find('div', class_='container')
        article_header = div555.find('header', class_='article-header').text.strip()
        art__ = div555.find('div', class_='article-body')
        art_ = art__.find('div', class_='article-meta')
        POST_DATE = art_.find('span', class_='timestamp')
        POST_UTC_DATE = POST_DATE['data-date']
        print('[ESPNScraper] :: post UTC timestamp:', POST_UTC_DATE)

        # build a folder name from the timestamp, e.g. "2020-01-24 10 30 00Z"
        utc_date = POST_UTC_DATE.split('T')
        utc_time = utc_date[1].split(':')
        directory = (utc_date[0] + ' ' + utc_time[0] + ' '
                     + utc_time[1] + ' ' + utc_time[2])
        parent_dir = 'C:/Users/lenovo/Desktop/scholarsbook_scrappers_data/ESPN.in/cricket/'
        blog_path = os.path.join(parent_dir, directory)
        os.mkdir(blog_path)
        print('[ESPNScraper] :: post folder has been created:', blog_path)
        os.chdir(blog_path)
        print('[ESPNScraper] :: article header:', article_header)

        # article image (regular posts)
        img1 = div555.find('figure', class_='article-figure dim16x9')
        pic__ = ''
        if img1:
            img2 = img1.find('div', class_='img-wrap')
            pic__ = img2.find('source')['srcset']
            image_name = pic__.split('?').pop().split('2F').pop().split('&')[0]
            print('[ESPNScraper] :: image url:', pic__)
            raw1_media = requests.get(pic__, stream=True)
            with open(image_name, "wb") as f:
                f.write(raw1_media.content)
            print('IMAGE DOWNLOADED:', image_name)
        else:
            # video posts keep their poster image in an inline <aside>
            aside = art__.find('aside', class_='inline inline-photo full')
            if aside:
                videoPostImage = aside.find('figure').find('source')['data-srcset']
                # derive a file name from the url (the original reused an
                # undefined image_name here, which raised NameError)
                image_name = videoPostImage.split('/').pop().split('?')[0]
                raw1_media = requests.get(videoPostImage, stream=True)
                with open(image_name, "wb") as f:
                    f.write(raw1_media.content)
                print('IMAGE DOWNLOADED:', image_name)
                print('[ESPNScraper] :: image url:', videoPostImage)
            else:
                print('This post is a live report; it has no image.')

        POST_LOCAL_TIME = POST_DATE.text.strip()
        print('[ESPNScraper] :: post local time:', POST_LOCAL_TIME)
        post_author = art_.find('ul', class_='authors').text.strip()
        print('[ESPNScraper] :: post author:', post_author)
        po = art__.text.strip()
        print('[ESPNScraper] :: post text:', po)
        self.parse_data_to_json(news_url, article_header, pic__, POST_UTC_DATE,
                                POST_LOCAL_TIME, post_author, po)
    except Exception as exp:
        print('[ESPNScraper] :: scrape_section_posts() :: Got exception '
              'at home news url: %s' % exp)
        print(traceback.format_exc())
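# The image downloads above (and in the other scrapers) buffer the whole
# response in memory via r.content even though stream=True is passed. A
# minimal sketch of a shared helper that actually streams to disk in chunks;
# download_image() is a hypothetical name, not part of the original code.
import requests


def download_image(image_url, file_name):
    """Stream an image to disk in 8 KiB chunks instead of buffering it whole."""
    r = requests.get(image_url, stream=True)
    if r.status_code != 200:
        print('Failed to download image: %s' % image_url)
        return False
    with open(file_name, "wb") as f:
        for chunk in r.iter_content(chunk_size=8192):
            f.write(chunk)
    print('IMAGE DOWNLOADED:', file_name)
    return True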
def scrape_home(self, news):
    try:
        news_image = news.find('figure', class_='card-image')
        news__ = news_image.find('a')
        news_link = news__['href'].split('?')[0]
        folder_name = news_link.split('/')[4]
        parent_dir = 'C:/Users/lenovo/Desktop/scholarsbook_scrappers_data/Inc42/'
        blog_path = os.path.join(parent_dir, folder_name)
        os.mkdir(blog_path)
        print('[Inc42Scraper] :: blog folder has been created:', blog_path)
        os.chdir(blog_path)
        print('[Inc42Scraper] :: news link:', news_link)

        News_Image_url = news__.find('img')['src'].split('?')[0]
        print('[Inc42Scraper] :: news image url:', News_Image_url)
        News_Image = News_Image_url.split('/').pop()
        raw1_media = requests.get(News_Image_url, stream=True)
        with open(News_Image, "wb") as f:
            f.write(raw1_media.content)
        print('IMAGE DOWNLOADED:', News_Image)

        news_title_ = news.find('div', class_='card-content')
        news_title = news_title_.find('h2', class_='entry-title').text.strip()
        print('[Inc42Scraper] :: news title:', news_title)
        news_url = news_link

        # parse the news link
        try:
            r = requests.get(news_url, headers=get_request_headers())
            print('[Inc42Scraper] :: fetching data from url:', news_url)
            if not r.status_code == 200:
                print('[Inc42Scraper] :: Failed to get '
                      'content of url: %s' % news_url)
                return
            html_doc = r.content
            soup = BeautifulSoup(html_doc, 'html.parser')
            div1_ = soup.find('div', class_='site-content')
            div5_ = div1_.find(
                'div',
                class_='meta-wrapper single-meta-wrapper '
                       'single-meta-wrapper-top entry-meta clearfix')

            # Extract the date before the author so that it is still bound
            # when the author byline is missing (AttributeError fallback
            # below; the original read the date after the author and could
            # hit an unbound news_date in the handler).
            news_date = div5_.find('div', class_='date').text.strip()
            print('[Inc42Scraper] :: news date:', news_date)
            date_object = datetime.datetime.strptime(news_date, "%b %d, %Y")
            datetime_obj_utc = date_object.replace(tzinfo=timezone('UTC'))
            UTC_blog_date = datetime_obj_utc.strftime("%Y-%m-%d %H:%M:%S %Z%z")
            print('[Inc42Scraper] :: blog date (UTC):', UTC_blog_date)

            div7_ = div5_.find(
                'div', class_='post-meta large-7 medium-6 small-12 columns')
            news_author = div7_.find(
                'div', class_='author-name large').text.strip()
            print('[Inc42Scraper] :: news author:', news_author)

            news_subtitles = div1_.find(
                'div', class_='single-post-summary').text.strip()
            print('[Inc42Scraper] :: news subtitles:', news_subtitles)
            news_text = div1_.find(
                'div', class_='entry-content clearfix').text.strip()
            print('[Inc42Scraper] :: news description:', news_text)
        except AttributeError:
            # some posts carry no author byline
            print('this post does not have an author')
            news_author = ''
            news_subtitles = div1_.find(
                'div', class_='single-post-summary').text.strip()
            print('[Inc42Scraper] :: news subtitles:', news_subtitles)
            news_text = div1_.find(
                'div', class_='entry-content clearfix').text.strip()
            print('[Inc42Scraper] :: news description:', news_text)
        except Exception as exp:
            print('[Inc42Scraper] :: scrape_home() :: Got exception while '
                  'fetching data from Inc42 post url: %s' % exp)
            print(traceback.format_exc())

        self.parse_to_json(news_link, news_title, news_subtitles,
                           News_Image_url, news_author, news_date,
                           blog_path, news_text)
    except Exception as exp:
        print('[Inc42Scraper] :: scrape_home() :: Got exception while '
              'fetching data from the Inc42 homepage: %s' % exp)
        print(traceback.format_exc())
def run(self):
    try:
        # Team news scraping
        cricketTeams_URLs = [
            'http://www.espn.in/cricket/team/_/id/1/england',
            'http://www.espn.in/cricket/team/_/id/2/australia',
            'http://www.espn.in/cricket/team/_/id/3/south-africa',
            'http://www.espn.in/cricket/team/_/id/4/west-indies',
            'http://www.espn.in/cricket/team/_/id/5/new-zealand',
            'http://www.espn.in/cricket/team/_/id/6/india',
            'http://www.espn.in/cricket/team/_/id/7/pakistan',
            'http://www.espn.in/cricket/team/_/id/8/sri-lanka',
            'http://www.espn.in/cricket/team/_/id/9/zimbabwe',
            'http://www.espn.in/cricket/team/_/id/25/bangladesh/'
        ]
        for url in cricketTeams_URLs:
            print('[ESPNScraper] :: fetching data from team url:', url)
            r = requests.get(url, headers=get_request_headers())
            if not r.status_code == 200:
                print('[ESPNScraper] :: Failed to get '
                      'content of url: %s' % url)
                return
            html_doc = r.content
            soup = BeautifulSoup(html_doc, 'html.parser')
            sect1 = soup.find('section', id='pane-main')
            sect11 = sect1.find('section', id='main-container')
            sect2 = sect11.find('section', id='news-feed')
            news_feed_list = sect2.find('div', class_='container')
            section_news = news_feed_list.find_all(
                "article", {"class": "news-feed-item news-feed-story-package"})
            Headlines = news_feed_list.find_all(
                "article",
                {"class": "news-feed-item news-feed-story-package is-headline"})
            team_parsed_urls = []
            for news in Headlines:
                news_ = news.find("a", {"class": "story-link"})
                if news_:
                    team_parsed_urls.append(news_['data-popup-href'])
            for news in section_news:
                news_ = news.find("a", {"class": "story-link"})
                if news_:
                    news_url = news_['data-popup-href']
                    print('[ESPNScraper] :: section news url:', news_url)
                    team_parsed_urls.append(news_url)

            # Infinite scroll: the team news feed pages through this XHR
            # endpoint. (The original url contained the mojibake "®ion",
            # i.e. "&region" with "&reg" turned into the (R) sign.)
            infinite_scroll_url = (
                'https://secure.espn.com/core/minifeed?render=true'
                '&partial=newsfeed&limit=20&xhr=1&template=clubhouse'
                '&headlinestack=true&site=espn&lang=en&region=in'
                '&sport=cricket&pubkey=cricket-clubhouse&insider=false'
                '&device=desktop&country=in&edition-host=espn.in'
                '&site-type=full&userab=0&offset=')
            team = '&team=' + url.split('/')[7]  # team id from the url
            for i in range(0, 100, 25):
                scroll_url = infinite_scroll_url + str(i) + team
                print('[ESPNScraper] :: fetching data from infinite-scroll url:',
                      scroll_url)
                try:
                    raw_json = requests.get(scroll_url).text
                    data = json.loads(raw_json)
                    for item in data['content']['html']['items']:
                        qw = item['html']
                        try:
                            # the html comes back JSON-escaped, so strip the
                            # escaped quotes after parsing
                            soup = BeautifulSoup(json.dumps(qw), 'html.parser')
                            section_ = soup.find("a")['data-popup-href']
                            sect = section_.replace('\\"', '')
                            if re.search('clip', sect):
                                print('News contains only video (no text, '
                                      'no image); skipping.')
                            else:
                                team_parsed_urls.append(sect)
                                self.scrape_post_content(sect)
                        except Exception as exp:
                            print('[ESPNScraper] :: run() :: Got exception '
                                  'while fetching team data: %s' % exp)
                            print(traceback.format_exc())
                except Exception as exp:
                    print('[ESPNScraper] :: run() :: Got exception '
                          'while fetching team data: %s' % exp)
                    print(traceback.format_exc())
            sleep_scrapper('ESPNScraper')

        # Home page scraping
        base_url = 'https://www.espn.in/cricket/'
        print('[ESPNScraper] :: fetching data from base url:', base_url)
        ree = requests.get(base_url, headers=get_request_headers())
        if not ree.status_code == 200:
            print('[ESPNScraper] :: Failed to get '
                  'content of url: %s' % base_url)
            return
        html_doc = ree.content
        soup = BeautifulSoup(html_doc, 'html.parser')
        col2_feed = soup.find('section', class_='col-two contentFeed')
        for content in col2_feed.find_all("section", {"class": "contentItem"}):
            self.scrape_sports(content, cricketTeams_URLs)
        for contentfeed in col2_feed.find_all("article", {"class": "contentItem"}):
            self.scrape_sports(contentfeed, cricketTeams_URLs)

        # Infinite loading url for the home feed (same "&region" fix as above).
        infinite_url = (
            'https://onefeed.fan.api.espn.com/apis/v3/cached/contentEngine/'
            'oneFeed/leagues/cricket?source=ESPN.com%2B-%2BFAM&showfc=true'
            '&region=in&limit=15&lang=en&authorizedNetworks=espn_free'
            '&editionKey=espnin-en&device=desktop'
            '&pubkey=espncricinfo-en-in-cricket-index&isPremium=true'
            '&locale=in&featureFlags=expandAthlete&featureFlags=mmaGB&offset=')
        for i in range(10, 100, 15):
            scroll_url = infinite_url + str(i)
            print('[ESPNScraper] :: fetching data from infinite-scroll url:',
                  scroll_url)
            try:
                raw_json = requests.get(scroll_url,
                                        headers=get_request_headers()).text
                dataa = json.loads(raw_json)
                for data in dataa['feed']:
                    qw = data['data']['now'][0]
                    try:
                        # pick the "links" entry and follow its web href
                        result = [(key, value) for key, value in sorted(qw.items())
                                  if key.startswith("links")]
                        scroll_news_url = result[0][1]['web']['href']
                        print('FETCHING post from scroll url')
                        self.scrape_post_content(scroll_news_url)
                    except Exception as exp:
                        print('[ESPNScraper] :: run() :: Got exception '
                              'at scroll url: %s' % exp)
                        print(traceback.format_exc())
            except Exception as exp:
                print('[ESPNScraper] :: run() :: Got exception '
                      'at scroll url: %s' % exp)
                print(traceback.format_exc())
            sleep_scrapper('ESPNScraper')
    except Exception as exp:
        print('[ESPNScraper] :: run() :: Got exception: %s' % exp)
        print(traceback.format_exc())
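# The sorted(qw.items()) / startswith("links") dance in run() above is an
# indirect way of reaching item['links']['web']['href']. A minimal equivalent
# sketch; extract_web_href() is a hypothetical helper, and the payload shape is
# inferred only from the key prefix the original code matches, so treat it as
# an assumption about the oneFeed response.
def extract_web_href(item):
    """Return the article's web url from a oneFeed item dict, or None."""
    links = item.get('links')
    if isinstance(links, dict):
        return links.get('web', {}).get('href')
    return None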