# Imports shared by the scraper examples below. 'db' provides Mdb, as the
# Flask example further down shows; the helper functions used throughout
# (get_request_headers, sleep_scrapper, scraper_csv_write) come from a
# project utility module that is not shown here -- a sketch of them follows
# the first example.
import time
import traceback

import requests
from bs4 import BeautifulSoup

from db import Mdb

# The GoogleSearchListingsScraper example additionally uses:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from fake_useragent import UserAgent
import pandas as pd
class OverStockScraper:

    def __init__(self, product_category, product_code):
        self.mdb = Mdb()
        self.product_category = product_category
        self.product_code = product_code

    def run(self):
        url = ''
        try:
            base_url = 'https://www.overstock.com/Home-Garden/%s/%s/' \
                       % (self.product_category, self.product_code)
            suffix = 'subcat.html?page='
            for j in range(1, 100, 1):
                url = base_url + suffix + str(j)
                print '[OverStockScraper] :: fetching data from url:', url
                r = requests.get(url, headers=get_request_headers())

                if not r.status_code == 200:
                    print "[OverStockScraper] :: Failed to " \
                          "get content of url: %s" % url
                    return

                html_doc = r.content

                soup = BeautifulSoup(html_doc, 'html.parser')

                for div in soup.find_all('div', class_='product-tile'):
                    # print '---------div', div
                    self.scrap_result_row(div)
                    # break
                sleep_scrapper('OverStockScraper')
        except Exception as exp:
            print '[OverStockScraper] :: run() :: Got exception : ' \
                  '%s and fetching data from url: %s' % (exp, url)
            print(traceback.format_exc())

    def scrap_result_row(self, div):
        try:
            div = div.find('div', class_='product-info')
            sub_div = div.find('div', class_='product-price-wrapper')
            price = sub_div.find('div', class_='product-price-container')\
                .text.strip()
            print '[OverStockScraper] :: price: ', price
            title = div.find('div', class_='product-title').text.strip()
            print '[OverStockScraper] :: title: ', title
            rating = div.find('div', class_='product-footer')
            print '[OverStockScraper] :: rating: ', rating

            self.mdb.overstock_scraper_data(price, title, rating)

            fname = 'data_over_stock.csv'
            msg = "%s, %s, %s," % (price, title, rating)
            print "[OverStockScraper] :: scrap_result_row() :: msg:", msg
            scraper_csv_write(fname, msg)

        except Exception as exp:
            print '[OverStockScraper] :: scrap_result_row() :: ' \
                  'Got exception: %s' % exp
            print(traceback.format_exc())
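
All of the examples rely on three helper functions imported from elsewhere in the project. A minimal sketch of plausible implementations, assuming the originals do nothing more exotic than set a User-Agent header, pause between pages, and append to a CSV file:

import random
import time


def get_request_headers():
    # Assumption: the real helper likely rotates several User-Agent strings.
    return {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}


def sleep_scrapper(scraper_name):
    # Assumption: pause between pages so the target site is hit politely.
    delay = random.randint(5, 15)
    print('[%s] :: sleeping for %d seconds' % (scraper_name, delay))
    time.sleep(delay)


def scraper_csv_write(fname, msg):
    # Assumption: append one comma-separated record per scraped row.
    with open(fname, 'a') as f:
        f.write(msg + '\n')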
Example 3
class GoogleNewsScraper:
    def __init__(self):
        self.mdb = Mdb()

    def run(self):
        try:

            url = 'https://news.google.com/news/headlines/section/topic' \
                  '/NATION.en_in/India?ned=in&hl=en-IN&gl=IN'

            print '[GoogleNewsScraper] :: fetching data from url: ', url
            r = requests.get(url, headers=get_request_headers())
            if not r.status_code == 200:
                print "[GoogleNewsScraper] :: Failed to get " \
                        "content of url: %s" % url
                return
            html_doc = r.content

            soup = BeautifulSoup(html_doc, 'html.parser')
            # print '------soup', soup
            for div in soup.find_all('div', class_='v4IxVd'):
                # print '-----div', div
                self.scrap_result_row(div)
            sleep_scrapper('GoogleNewsScraper')
        except Exception as exp:
            print '[GoogleNewsScraper] :: run() :: Got exception: %s'\
                  % exp
            print(traceback.format_exc())

    def scrap_result_row(self, div):

        try:
            c_wiz = div.find('c-wiz', class_='M1Uqc kWyHVd')
            headlines = c_wiz.find('a', class_='nuEeue hzdq5d ME7ew')\
                .text.strip()
            print '[GoogleNewsScraper] :: HeadLines: ', headlines
            div = div.find('div', class_='alVsqf')
            sub = div.find('div', class_='jJzAOb')
            c_wiz = sub.find('c-wiz', class_='M1Uqc MLSuAf')
            a = c_wiz.find('a', class_='nuEeue hzdq5d ME7ew').text.strip()
            print '[GoogleNewsScraper] :: SubheadLines: ', a

            # save in data base
            self.mdb.google_news_data(headlines, a)

            fname = 'data_google_news.csv'
            msg = "%s, %s" % (headlines, a)
            print "[GoogleNewsScraper] :: scrap_result_row() :: msg:", msg
            scraper_csv_write(fname, msg)

        except Exception as exp:
            print '[GoogleNewsScraper] :: scrap_result_row() :: ' \
                  'Got exception : %s' % exp
            print(traceback.format_exc())
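
The examples never show their entry point; a minimal usage sketch, assuming the scraper is run directly:

if __name__ == '__main__':
    scraper = GoogleNewsScraper()
    scraper.run()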
Example 5
from flask import Flask, request, render_template, jsonify, session, url_for
from functools import wraps
# from flask.ext.bcrypt import Bcrypt
# from wtforms.fields import SelectField
from db import Mdb
# import jwt
# import datetime
import json
import traceback

app = Flask(__name__)
# bcrypt = Bcrypt(app)
mdb = Mdb()

app.config['secretkey'] = 'some-strong+secret#key'
app.secret_key = 'F12Zr47j\3yX R~X@H!jmM]Lwf/,?KT'


def sumSessionCounter():
    try:
        session['counter'] += 1
    except KeyError:
        session['counter'] = 1


def token_required(f):
    @wraps(f)
    def decorated(*args, **kwargs):
        token = request.args.get('token')

        if not token:
            # no token supplied: reject the request
            return jsonify({'message': 'Token is missing!'}), 403

        return f(*args, **kwargs)

    return decorated
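
A hedged sketch of how the decorator would typically be applied; the route path and message here are illustrative, not from the original app:

@app.route('/protected')
@token_required
def protected():
    sumSessionCounter()
    return jsonify({'message': 'token accepted',
                    'visits': session['counter']})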
class FlipkartScraper:
    def __init__(self, product):
        self.product = product
        self.mdb = Mdb()

    def run(self):

        try:
            base_url = 'https://www.flipkart.com/search?as=off&as-show=' \
                       'on&otracker=start&page='
            sufix = '&q=%s&viewType=list' % self.product

            for i in range(1, 100, 1):
                url = base_url + str(i) + sufix
                print '[FlipkartScraper] :: fetching data from url: ', url

                r = requests.get(url, headers=get_request_headers())
                if not r.status_code == 200:
                    print '[FlipkartScraper] :: Failed to get the content ' \
                          'of url: %s' % url
                    return
                html_doc = r.content

                soup = BeautifulSoup(html_doc, 'html.parser')
                # for div in soup.find_all('div', class_='col col-7-12'):
                for div in soup.find_all('div', class_='_1-2Iqu row'):
                    # print '---------------------div', div
                    self.scrap_result_row(div)
                sleep_scrapper('FlipkartScraper')
        except Exception as exp:
            print '[FlipkartScraper] :: run() :: Got exception: %s' % exp
            print(traceback.format_exc())

    def scrap_result_row(self, div):

        try:
            product_div = div.find('div', class_='col col-7-12')
            title = product_div.find('div', class_='_3wU53n').text.strip()
            print '[FlipkartScraper] :: title . . . . ..:', title
            # title_description = div.find('div', class_='OiPjke').text.strip()
            # print'[FlipkartScraper] :: title_description: ', title_description

            rating = div.find('div', class_='niH0FQ')
            sub_rating = rating.find('span', class_='_38sUEc').text.strip()
            print '[FlipkartScraper] :: rating . . . . .:', sub_rating

            specifications_div = div.find('div', class_='_3ULzGw')
            specifications = specifications_div.find(
                'ul', class_='vFw0gD').text.strip()
            print '[FlipkartScraper] :: specifications .: ', specifications

            product_price = div.find('div', class_='_6BWGkk')
            div_price = product_price.find('div', class_='_1uv9Cb')
            price = div_price.find('div',
                                   class_='_1vC4OE _2rQ-NK').text.strip()
            print '[FlipkartScraper] :: price . . . . . :', price

            self.mdb.flipkart_scraper_data(title, sub_rating, specifications,
                                           price)

            fname = 'data_flipkart.csv'
            msg = "%s, %s, %s, %s," % (title, sub_rating, specifications,
                                       price)
            print "[FlipkartScraper] :: scrap_result_row() :: msg:", msg
            scraper_csv_write(fname, msg)

        except Exception as exp:
            print '[FlipkartScraper] :: scrap_result_row() :: ' \
                  'Got exception: %s' % exp
            print(traceback.format_exc())
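
msg joins the fields with bare commas, so a comma inside a scraped title or price silently shifts the CSV columns. A safer write, sketched with the standard csv module (the helper name here is hypothetical):

import csv


def scraper_csv_write_quoted(fname, row):
    # row is a list of fields; csv.writer quotes embedded commas
    with open(fname, 'a') as f:
        csv.writer(f).writerow(row)

# e.g. scraper_csv_write_quoted('data_flipkart.csv',
#                               [title, sub_rating, specifications, price])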
class IndeedScrapper:
    def __init__(self, domain, pos, location):

        self.domain = domain.replace(" ", "+")
        self.post = pos.replace(" ", "+")
        self.location = location.replace(" ", "+")
        self.mdb = Mdb()

    def run(self):

        base_url = 'https://www.indeed.co%s/jobs?q=%s&l=%s&start=' % (
            self.domain, self.post, self.location)
        for j in range(0, 1000, 10):
            url = ''
            try:
                url = base_url + str(j)
                print '[IndeedScrapper] :: fetching data from url:', url
                r = requests.get(url, headers=get_request_headers())

                if not r.status_code == 200:
                    print "[IndeedScrapper] :: Failed to " \
                          "get content of url: %s" % url
                    return

                html_doc = r.content

                soup = BeautifulSoup(html_doc, 'html.parser')
                # print '----------soup', soup
                for div in soup.find_all('div'):
                    # skip divs that carry no class attribute
                    if 'class' not in div.attrs:
                        continue

                    cls = div.attrs['class']
                    if 'row' in cls and 'result' in cls:
                        self.scrap_result_row(div)
                sleep_scrapper('IndeedScraper')
            except Exception as exp:
                print '[IndeedScraper] :: run() :: Got exception : ' \
                      '%s and fetching data from url: %s' % (exp, url)

    def scrap_result_row(self, div):

        try:
            # title
            title = div.find('span', class_='company').text.strip()
            print "[IndeedScrapper] :: title: %s" % title

            # location
            span = div.find('span', class_='location')
            location = span.text.strip()
            print "[IndeedScrapper] :: location: %s" % location

            # salary
            sal = ''
            span = div.find('span', class_='no-wrap')
            if span:
                sal = span.text.strip()
                print "[IndeedScrapper] :: salary: %s" % sal

            # summary
            span = div.find('span', class_='summary')
            summary = span.text.strip()
            print "[IndeedScrapper] :: summery: %s" % summary

            self.mdb.indeed_scraper_data(title, location, sal, summary)

            fname = 'data_indeed.csv'
            msg = "%s, %s, %s, %s," % (title, location, sal, summary)
            print "[IndeedScrapper] :: scrap_result_row() :: msg:", msg
            scraper_csv_write(fname, msg)

        except Exception as exp:
            print '[IndeedScrapper] :: scrap_result_row() :: ' \
                  'Got exception : %s' % exp
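
run() gives up on the first non-200 response. A small retry-with-backoff wrapper, offered as an assumption about how transient failures could be absorbed (not part of the original code):

def get_with_retries(url, retries=3, backoff=5):
    # returns the response on success, or None after the last attempt
    for attempt in range(1, retries + 1):
        r = requests.get(url, headers=get_request_headers())
        if r.status_code == 200:
            return r
        print('[get_with_retries] :: attempt %d failed for %s' % (attempt, url))
        time.sleep(backoff * attempt)
    return None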
class HomeDepotScraper:

    def __init__(self, product):
        self.mdb = Mdb()
        self.product = product.replace(" ", "-")

    def run(self):

        base_url = 'https://www.homedepot.com/b/' \
                   '%s/N-5yc1vZbm79?Nao=' % (self.product)
        sufix = '&Ns=None'

        for j in range(0, 1000, 12):
            url = ''
            try:
                url = base_url + str(j) + sufix
                print '[HomeDepotScraper] :: fetching data from url: ', url
                r = requests.get(url, headers=get_request_headers())
                if not r.status_code == 200:
                    print "[HomeDepotScraper] :: Failed to get " \
                          "content of url: %s" % url
                    return
                html_doc = r.content

                soup = BeautifulSoup(html_doc, 'html.parser')

                for div in soup.find_all('div', class_='pod-inner'):
                    self.scrap_result_row(div)
                sleep_scrapper('HomeDepotScraper')
            except Exception as exp:
                print '[HomeDepotScraper] :: run() :: Got exception : ' \
                      '%s and fetching data from url: %s' % (exp, url)

    def scrap_result_row(self, div):

        try:
            # # name
            # name = div.find('div', class_='pod-plp__description js-
            # podclick-analytics')
            # a = name.find('a').strip()
            # print '[HomeDepotScraper] :: name: ', a

            # model
            model = div.find('div', class_='pod-plp__model').text.strip()
            print '[HomeDepotScraper] :: model: ', model

            # price
            price = div.find('div', class_='price').text.strip()
            print '[HomeDepotScraper] :: price: ', price

            # stock
            stock = div.find('div', class_='pod-plp__shipping-message__'
                                           'wrapper-boss-bopis').text.strip()
            print '[HomeDepotScraper] :: stock: ', stock

            self.mdb.homedepot_data(model, price, stock)

            fname = 'data_home_depot.csv'
            msg = "%s, %s, %s," % (model, price, stock)
            print "[HomeDepotScraper] :: scrap_result_row() :: msg:", msg
            scraper_csv_write(fname, msg)

        except Exception as exp:
            print '[HomeDepotScraper] :: scrap_result_row() :: ' \
                  'Got exception : %s' % exp
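
Every run() loop walks a fixed offset range even after the listing runs out. A sketch of stopping early instead, written against HomeDepotScraper's tile selector; treating an empty page as the end of results is an assumption:

def scrape_all_pages(scraper, base_url, step=12):
    for offset in range(0, 1000, step):
        r = requests.get(base_url + str(offset), headers=get_request_headers())
        if r.status_code != 200:
            break
        soup = BeautifulSoup(r.content, 'html.parser')
        divs = soup.find_all('div', class_='pod-inner')
        if not divs:
            # assumed: no product tiles means we are past the last page
            break
        for div in divs:
            scraper.scrap_result_row(div)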
class GoogleSearchListingsScraper:
    def __init__(self, keyword):
        self.keyword = keyword
        self.mdb = Mdb()

    def run(self):
        try:

            options = Options()
            options.add_argument("window-size=1400,600")
            ua = UserAgent()
            user_agent = ua.random

            options.add_argument(f'user-agent={user_agent}')
            driver = webdriver.Chrome(
                "C:/Users/Dell/Downloads/chromedriver_win32/chromedriver.exe",
                options=options)

            for i in range(0, 60, 10):
                suffix = '&q=%s' % self.keyword
                url = 'https://www.google.com/search?client=firefox-b-d&biw=1366&bih=654&sa=N&ved=0ahUKEwjfy8ugnZHkAhVLro8KHXq1Ar0Q8tMDCJMC&ei=zslbXd-sHcvcvgT66oroCw&start=' + str(
                    i) + suffix
                driver.get(url)
                html = driver.page_source

                # scrap_data(website_link,website_title,website_snippet)
                soup = BeautifulSoup(html, 'html.parser')
                # print('...........soup', soup.encode('utf-8'))
                for div in soup.find_all('div', class_='g'):
                    # print('---------------------div', div)
                    self.scrap_result_row(div)
                time.sleep(15)
                sleep_scrapper('GoogleSearchListingsScraper')

        except Exception as exp:
            print(
                '[GoogleSearchListingsScraper] :: run() :: Got exception: %s' %
                exp)
            print(traceback.format_exc())

    def scrap_result_row(self, div):
        try:
            website_link = []
            website_snippet = []
            website_title = []
            results = div.find("div", {"class": ""})
            for result in results:
                website_link.append(
                    result.find("div", {
                        "class": "title-bar-left"
                    }).get_text().strip())
                print('[GoogleSearchListingsScraper] :: address . . . . ..:',
                      website_link)
                website_title.append(
                    result.find("span",
                                {"class": "result-adress"}).get_text().strip())
                print('[GoogleSearchListingsScraper] :: title . . . . ..:',
                      website_title)
                website_snippet.append(
                    result.find("div", {
                        "class": "xl-desc"
                    }).get_text().strip())
                print(
                    '[GoogleSearchListingsScraper] :: description . . . . ..:',
                    website_snippet)

                self.mdb.google_listings_by_search_scrapper_data(
                    website_link, website_title, website_snippet)

                # header row of the CSV file is defined here
                df = pd.DataFrame({
                    "Title": website_title,
                    "Address": website_link,
                    "Description": website_snippet
                })
                df.to_csv("output.csv")
        except Exception as exp:
            print(
                '[GoogleSearchListingsScraper] :: run() :: Got exception: %s' %
                exp)
            print(traceback.format_exc())
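
run() never closes the Chrome driver, so every invocation leaks a browser process. The usual cleanup pattern, sketched as a standalone helper (the chromedriver path is the one the example already hard-codes):

def fetch_with_chrome(url, options):
    driver = webdriver.Chrome(
        "C:/Users/Dell/Downloads/chromedriver_win32/chromedriver.exe",
        options=options)
    try:
        driver.get(url)
        return driver.page_source
    finally:
        # quit() in finally so the browser exits even when scraping fails
        driver.quit()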
Example 13
class Python:
    def __init__(self):
        self.mdb = Mdb()

    def scrap_result_row(self, div):

        ################################
        #       Company title          #
        ################################
        title = div.find('span', class_='company').text.strip()
        print "company title: %s" % title

        ################################
        #       Company location       #
        ################################
        span = div.find('span', class_='location')
        location = span.text.strip()

        print "company Location: %s" % location


        ################################
        #       Company salary         #
        ################################
        salary = ''
        span = div.find('span', class_='no-wrap')
        if span:
            salary = span.text.strip()
            print "Salary: %s" % salary
        else:
            print "salary: %s" % span

        ################################
        #       Company summary        #
        ################################
        span = div.find('span', class_='summary')
        summary = span.text.strip()

        print "Summery: %s" % summary

        self.mdb.add_vacancy_python(title, location, salary, summary)

    def scrap_python_developer(self, url):
        print "\nScrapping python Developer: %s \n" % url

        r = requests.get(url, headers=get_request_headers())

        if not r.status_code == 200:
            print "Failed to get content of url: %s" % url
            return
        html_doc = r.content

        soup = BeautifulSoup(html_doc, 'html.parser')

        # parsing html content to get information about python developer jobs
        # for div in soup.find_all('div', class_='brdr'):
        for div in soup.find_all('div'):
            # skip divs that carry no class attribute
            if 'class' not in div.attrs:
                continue

            cls = div.attrs['class']
            if 'row' in cls and 'result' in cls:
                self.scrap_result_row(div)
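
A usage sketch; the URL shape matches what IndeedScrapper builds above, but the domain and query here are illustrative:

if __name__ == '__main__':
    scraper = Python()
    scraper.scrap_python_developer(
        'https://www.indeed.co.in/jobs?q=python+developer&l=Bangalore')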
class BedBathAndBeyondScraper:
    def __init__(self, product_category, product_subcategory, product_title,
                 product_code):
        self.mdb = Mdb()
        self.product_category = product_category
        self.product_subcategory = product_subcategory
        self.product_title = product_title
        self.product_code = product_code

    def run(self):
        try:

            url = 'https://www.bedbathandbeyond.com/store/category' \
                  '/%s/%s/%s/%s/' \
                  % (self.product_category, self.product_subcategory,
                     self.product_title, self.product_code)

            print '[BedBathAndBeyondScraper] :: fetching data from url: ', url
            r = requests.get(url, headers=get_request_headers())
            if not r.status_code == 200:
                print "[BedBathAndBeyondScraper] :: Failed to get " \
                        "content of url: %s" % url
                return
            html_doc = r.content

            soup = BeautifulSoup(html_doc, 'html.parser')

            for div in soup.find_all('div',
                                     class_='productContent ec_listing'):
                self.scrap_result_row(div)
            sleep_scrapper('BedBathAndBeyondScraper')
        except Exception as exp:
            print '[BedBathAndBeyondScraper] :: run() :: Got exception: %s'\
                  % exp
            print(traceback.format_exc())

    def scrap_result_row(self, div):

        try:
            div = div.find('div', class_='prodInfo')
            sub_div = div.find('div', class_='prodName')
            a = sub_div.find('a')
            print '[BedBathAndBeyondScraper] :: title: ', a.text.strip()
            div = div.find('div', class_='prodPrice')
            sub_div = div.find('div', class_='priceOfProduct')
            sub = sub_div.find('div', class_='isPrice')
            print '[BedBathAndBeyondScraper] :: price: ', sub.text.strip()

            self.mdb.bedbathandbeyond_scraper_data(a.text.strip(),
                                                   sub.text.strip())

            fname = 'data_bed_bath_and_beyond.csv'
            msg = "%s, %s," % (a.text.strip(), sub.text.strip())
            print "[BedBathAndBeyondScraper] :: scrap_result_row() :: " \
                  "msg:", msg
            scraper_csv_write(fname, msg)

        except Exception as exp:
            print '[BedBathAndBeyondScraper] :: scrap_result_row() :: ' \
                  'Got exception : %s' % exp
            print(traceback.format_exc())
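
None of the requests.get calls in these examples set a timeout, so one stalled response hangs the whole scraper. A hedged variant (the 10-second figure is an assumption):

def safe_get(url, timeout=10):
    # requests waits indefinitely unless a timeout is given
    try:
        return requests.get(url, headers=get_request_headers(),
                            timeout=timeout)
    except requests.exceptions.RequestException as exp:
        print('[safe_get] :: request failed for %s: %s' % (url, exp))
        return None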
class YellowPagesScraper:
    def __init__(self):
        self.mdb = Mdb()

    def run(self):
        base_url = 'https://www.yellowpages.com/search?search_terms=' \
                   'Dry%20Cleaners%20%26%20Laundries&geo_location_' \
                   'terms=New%20York%2C%20NY&page='

        for j in range(0, 1000, 1):
            try:
                url = base_url + str(j)
                print '[YellowPagesScraper] :: fetching data from url: ', url

                r = requests.get(url, headers=get_request_headers())
                if not r.status_code == 200:
                    print '[YellowPagesScraper] :: Failed to get the content ' \
                          'of url: %s' % url
                    return
                html_doc = r.content

                soup = BeautifulSoup(html_doc, 'html.parser')
                for div in soup.find_all('div', class_='info'):
                    self.scrap_result_row(div)
                sleep_scrapper('YellowPagesScraper')
            except Exception as exp:
                print '[YellowPagesScraper] :: run() :: Got exception: %s' % exp
                print(traceback.format_exc())

    def scrap_result_row(self, div):

        try:
            title = div.find('a', class_='business-name').text.strip()

            print "[YellowPagesScraper] :: title: %s" % title

            rating_count = 0
            span = div.find('span', class_='count')

            if span:
                span = span.text.strip()
                rating_count = span

                print "[YellowPagesScraper] :: rating_count: %s" % rating_count

            p = div.find('p', class_='adr')
            address = p.text
            print "[YellowPagesScraper] :: address: %s" % address

            phone = ''
            li = div.find('li', class_='phone primary')
            if li:
                phone = li.text.strip()
                print "[YellowPagesScraper] :: phone: %s" % phone
            else:
                print "[YellowPagesScraper] :: phone: %s" % li

            categories = ''
            cat_div = div.find('div', class_='categories')
            if cat_div:
                categories = cat_div.text.strip()
                print "[YellowPagesScraper] :: categories: %s" % categories
            else:
                print "[YellowPagesScraper] :: categories: %s" % cat_div

            self.mdb.yellowpages_scraper_data(title, rating_count, address,
                                              phone, categories)

            fname = 'data_yellow_pages.csv'
            msg = "%s, %s, %s, %s, %s" % (title, rating_count, address, phone,
                                          categories)
            print "[YellowPagesScraper] :: scrap_result_row() :: msg:", msg
            scraper_csv_write(fname, msg)

        except Exception as exp:
            print '[YellowPagesScraper] :: scrap_result_row() :: ' \
                  'Got exception: %s' % exp
            print(traceback.format_exc())
Example 17
class YelpScraper:
    def __init__(self, product, location):
        self.product = product.replace(" ", "+")
        self.location = location.replace(" ", "+")
        self.mdb = Mdb()

    def run(self):

        base_url = "https://www.yelp.com/search?find_desc=%s&find_loc=%s,+NY&start=" % (
            self.product, self.location)

        for j in range(1, 1000, 10):
            try:
                url = base_url + str(j)
                print '[YelpScraper] :: fetching data from url: ', url
                r = requests.get(url, headers=get_request_headers())

                if not r.status_code == 200:
                    print '[YelpScraper] :: Failed to get content of url: %s' % url
                    return

                html_doc = r.content
                soup = BeautifulSoup(html_doc, 'html.parser')

                for li in soup.find_all('li', class_='regular-search-result'):
                    self.scrap_row_yelp(li)
                sleep_scrapper('YelpScraper')
            except Exception as exp:
                print '[YelpScraper] :: run() :: Got exception: %s' % exp
                print(traceback.format_exc())

    def scrap_row_yelp(self, li):
        try:
            h3 = li.find('h3', class_='search-result-title')

            # Getting title: the business name is the second of two spans
            title = ''
            spans = h3.find_all('span')
            if len(spans) == 2:
                title = spans[1].text.strip()

            print "[YelpScraper] :: title: %s" % title

            # Getting reviews count
            reviews_count = 0
            span = li.find('span', class_='review-count rating-qualifier')
            text = span.text
            lst = text.split()
            reviews_count = int(lst[0])

            print "[YelpScraper] :: reviews count: %d" % reviews_count

            # Getting services
            services = []
            span = li.find('span', class_='category-str-list')
            text = span.text
            lst = text.split(',')
            services = [itm.strip() for itm in lst]

            print "[YelpScraper] :: services: %s" % services

            # Getting address
            address = li.find('address').text.strip()

            print "[YelpScraper] :: address: %s" % address

            # Getting phone
            phone = li.find('span', class_='biz-phone').text.strip()

            print "[YelpScraper] :: phone: %s" % phone

            # Getting snippet
            p = li.find('p', class_='snippet').text.strip()
            lst = p.split('read more')
            snippet = lst[0].strip()
            print "[YelpScraper] :: snippet: %s" % snippet

            self.mdb.yelp_scraper_data(title, reviews_count, services, address,
                                       phone, snippet)

            fname = 'data_yelp.csv'
            msg = "%s, %s, %s, %s, %s, %s" % (title, reviews_count, services,
                                              address, phone, snippet)
            print "[IndeedScrapper] :: scrap_result_row() :: CSV file Msg: ", msg
            scraper_csv_write(fname, msg)

        except Exception as exp:
            print '[YelpScraper] :: scrap_row_yelp() :: Got exception: %s' % exp
            print(traceback.format_exc())
class YellowPagesScraper:

    def __init__(self):
        self.mdb = Mdb()

    def run(self):
        base_url = 'https://www.yellowpages.com/search?search_terms=software+company&geo_location_terms=New+York%2C+NY&page='

        for j in range(27, 100, 1):
            try:
                url = base_url + str(j)
                print ('[YellowPagesScraper] :: fetching data from url: ', url)

                r = requests.get(url, headers=get_request_headers())
                if not r.status_code == 200:
                    print ('[YellowPagesScraper] :: Failed to get the content ' \
                          'of url: %s' % url)
                    return
                html_doc = r.content

                soup = BeautifulSoup(html_doc, 'html.parser')
                for div in soup.find_all('div', class_='info'):
                    self.scrap_result_row(div)
                sleep_scrapper('YellowPagesScraper')
            except Exception as exp:
                print ('[YellowPagesScraper] :: run() :: Got exception: %s' % exp)
                print(traceback.format_exc())

    def scrap_result_row(self, div):

        try:
            # get the title of the company
            h2 = div.find('h2', class_='n')
            title = h2.find('a', class_='business-name').text.strip()
            print ("[YellowPagesScraper] :: title: %s" % title)

            # get the rating count of the company
            rating_count = 0
            span = div.find('span', class_='count')
            if span:
                span = span.text.strip()
                rating_count = span
                print ("[YellowPagesScraper] :: rating_count: %s" % rating_count)

            # get the address of the company
            address = ''
            p = div.find('p', class_='adr')
            street_address = p.find('div', class_='street-address')
            if street_address:
                str_adrs = street_address.text.strip()
                locality = p.find('div', class_='locality').text.strip()
                address = str_adrs + locality
                print ("[YellowPagesScraper] :: address: %s" % address)
            # get the contact number of the company
            phone = ''
            li = div.find('div', class_='phones phone primary')
            if li:
                phone = li.text.strip()
                print ("[YellowPagesScraper] :: phone: %s" % phone)
            else:
                print ("[YellowPagesScraper] :: phone: %s" % li)
            # get the categories of the company
            categories = ''
            cat_div = div.find('div', class_='categories')
            category = cat_div.find('a')
            if category:
                categories = cat_div.text.strip()
                print ("[YellowPagesScraper] :: categories: %s" % categories)
            else:
                print ("[YellowPagesScraper] :: categories: %s" % cat_div)
            # get the website of the company
            div22 = div.find('div', class_='info-section info-primary')
            webpage_link = div22.find('a', {'class': 'track-visit-website'})
            if webpage_link:
                website_link = webpage_link['href']
                print('[YellowPagesScraper] :: webpage_link: %s' % website_link)
            else:
                website_link = ''

            self.mdb.yellowpages_scrapper_data(title, rating_count, address,
                                               phone, categories, website_link)

            fname = 'data_yellow_pages.csv'
            msg = "%s, %s, %s, %s, %s, %s" % (title, rating_count, address,
                                              phone, categories, website_link)
            # print ("[YellowPagesScraper] :: scrap_result_row() :: msg:", msg)
            scraper_csv_write(fname, msg)

        except Exception as exp:
            print ('[YellowPagesScraper] :: scrap_result_row() :: '
                   'Got exception: %s' % exp)
            print(traceback.format_exc())
Example 20
    def __init__(self):
        self.mdb = Mdb()
        self.python = Python()
        self.android = Android()
        self.php = Php()