class OverStockScraper:
    def __init__(self, product_category, product_code):
        self.mdb = Mdb()
        self.product_category = product_category
        self.product_code = product_code

    def run(self):
        url = ''
        try:
            base_url = 'https://www.overstock.com/Home-Garden/%s/%s/' \
                % (self.product_category, self.product_code)
            suffix = 'subcat.html?page='
            for j in range(1, 100):
                url = base_url + suffix + str(j)
                print '[OverStockScraper] :: fetching data from url:', url
                r = requests.get(url, headers=get_request_headers())
                if not r.status_code == 200:
                    print "[OverStockScraper] :: Failed to " \
                          "get content of url: %s" % url
                    return
                html_doc = r.content
                soup = BeautifulSoup(html_doc, 'html.parser')
                for div in soup.find_all('div', class_='product-tile'):
                    self.scrap_result_row(div)
                sleep_scrapper('OverStockScraper')
        except Exception as exp:
            print '[OverStockScraper] :: run() :: Got exception: ' \
                  '%s and fetching data from url: %s' % (exp, url)
            print(traceback.format_exc())

    def scrap_result_row(self, div):
        try:
            div = div.find('div', class_='product-info')
            sub_div = div.find('div', class_='product-price-wrapper')
            price = sub_div.find('div', class_='product-price-container') \
                .text.strip()
            print '[OverStockScraper] :: price: ', price
            title = div.find('div', class_='product-title').text.strip()
            print '[OverStockScraper] :: title: ', title
            # take the footer's text rather than the raw tag, since this
            # value goes into the database and the CSV row
            rating_div = div.find('div', class_='product-footer')
            rating = rating_div.text.strip() if rating_div else ''
            print '[OverStockScraper] :: rating: ', rating
            self.mdb.overstock_scraper_data(price, title, rating)
            fname = 'data_over_stock.csv'
            msg = "%s, %s, %s," % (price, title, rating)
            print "[OverStockScraper] :: scrap_result_row() :: msg:", msg
            scraper_csv_write(fname, msg)
        except Exception as exp:
            print '[OverStockScraper] :: scrap_result_row() :: ' \
                  'Got exception: %s' % exp
            print(traceback.format_exc())
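# Usage sketch (the values are hypothetical placeholders, not from the
# source): the category slug and numeric code come from an overstock.com
# subcategory URL such as /Home-Garden/Area-Rugs/244/subcat.html.
if __name__ == '__main__':
    OverStockScraper('Area-Rugs', '244').run()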
class GoogleNewsScraper:
    def __init__(self):
        self.mdb = Mdb()

    def run(self):
        try:
            url = 'https://news.google.com/news/headlines/section/topic' \
                  '/NATION.en_in/India?ned=in&hl=en-IN&gl=IN'
            print '[GoogleNewsScraper] :: fetching data from url: ', url
            r = requests.get(url, headers=get_request_headers())
            if not r.status_code == 200:
                print "[GoogleNewsScraper] :: Failed to get " \
                      "content of url: %s" % url
                return
            html_doc = r.content
            soup = BeautifulSoup(html_doc, 'html.parser')
            for div in soup.find_all('div', class_='v4IxVd'):
                self.scrap_result_row(div)
            sleep_scrapper('GoogleNewsScraper')
        except Exception as exp:
            print '[GoogleNewsScraper] :: run() :: Got exception: %s' \
                % exp
            print(traceback.format_exc())

    def scrap_result_row(self, div):
        try:
            c_wiz = div.find('c-wiz', class_='M1Uqc kWyHVd')
            headlines = c_wiz.find('a', class_='nuEeue hzdq5d ME7ew') \
                .text.strip()
            print '[GoogleNewsScraper] :: headlines: ', headlines
            div = div.find('div', class_='alVsqf')
            sub = div.find('div', class_='jJzAOb')
            c_wiz = sub.find('c-wiz', class_='M1Uqc MLSuAf')
            subheadline = c_wiz.find('a', class_='nuEeue hzdq5d ME7ew') \
                .text.strip()
            print '[GoogleNewsScraper] :: subheadline: ', subheadline
            # save in database
            self.mdb.google_news_data(headlines, subheadline)
            fname = 'data_google_news.csv'
            msg = "%s, %s" % (headlines, subheadline)
            print "[GoogleNewsScraper] :: scrap_result_row() :: msg:", msg
            scraper_csv_write(fname, msg)
        except Exception as exp:
            print '[GoogleNewsScraper] :: scrap_result_row() :: ' \
                  'Got exception: %s' % exp
            print(traceback.format_exc())
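# Usage sketch: the scraper takes no arguments, since the topic URL is
# fixed in run(). Note that class names like 'v4IxVd' and 'M1Uqc kWyHVd'
# are obfuscated and rotate when Google ships a new front end, so the
# selectors in scrap_result_row() need periodic refreshing.
if __name__ == '__main__':
    GoogleNewsScraper().run()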
from flask import Flask, request, render_template, jsonify, session, url_for
from functools import wraps

from db import Mdb

import json
import traceback

app = Flask(__name__)
mdb = Mdb()

app.config['secretkey'] = 'some-strong+secret#key'
app.secret_key = 'F12Zr47j\3yX R~X@H!jmM]Lwf/,?KT'


def sumSessionCounter():
    try:
        session['counter'] += 1
    except KeyError:
        session['counter'] = 1


def token_required(f):
    @wraps(f)
    def decorated(*args, **kwargs):
        token = request.args.get('token')
        if not token:
            # the source is truncated here; rejecting the request with a
            # 403 is the conventional completion for a missing token
            return jsonify({'message': 'Token is missing!'}), 403
        return f(*args, **kwargs)
    return decorated
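# A minimal usage sketch (assumed, not from the source): a route guarded
# by token_required; the '/protected' path and the response body are
# hypothetical.
@app.route('/protected')
@token_required
def protected():
    sumSessionCounter()
    return jsonify({'message': 'token accepted',
                    'counter': session['counter']})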
class FlipkartScraper:
    def __init__(self, product):
        self.product = product
        self.mdb = Mdb()

    def run(self):
        try:
            base_url = 'https://www.flipkart.com/search?as=off&as-show=' \
                       'on&otracker=start&page='
            suffix = '&q=%s&viewType=list' % self.product
            for i in range(1, 100):
                url = base_url + str(i) + suffix
                print '[FlipkartScraper] :: fetching data from url: ', url
                r = requests.get(url, headers=get_request_headers())
                if not r.status_code == 200:
                    print '[FlipkartScraper] :: Failed to get the content ' \
                          'of url: %s' % url
                    return
                html_doc = r.content
                soup = BeautifulSoup(html_doc, 'html.parser')
                for div in soup.find_all('div', class_='_1-2Iqu row'):
                    self.scrap_result_row(div)
                sleep_scrapper('FlipkartScraper')
        except Exception as exp:
            print '[FlipkartScraper] :: run() :: Got exception: %s' % exp
            print(traceback.format_exc())

    def scrap_result_row(self, div):
        try:
            product_div = div.find('div', class_='col col-7-12')
            title = product_div.find('div', class_='_3wU53n').text.strip()
            print '[FlipkartScraper] :: title . . . . ..:', title
            rating = div.find('div', class_='niH0FQ')
            sub_rating = rating.find('span', class_='_38sUEc').text.strip()
            print '[FlipkartScraper] :: rating . . . . .:', sub_rating
            specifications_div = div.find('div', class_='_3ULzGw')
            specifications = specifications_div.find(
                'ul', class_='vFw0gD').text.strip()
            print '[FlipkartScraper] :: specifications .: ', specifications
            product_price = div.find('div', class_='_6BWGkk')
            div_price = product_price.find('div', class_='_1uv9Cb')
            price = div_price.find('div', class_='_1vC4OE _2rQ-NK') \
                .text.strip()
            print '[FlipkartScraper] :: price . . . . . :', price
            self.mdb.flipkart_scraper_data(title, sub_rating,
                                           specifications, price)
            fname = 'data_flipkart.csv'
            msg = "%s, %s, %s, %s," % (title, sub_rating, specifications,
                                       price)
            print "[FlipkartScraper] :: scrap_result_row() :: msg:", msg
            scraper_csv_write(fname, msg)
        except Exception as exp:
            print '[FlipkartScraper] :: scrap_result_row() :: ' \
                  'Got exception: %s' % exp
            print(traceback.format_exc())
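# Usage sketch (the query string is a hypothetical example):
if __name__ == '__main__':
    FlipkartScraper('laptop').run()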
class IndeedScrapper:
    def __init__(self, domain, pos, location):
        self.domain = domain.replace(" ", "+")
        self.post = pos.replace(" ", "+")
        self.location = location.replace(" ", "+")
        self.mdb = Mdb()

    def run(self):
        base_url = 'https://www.indeed.co%s/jobs?q=%s&l=%s&start=' % (
            self.domain, self.post, self.location)
        for j in range(0, 1000, 10):
            url = ''
            try:
                url = base_url + str(j)
                print '[IndeedScrapper] :: fetching data from url:', url
                r = requests.get(url, headers=get_request_headers())
                if not r.status_code == 200:
                    print "[IndeedScrapper] :: Failed to " \
                          "get content of url: %s" % url
                    return
                html_doc = r.content
                soup = BeautifulSoup(html_doc, 'html.parser')
                for div in soup.find_all('div'):
                    # skip divs without classes
                    if 'class' not in div.attrs:
                        continue
                    cls = div.attrs['class']
                    if 'row' in cls and 'result' in cls:
                        self.scrap_result_row(div)
                sleep_scrapper('IndeedScrapper')
            except Exception as exp:
                print '[IndeedScrapper] :: run() :: Got exception: ' \
                      '%s and fetching data from url: %s' % (exp, url)

    def scrap_result_row(self, div):
        try:
            # title
            title = div.find('span', class_='company').text.strip()
            print "[IndeedScrapper] :: title: %s" % title

            # location
            span = div.find('span', class_='location')
            location = span.text.strip()
            print "[IndeedScrapper] :: location: %s" % location

            # salary
            sal = ''
            span = div.find('span', class_='no-wrap')
            if span:
                sal = span.text.strip()
            print "[IndeedScrapper] :: salary: %s" % sal

            # summary
            span = div.find('span', class_='summary')
            summary = span.text.strip()
            print "[IndeedScrapper] :: summary: %s" % summary

            self.mdb.indeed_scraper_data(title, location, sal, summary)
            fname = 'data_indeed.csv'
            msg = "%s, %s, %s, %s," % (title, location, sal, summary)
            print "[IndeedScrapper] :: scrap_result_row() :: msg:", msg
            scraper_csv_write(fname, msg)
        except Exception as exp:
            print '[IndeedScrapper] :: scrap_result_row() :: ' \
                  'Got exception: %s' % exp
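# Usage sketch (all three arguments are hypothetical examples): `domain`
# is the suffix appended to 'indeed.co', so '.in' targets indeed.co.in
# and 'm' would target indeed.com; spaces are turned into '+' for the
# query string.
if __name__ == '__main__':
    IndeedScrapper('.in', 'python developer', 'bangalore').run()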
class HomeDepotScraper:
    def __init__(self, product):
        self.mdb = Mdb()
        self.product = product.replace(" ", "-")

    def run(self):
        base_url = 'https://www.homedepot.com/b/' \
                   '%s/N-5yc1vZbm79?Nao=' % self.product
        suffix = '&Ns=None'
        for j in range(0, 1000, 12):
            url = ''
            try:
                url = base_url + str(j) + suffix
                print '[HomeDepotScraper] :: fetching data from url: ', url
                r = requests.get(url, headers=get_request_headers())
                if not r.status_code == 200:
                    print "[HomeDepotScraper] :: Failed to get " \
                          "content of url: %s" % url
                    return
                html_doc = r.content
                soup = BeautifulSoup(html_doc, 'html.parser')
                for div in soup.find_all('div', class_='pod-inner'):
                    self.scrap_result_row(div)
                sleep_scrapper('HomeDepotScraper')
            except Exception as exp:
                print '[HomeDepotScraper] :: run() :: Got exception: ' \
                      '%s and fetching data from url: %s' % (exp, url)

    def scrap_result_row(self, div):
        try:
            # model
            model = div.find('div', class_='pod-plp__model').text.strip()
            print '[HomeDepotScraper] :: model: ', model

            # price
            price = div.find('div', class_='price').text.strip()
            print '[HomeDepotScraper] :: price: ', price

            # stock
            stock = div.find('div', class_='pod-plp__shipping-message__'
                                           'wrapper-boss-bopis').text.strip()
            print '[HomeDepotScraper] :: stock: ', stock

            self.mdb.homedepot_data(model, price, stock)
            fname = 'data_home_depot.csv'
            msg = "%s, %s, %s," % (model, price, stock)
            print "[HomeDepotScraper] :: scrap_result_row() :: msg:", msg
            scraper_csv_write(fname, msg)
        except Exception as exp:
            print '[HomeDepotScraper] :: scrap_result_row() :: ' \
                  'Got exception: %s' % exp
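# Usage sketch (the product value is a hypothetical example). Note that
# the N-5yc1vZbm79 navigation token in base_url is hard-coded, so the
# product slug must belong to that category for the listing to resolve.
if __name__ == '__main__':
    HomeDepotScraper('area rugs').run()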
class GoogleSearchListingsScraper:
    def __init__(self, keyword):
        self.keyword = keyword
        self.mdb = Mdb()

    def run(self):
        try:
            options = Options()
            options.add_argument("window-size=1400,600")
            ua = UserAgent()
            user_agent = ua.random
            options.add_argument(f'user-agent={user_agent}')
            # machine-specific chromedriver path
            driver = webdriver.Chrome(
                "C:/Users/Dell/Downloads/chromedriver_win32/chromedriver.exe",
                options=options)
            for i in range(0, 60, 10):
                suffix = '&q=%s' % self.keyword
                url = ('https://www.google.com/search?client=firefox-b-d'
                       '&biw=1366&bih=654&sa=N&ved=0ahUKEwjfy8ugnZHkAhVLro8'
                       'KHXq1Ar0Q8tMDCJMC&ei=zslbXd-sHcvcvgT66oroCw&start='
                       + str(i) + suffix)
                driver.get(url)
                html = driver.page_source
                soup = BeautifulSoup(html, 'html.parser')
                for div in soup.find_all('div', class_='g'):
                    self.scrap_result_row(div)
                time.sleep(15)
                sleep_scrapper('GoogleSearchListingsScraper')
            driver.quit()
        except Exception as exp:
            print('[GoogleSearchListingsScraper] :: run() :: '
                  'Got exception: %s' % exp)
            print(traceback.format_exc())

    def scrap_result_row(self, div):
        try:
            website_link = []
            website_snippet = []
            website_title = []
            results = div.find("div", {"class": ""})
            for result in results:
                website_link.append(
                    result.find("div", {"class": "title-bar-left"})
                    .get_text().strip())
                print('[GoogleSearchListingsScraper] :: address . . :',
                      website_link)
                website_title.append(
                    result.find("span", {"class": "result-adress"})
                    .get_text().strip())
                print('[GoogleSearchListingsScraper] :: title . . . :',
                      website_title)
                website_snippet.append(
                    result.find("div", {"class": "xl-desc"})
                    .get_text().strip())
                print('[GoogleSearchListingsScraper] :: description :',
                      website_snippet)
            self.mdb.google_listings_by_search_scrapper_data(
                website_link, website_title, website_snippet)
            # header row of the CSV file is defined here
            df = pd.DataFrame({
                "Title": website_title,
                "Address": website_link,
                "Description": website_snippet
            })
            df.to_csv("output.csv")
        except Exception as exp:
            print('[GoogleSearchListingsScraper] :: scrap_result_row() :: '
                  'Got exception: %s' % exp)
            print(traceback.format_exc())
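# Usage sketch (the keyword is a hypothetical example); the chromedriver
# path in run() is machine-specific and will need adjusting.
if __name__ == '__main__':
    GoogleSearchListingsScraper('web scraping').run()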
class Python:
    def __init__(self):
        self.mdb = Mdb()

    def scrap_result_row(self, div):
        ################################
        #        Company title         #
        ################################
        title = div.find('span', class_='company').text.strip()
        print "company title: %s" % title

        ################################
        #       Company location       #
        ################################
        span = div.find('span', class_='location')
        location = span.text.strip()
        print "company location: %s" % location

        ################################
        #        Company salary        #
        ################################
        salary = ''
        span = div.find('span', class_='no-wrap')
        if span:
            salary = span.text.strip()
            print "salary: %s" % salary
        else:
            print "salary: %s" % span

        ################################
        #       Company summary        #
        ################################
        span = div.find('span', class_='summary')
        summary = span.text.strip()
        print "summary: %s" % summary

        self.mdb.add_vacancy_python(title, location, salary, summary)

    def scrap_python_developer(self, url):
        print "\nScraping python developer: %s \n" % url
        r = requests.get(url, headers=get_request_headers())
        if not r.status_code == 200:
            print "Failed to get content of url: %s" % url
            return
        html_doc = r.content
        soup = BeautifulSoup(html_doc, 'html.parser')
        # parse the html content to get information about python
        # developer vacancies
        for div in soup.find_all('div'):
            # skip divs without classes
            if 'class' not in div.attrs:
                continue
            cls = div.attrs['class']
            if 'row' in cls and 'result' in cls:
                self.scrap_result_row(div)
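# Usage sketch (the URL is a hypothetical indeed.com query):
if __name__ == '__main__':
    Python().scrap_python_developer(
        'https://www.indeed.com/jobs?q=python+developer&l=New+York')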
class BedBathAndBeyondScraper:
    def __init__(self, product_category, product_subcategory,
                 product_title, product_code):
        self.mdb = Mdb()
        self.product_category = product_category
        self.product_subcategory = product_subcategory
        self.product_title = product_title
        self.product_code = product_code

    def run(self):
        try:
            url = 'https://www.bedbathandbeyond.com/store/category' \
                  '/%s/%s/%s/%s/' \
                  % (self.product_category, self.product_subcategory,
                     self.product_title, self.product_code)
            print '[BedBathAndBeyondScraper] :: fetching data from url: ', url
            r = requests.get(url, headers=get_request_headers())
            if not r.status_code == 200:
                print "[BedBathAndBeyondScraper] :: Failed to get " \
                      "content of url: %s" % url
                return
            html_doc = r.content
            soup = BeautifulSoup(html_doc, 'html.parser')
            for div in soup.find_all('div',
                                     class_='productContent ec_listing'):
                self.scrap_result_row(div)
            sleep_scrapper('BedBathAndBeyondScraper')
        except Exception as exp:
            print '[BedBathAndBeyondScraper] :: run() :: Got exception: %s' \
                % exp
            print(traceback.format_exc())

    def scrap_result_row(self, div):
        try:
            div = div.find('div', class_='prodInfo')
            sub_div = div.find('div', class_='prodName')
            title = sub_div.find('a').text.strip()
            print '[BedBathAndBeyondScraper] :: title: ', title
            div = div.find('div', class_='prodPrice')
            sub_div = div.find('div', class_='priceOfProduct')
            price = sub_div.find('div', class_='isPrice').text.strip()
            print '[BedBathAndBeyondScraper] :: price: ', price
            self.mdb.bedbathandbeyond_scraper_data(title, price)
            fname = 'data_bed_bath_and_beyond.csv'
            msg = "%s, %s," % (title, price)
            print "[BedBathAndBeyondScraper] :: scrap_result_row() :: " \
                  "msg:", msg
            scraper_csv_write(fname, msg)
        except Exception as exp:
            print '[BedBathAndBeyondScraper] :: scrap_result_row() :: ' \
                  'Got exception: %s' % exp
            print(traceback.format_exc())
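# Usage sketch: the four arguments mirror the path segments of a
# bedbathandbeyond.com category URL (the values shown are hypothetical):
if __name__ == '__main__':
    BedBathAndBeyondScraper('bedding', 'sheets', 'bed-sheets',
                            '12345').run()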
class YellowPagesScraper:
    def __init__(self):
        self.mdb = Mdb()

    def run(self):
        base_url = 'https://www.yellowpages.com/search?search_terms=' \
                   'Dry%20Cleaners%20%26%20Laundries&geo_location_' \
                   'terms=New%20York%2C%20NY&page='
        for j in range(0, 1000):
            try:
                url = base_url + str(j)
                print '[YellowPagesScraper] :: fetching data from url: ', url
                r = requests.get(url, headers=get_request_headers())
                if not r.status_code == 200:
                    print '[YellowPagesScraper] :: Failed to get the ' \
                          'content of url: %s' % url
                    return
                html_doc = r.content
                soup = BeautifulSoup(html_doc, 'html.parser')
                for div in soup.find_all('div', class_='info'):
                    self.scrap_result_row(div)
                sleep_scrapper('YellowPagesScraper')
            except Exception as exp:
                print '[YellowPagesScraper] :: run() :: ' \
                      'Got exception: %s' % exp
                print(traceback.format_exc())

    def scrap_result_row(self, div):
        try:
            title = div.find('a', class_='business-name').text.strip()
            print "[YellowPagesScraper] :: title: %s" % title

            rating_count = 0
            span = div.find('span', class_='count')
            if span:
                rating_count = span.text.strip()
            print "[YellowPagesScraper] :: rating_count: %s" % rating_count

            p = div.find('p', class_='adr')
            address = p.text
            print "[YellowPagesScraper] :: address: %s" % address

            phone = ''
            li = div.find('li', class_='phone primary')
            if li:
                phone = li.text.strip()
                print "[YellowPagesScraper] :: phone: %s" % phone
            else:
                print "[YellowPagesScraper] :: phone: %s" % li

            categories = ''
            cat_div = div.find('div', class_='categories')
            if cat_div:
                categories = cat_div.text.strip()
                print "[YellowPagesScraper] :: categories: %s" % categories
            else:
                print "[YellowPagesScraper] :: categories: %s" % cat_div

            self.mdb.yellowpages_scraper_data(title, rating_count, address,
                                              phone, categories)
            fname = 'data_yellow_pages.csv'
            msg = "%s, %s, %s, %s, %s" % (title, rating_count, address,
                                          phone, categories)
            print "[YellowPagesScraper] :: scrap_result_row() :: msg:", msg
            scraper_csv_write(fname, msg)
        except Exception as exp:
            print '[YellowPagesScraper] :: scrap_result_row() :: ' \
                  'Got exception: %s' % exp
            print(traceback.format_exc())
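# Usage sketch: the search terms are hard-coded in base_url (dry cleaners
# and laundries in New York), so the scraper runs without arguments.
if __name__ == '__main__':
    YellowPagesScraper().run()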
class YelpScraper:
    def __init__(self, product, location):
        self.product = product.replace(" ", "+")
        self.location = location.replace(" ", "+")
        self.mdb = Mdb()

    def run(self):
        base_url = 'https://www.yelp.com/search?find_desc=%s' \
                   '&find_loc=%s,+NY&start=' % (self.product, self.location)
        for j in range(1, 1000, 10):
            try:
                url = base_url + str(j)
                print '[YelpScraper] :: fetching data from url: ', url
                r = requests.get(url, headers=get_request_headers())
                if not r.status_code == 200:
                    print '[YelpScraper] :: Failed to get content ' \
                          'of url: %s' % url
                    return
                html_doc = r.content
                soup = BeautifulSoup(html_doc, 'html.parser')
                for li in soup.find_all('li',
                                        class_='regular-search-result'):
                    self.scrap_row_yelp(li)
                sleep_scrapper('YelpScraper')
            except Exception as exp:
                print '[YelpScraper] :: run() :: Got exception: %s' % exp
                print(traceback.format_exc())

    def scrap_row_yelp(self, li):
        try:
            h3 = li.find('h3', class_='search-result-title')

            # getting title: the business name sits in the second span of
            # the result heading
            title = ''
            spans = h3.find_all('span')
            i = 0
            for span in spans:
                i += 1
                if i == 2:
                    title = span.text.strip()
            print "[YelpScraper] :: title: %s" % title

            # getting reviews count
            span = li.find('span', class_='review-count rating-qualifier')
            lst = span.text.split()
            reviews_count = int(lst[0])
            print "[YelpScraper] :: reviews count: %d" % reviews_count

            # getting services
            span = li.find('span', class_='category-str-list')
            lst = span.text.split(',')
            services = [itm.strip() for itm in lst]
            print "[YelpScraper] :: services: %s" % services

            # getting address
            address = li.find('address').text.strip()
            print "[YelpScraper] :: address: %s" % address

            # getting phone
            phone = li.find('span', class_='biz-phone').text.strip()
            print "[YelpScraper] :: phone: %s" % phone

            # getting snippet
            p = li.find('p', class_='snippet').text.strip()
            lst = p.split('read more')
            snippet = lst[0].strip()
            print "[YelpScraper] :: snippet: %s" % snippet

            self.mdb.yelp_scraper_data(title, reviews_count, services,
                                       address, phone, snippet)
            fname = 'data_yelp.csv'
            msg = "%s, %s, %s, %s, %s, %s" % (title, reviews_count, services,
                                              address, phone, snippet)
            print "[YelpScraper] :: scrap_row_yelp() :: CSV file msg: ", msg
            scraper_csv_write(fname, msg)
        except Exception as exp:
            print '[YelpScraper] :: scrap_row_yelp() :: ' \
                  'Got exception: %s' % exp
            print(traceback.format_exc())
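# Usage sketch (both arguments are hypothetical examples); note that
# run() hard-codes ',+NY' after the location:
if __name__ == '__main__':
    YelpScraper('dry cleaners', 'New York').run()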
class YellowPagesScraper:
    def __init__(self):
        self.mdb = Mdb()

    def run(self):
        base_url = 'https://www.yellowpages.com/search?search_terms=' \
                   'software+company&geo_location_terms=New+York%2C+NY&page='
        for j in range(27, 100):
            try:
                url = base_url + str(j)
                print('[YellowPagesScraper] :: fetching data from url: ',
                      url)
                r = requests.get(url, headers=get_request_headers())
                if not r.status_code == 200:
                    print('[YellowPagesScraper] :: Failed to get the '
                          'content of url: %s' % url)
                    return
                html_doc = r.content
                soup = BeautifulSoup(html_doc, 'html.parser')
                for div in soup.find_all('div', class_='info'):
                    self.scrap_result_row(div)
                sleep_scrapper('YellowPagesScraper')
            except Exception as exp:
                print('[YellowPagesScraper] :: run() :: '
                      'Got exception: %s' % exp)
                print(traceback.format_exc())

    def scrap_result_row(self, div):
        try:
            # get the title of the company
            h2 = div.find('h2', class_='n')
            title = h2.find('a', class_='business-name').text.strip()
            print("[YellowPagesScraper] :: title: %s" % title)

            # get the rating_count of the company
            rating_count = 0
            span = div.find('span', class_='count')
            if span:
                rating_count = span.text.strip()
            print("[YellowPagesScraper] :: rating_count: %s" % rating_count)

            # get the address of the company
            address = ''
            p = div.find('p', class_='adr')
            street_address = p.find('div', class_='street-address')
            if street_address:
                str_adrs = street_address.text.strip()
                locality = p.find('div', class_='locality').text.strip()
                address = str_adrs + ', ' + locality
            print("[YellowPagesScraper] :: address: %s" % address)

            # get the contact number of the company
            phone = ''
            phone_div = div.find('div', class_='phones phone primary')
            if phone_div:
                phone = phone_div.text.strip()
            print("[YellowPagesScraper] :: phone: %s" % phone)

            # get the categories of the company
            categories = ''
            cat_div = div.find('div', class_='categories')
            if cat_div and cat_div.find('a'):
                categories = cat_div.text.strip()
            print("[YellowPagesScraper] :: categories: %s" % categories)

            # get the website of the company
            website_link = ''
            info_div = div.find('div', class_='info-section info-primary')
            webpage_link = info_div.find('a',
                                         {'class': 'track-visit-website'})
            if webpage_link:
                website_link = webpage_link['href']
            print('[YellowPagesScraper] :: webpage_link: %s' % website_link)

            self.mdb.yellowpages_scrapper_data(title, rating_count, address,
                                               phone, categories,
                                               website_link)
            fname = 'data_yellow_pages.csv'
            msg = "%s, %s, %s, %s, %s, %s" % (title, rating_count, address,
                                              phone, categories,
                                              website_link)
            scraper_csv_write(fname, msg)
        except Exception as exp:
            print('[YellowPagesScraper] :: scrap_result_row() :: '
                  'Got exception: %s' % exp)
            print(traceback.format_exc())
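# Usage sketch: this variant hard-codes a software-company search in
# New York and resumes from page 27, so it also runs without arguments.
if __name__ == '__main__':
    YellowPagesScraper().run()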
def __init__(self):
    self.mdb = Mdb()
    self.python = Python()
    self.android = Android()
    self.php = Php()
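# A minimal driver sketch (assumed, not from the source): Android and Php
# are taken to mirror the Python class above, each exposing a
# scrap_<language>_developer(url) method; the indeed.com URLs are
# hypothetical examples.
def scrap_all(self):
    self.python.scrap_python_developer(
        'https://www.indeed.com/jobs?q=python+developer&l=New+York')
    self.android.scrap_android_developer(
        'https://www.indeed.com/jobs?q=android+developer&l=New+York')
    self.php.scrap_php_developer(
        'https://www.indeed.com/jobs?q=php+developer&l=New+York')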