from selectorlib import Extractor
import requests
from fake_useragent import UserAgent
import json
from time import sleep

# Create an Extractor by reading from the YAML file
e = Extractor.from_yaml_file('scrape_mouse/products.yml')


def scrape(url):
    ua = UserAgent()
    headers = {
        'dnt': '1',
        'upgrade-insecure-requests': '1',
        'user-agent': ua.random,
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '******',
        'sec-fetch-dest': 'document',
        'referer': 'https://www.amazon.com/',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }
    # Download the page using requests
    print("Downloading %s" % url)
from selectorlib import Extractor
import requests
import json
from time import sleep
from fake_useragent import UserAgent

# Create an Extractor by reading from the YAML file
e = Extractor.from_yaml_file('search.yml')


def scrape(url):
    ua = UserAgent()
    headers = {
        'dnt': '1',
        'upgrade-insecure-requests': '1',
        'user-agent': ua.random,
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '******',
        'sec-fetch-dest': 'document',
        'referer': 'https://www.amazon.com/',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }
    # Download the page using requests
    print("Downloading %s" % url)
    r = requests.get(url, headers=headers)
import requests
from selectorlib import Extractor

# Create an Extractor by reading from the YAML file
e = Extractor.from_yaml_file("booking.yml")


def create_url(people, place, datein, dateout, offset):
    url_list = []
    for i in range(1, offset, 25):
        url = (
            "https://www.booking.com/searchresults.en-gb.html?checkin_month={in_month}"
            "&checkin_monthday={in_day}&checkin_year={in_year}&checkout_month={out_month}"
            "&checkout_monthday={out_day}&checkout_year={out_year}&group_adults={people}"
            "&group_children=0&order=review_score_and_price&ss={place}&offset={offset}"
            .format(
                in_month=str(datein.month),
                in_day=str(datein.day),
                in_year=str(datein.year),
                out_month=str(dateout.month),
                out_day=str(dateout.day),
                out_year=str(dateout.year),
                people=people,
                place=place,
                offset=i,
            ))
        url_list.append(url)
    return url_list
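A minimal usage sketch for create_url above, assuming the check-in and check-out dates are datetime.date objects (the destination and values are placeholders):

from datetime import date

# Placeholder trip: 2 adults in London, paging through three result pages
urls = create_url(
    people=2,
    place="London",
    datein=date(2024, 5, 1),
    dateout=date(2024, 5, 5),
    offset=75,
)
for url in urls:
    print(url)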
from selectorlib import Extractor, Formatter


def format_response(response):
    formatters = Formatter.get_all()
    extractor = Extractor.from_yaml_file('./scrapper.yaml', formatters=formatters)
    data = extractor.extract(response.text)
    return data
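Formatter.get_all() collects every Formatter subclass defined in the process; a minimal sketch of such a custom formatter, assuming the YAML file references it by class name (the Price class here is illustrative, not from the original project):

import re
from selectorlib import Formatter


class Price(Formatter):
    # Referenced from the YAML file as `format: Price`; pulls a number out of
    # a price string such as "$1,299.00"
    def format(self, text):
        match = re.search(r'[\d.,]+', text)
        return float(match.group().replace(',', '')) if match else None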
from selectorlib import Extractor
import requests
import json
from time import sleep

# Create an Extractor by reading from the YAML file
e = Extractor.from_yaml_file('selectors.yml')


def scrape(url):
    headers = {
        'dnt': '1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '******',
        'sec-fetch-dest': 'document',
        'referer': 'https://www.amazon.com/',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }
    # Download the page using requests
    # print("Downloading %s" % url)
    r = requests.get(url, headers=headers)
    # Simple check to see if the page was blocked (usually 503)
    if r.status_code > 500:
        if "To discuss automated access to Amazon data please contact" in r.text:
            print("Page %s was blocked by Amazon. Please try using better proxies\n" % url)
import re
import requests
from fake_useragent import UserAgent
from selectorlib import Extractor


def scrape(url):
    e = Extractor.from_yaml_file("selectors.yml")
    ua = UserAgent()
    # Amazon product IDs (ASINs) are 10 alphanumeric characters in the URL path
    prod_id_regex = re.compile(r".*/([a-zA-Z0-9]{10})(?:[/?]|$).*")
    product_id = prod_id_regex.match(url).group(1)
    headers = {
        "authority": "www.amazon.com",
        "pragma": "no-cache",
        "cache-control": "no-cache",
        "dnt": "1",
        "upgrade-insecure-requests": "1",
        "user-agent": ua.random,
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "sec-fetch-site": "none",
        "sec-fetch-mode": "navigate",
        "sec-fetch-dest": "document",
        "accept-language": "en-GB,en-US;q=0.9,en;q=0.8",
    }
    r = requests.get(url, headers=headers)
    # Retry with a fresh user agent if Amazon serves a captcha page
    retries = 0
    while "captcha" in r.text and retries < 10:
        print("user agent failed, trying new one")
        headers["user-agent"] = ua.random
        r = requests.get(url, headers=headers)
        retries += 1
    if r.status_code > 500:
        if "To discuss automated access to Amazon data please contact" in r.text:
            print("Page %s was blocked by Amazon. Please try using better proxies\n" % url)
        else:
            print("Page %s must have been blocked by Amazon as the status code was %d" % (url, r.status_code))
        return
    # Pass the HTML of the page to the extractor and pull out the fields
    data = e.extract(r.text)
    category = data["product_category"]
    # The image field is a JS object literal; strip the braces and split it apart
    images = data["product_images"][1:-1].split("],")
    images = [x.split(":[")[0][1:-1] for x in images]
    out_reviews = []
    for review in data["reviews"]:
        rec = {}
        rec["rating"] = float(review["rating"][:3])
        rec["product_category"] = category
        rec["verified"] = "N" if review["verified"] is None else "Y"
        rec["review_text"] = review["content"]
        out_reviews.append(rec)
    out_data = {}
    out_data["title"] = data["product_title"]
    out_data["id"] = product_id
    out_data["price"] = data["product_price"]
    out_data["image"] = images[-1]
    return out_reviews, out_data
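A hedged usage sketch for the scrape function above; the ASIN in the URL is a placeholder, and the guard covers the early return on blocked pages:

if __name__ == "__main__":
    # Hypothetical product URL; any URL whose path contains a 10-character ASIN works
    result = scrape("https://www.amazon.com/dp/B000000000")
    if result is not None:
        reviews, product = result
        print(product["title"], product["price"])
        for review in reviews:
            print(review["rating"], review["review_text"][:80])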
from selectorlib import Extractor
import requests
import json
from time import sleep

# Create an Extractor by reading from the YAML file
e = Extractor.from_yaml_file('search_results.yml')


def scrape(url):
    headers = {
        'dnt': '1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '******',
        'sec-fetch-dest': 'document',
        'referer': 'https://www.amazon.com/',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }
    # Download the page using requests
    print("Downloading %s" % url)
    r = requests.get(url, headers=headers)
    # Simple check to see if the page was blocked (usually 503)
    if r.status_code > 500:
        if "To discuss automated access to Amazon data please contact" in r.text:
            print("Page %s was blocked by Amazon. Please try using better proxies\n" % url)
def extractor_update(self):
    # Lazily build and cache the extractor for the "update" selector file
    if not self._extractor_update_:
        self._extractor_update_ = Extractor.from_yaml_file(self.path_update)
    return self._extractor_update_
from selectorlib import Extractor
import requests
import json
from time import sleep
from fake_useragent import UserAgent

# Create an Extractor by reading from the YAML file
e = Extractor.from_yaml_file('C:/Users/deepa/Desktop/Beginning/search.yml')


def scrape(url):
    ua = UserAgent()
    headers = {
        'dnt': '1',
        'upgrade-insecure-requests': '1',
        'user-agent': ua.random,
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '******',
        'sec-fetch-dest': 'document',
        'referer': 'https://www.amazon.com/',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }
    # Download the page using requests
    print("Downloading %s" % url)
    r = requests.get(url, headers=headers)
def extractor_skus(self):
    if not self._extractor_skus_:
        self._extractor_skus_ = Extractor.from_yaml_file(self.path_skus)
    return self._extractor_skus_
def extractor_new(self):
    if not self._extractor_new_:
        self._extractor_new_ = Extractor.from_yaml_file(self.path_new)
    return self._extractor_new_
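The three accessor methods above share the same lazy-caching pattern; a minimal standalone sketch of that idea, assuming the cached attributes start as None and the accessors are exposed as properties (the class name and constructor are hypothetical):

from selectorlib import Extractor


class PageExtractors:
    """Hypothetical container showing the lazy-caching idea used above."""

    def __init__(self, path_new, path_skus, path_update):
        self.path_new = path_new
        self.path_skus = path_skus
        self.path_update = path_update
        self._extractor_new_ = None  # parsed lazily on first access

    @property
    def extractor_new(self):
        # Parse the YAML selector file once, then reuse the cached Extractor
        if self._extractor_new_ is None:
            self._extractor_new_ = Extractor.from_yaml_file(self.path_new)
        return self._extractor_new_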
import requests
from selectorlib import Extractor


def product(name):
    url = "https://www.amazon.in/s?k=" + name
    e = Extractor.from_yaml_file('search.yml')
    r = requests.get(url, headers=headers)  # `headers` is defined at module scope
    data = e.extract(r.text)
    return data
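The search.yml file itself is not shown in any of these excerpts; a sketch of what a selectorlib selector definition can look like, inlined with Extractor.from_yaml_string so it stays runnable (the CSS selectors are illustrative assumptions, not Amazon's current markup):

from selectorlib import Extractor

# Hypothetical selector config; a real file maps field names to CSS/XPath rules
yaml_string = """
products:
    css: 'div.s-result-item'
    multiple: true
    type: Text
    children:
        title:
            css: 'h2 a span'
            type: Text
        price:
            css: 'span.a-price span.a-offscreen'
            type: Text
        url:
            css: 'h2 a'
            type: Link
"""
e = Extractor.from_yaml_string(yaml_string)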
import sys, os

C = os.path.abspath(os.path.dirname(__file__))

from selectorlib import Extractor
import requests
from time import sleep
import csv
from datetime import datetime, timedelta
from pprint import pprint

# Create an Extractor by reading from the YAML file
e = Extractor.from_yaml_file(os.path.join(C, 'booking.yml'))


def scrape(url):
    headers = {
        'Connection': 'keep-alive',
        'Pragma': 'no-cache',
        'Cache-Control': 'no-cache',
        'DNT': '1',
        'Upgrade-Insecure-Requests': '1',
        # You may want to change the user agent if you get blocked
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Referer': 'https://www.booking.com/index.en-gb.html',
        'Accept-Language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }
    # Download the page using requests
    print("Downloading %s" % url)
def model(request):
    fhandle = open('items.txt')
    text = fhandle.read()
    fhandle.close()
    li = text.split(',')
    d = dict()
    d["data"] = li
    model = request.GET.get('model')
    queryset = str(deviceDetails.objects.get(pk=model))
    l = queryset.split('|||')[3]
    data = Comments.objects.filter(
        mobile=get_object_or_404(deviceDetails, pk=model))
    names = []
    dates = []
    comment_text = []
    p_key = []
    vote_count = []
    up_voted = []
    down_voted = []
    delete_right = []
    for row in data:
        p_key.append(row.pk)
        temp = str(row).split('---')
        names.append(temp[-3])
        dates.append(temp[-1])
        comment_text.append(temp[-2].split("||||"))
        if int(temp[-4]) == 0:
            vote_count.append(0)
        elif int(temp[-4]) < 0:
            vote_count.append(temp[-4])
        else:
            vote_count.append('+' + temp[-4])
        if request.session.get('user_name', 0) != 0:
            if temp[-3] == request.session.get('user_name', 0):
                delete_right.append(1)
            else:
                delete_right.append(0)
            OB = Votes.objects.filter(
                username=get_object_or_404(
                    UserData, user_name=request.session['user_name']),
                comment=get_object_or_404(Comments, pk=row.pk))
            if len(OB):
                # there will always be exactly one row if he/she has already voted
                for vote_row in OB:
                    temp1 = str(vote_row).split('---')
                    if temp1[-2] == '1':
                        up_voted.append(1)
                        down_voted.append(0)
                    else:
                        down_voted.append(1)
                        up_voted.append(0)
            else:
                up_voted.append(0)
                down_voted.append(0)
        else:
            up_voted.append(0)
            down_voted.append(0)
            delete_right.append(0)
    result = zip(names, dates, comment_text, p_key, vote_count, up_voted,
                 down_voted, delete_right)
    mobile_name = queryset.split('|||')[1]
    url = "http://flipkart.com/search?q=" + '%20'.join(mobile_name.split())
    data = requests.get(url).text
    soup = BeautifulSoup(data, 'lxml')
    rom = [
        ', 8 GB', ', 16 GB', ', 32 GB', ', 64 GB', ', 128 GB', ', 256 GB',
        ', 512 GB', ', 1024 GB', ', 2048 GB'
    ]
    varient = []
    price = []
    flipkart_url = []
    status = []
    stars = []
    ratings = []
    reviews = []
    for item in soup.find_all('a', class_="_1fQZEK"):
        name = item.find('div', class_="_4rR01T").text
        temp = name
        if '(' in name:
            name = name[:name.index('(')]
        if name[:-1] == mobile_name:
            for gb in rom:
                if gb in temp:
                    rs = item.find('div', class_="_30jeq3").text
                    current_status = "available"
                    try:
                        current_status = item.find('div', class_="_3G6awp").text
                    except:
                        pass
                    varient.append(temp)
                    price.append(rs[1:])
                    star = item.find('div', class_="_3LWZlK").text
                    stars.append(star)
                    rating = item.find('span', class_="_2_R_DZ").span.span.text
                    ratings.append(rating)
                    print(rating)
                    # Walk backwards through the raw HTML to recover the review count
                    text = str(item.find('span', class_="_2_R_DZ").span)
                    i = text.find('Reviews') - 2
                    review = ""
                    while text[i] != '>':
                        review += text[i]
                        i -= 1
                    review = review[::-1] + ' Reviews'
                    review = review.strip()
                    reviews.append(review)
                    status.append(current_status)
                    flipkart_url.append("https://www.flipkart.com" + item['href'])
                    break
    result1 = zip(varient, price, status, flipkart_url, stars, ratings, reviews)
    f = open("home/temp.txt", "r")
    data = f.readlines()
    e = Extractor.from_yaml_file('home/Amazon_selector.yml')
    headers = {
        'dnt': '1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '******',
        'sec-fetch-dest': 'document',
        'referer': 'https://www.amazon.com/',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }
    prefix = 'https://www.amazon.in/s?k='
    device_name = mobile_name
    suffix = '&rh=n%3A1805560031&ref=nb_sb_noss'
    url = prefix + '+'.join(device_name.split()) + suffix
    print(url)
    r = requests.get(url, headers=headers)
    data1 = e.extract(r.text)
    # Re-request until the search results block is actually present
    while data1['result'] is None:
        r = requests.get(url, headers=headers)
        data1 = e.extract(r.text)
    temp = data1['result']
    amazon_names = []
    amazon_prices = []
    amazon_ratings = []
    amazon_totalratings = []
    amazon_urls = []
    for device in temp:
        try:
            if device_name in device['name'] and 'Case' not in device['name'] \
                    and 'case' not in device['name']:
                print(device['name'] + ' - ' + device['price'][1:] +
                      ' - https://amazon.in' + device['url'] + ' - ' +
                      device['rating'] + ' - ' + device['total_ratings'])
                amazon_names.append(device['name'])
                amazon_prices.append(device['price'][1:])
                amazon_ratings.append(device['rating'])
                amazon_totalratings.append(device['total_ratings'])
                amazon_urls.append('https://amazon.in' + device['url'])
        except:
            pass
    result2 = zip(amazon_names, amazon_prices, amazon_ratings,
                  amazon_totalratings, amazon_urls)
    if request.session.get('user_name', 0) != 0:
        username = get_object_or_404(UserData,
                                     user_name=request.session['user_name'])
        email_verified = username.email_verified
        return render(
            request, "home/view.html", {
                'title': 'Price Comparator | ' + device_name,
                'email_verified': email_verified,
                'list': dumps(d),
                'Amazon_result': result2,
                'Flipkart_result': result1,
                'count': len(names),
                'result': result,
                'pk': model,
                'login_flag': True,
                'user_name': request.session['user_name'],
                'name': queryset.split('|||')[1],
                'image_url': queryset.split('|||')[2],
                'spec': l.split('---')
            })
    else:
        return render(
            request, "home/view.html", {
                'title': 'Price Comparator | ' + device_name,
                'login_flag': False,
                'list': dumps(d),
                'Amazon_result': result2,
                'Flipkart_result': result1,
                'count': len(names),
                'result': result,
                'pk': model,
                'name': queryset.split('|||')[1],
                'image_url': queryset.split('|||')[2][:-1],
                'spec': l.split('---')
            })
from selectorlib import Extractor
import requests
import json
from time import sleep

e = Extractor.from_yaml_file("selectors.yml")


def scrape(url):
    headers = {
        'authority': 'www.amazon.com',
        'pragma': 'no-cache',
        'cache-control': 'no-cache',
        'dnt': '1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'none',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-dest': 'document',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }
    # Download the page using requests
    print("Downloading %s" % url)
    r = requests.get(url, headers=headers)
    # Simple check to see if the page was blocked
    if r.status_code > 500:
        if "To discuss automated access to Amazon data please contact" in r.text:
            print("Page %s was blocked by Amazon. Please try using better proxies\n" % url)
from selectorlib import Extractor
import requests
import json
from time import sleep

# Create an Extractor by reading from the YAML file
#e = Extractor.from_yaml_file('selectors.yml')
e = Extractor.from_yaml_file('feynman.yml')


def scrape(url):
    headers = {
        'authority': 'www.amazon.com',
        'pragma': 'no-cache',
        'cache-control': 'no-cache',
        'dnt': '1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'none',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-dest': 'document',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }
    # Download page using requests
    print("Downloading %s" % url)
    r = requests.get(url, headers=headers)
    # Quick check to see if page was blocked (usually 503)
    if r.status_code > 500:
        if "To discuss automated access to Amazon data please contact" in r.text:
            print("Page %s was blocked by Amazon. Please try using better proxies\n" % url)
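Several of these excerpts import sleep but end before any retry logic appears; a minimal sketch of how the download step can be wrapped with retries and a delay between attempts (the retry count and delay are assumptions, not values from the original):

def download(url, headers, retries=3, delay=5):
    # Retry a few times, pausing between attempts to avoid hammering the site
    for attempt in range(retries):
        r = requests.get(url, headers=headers)
        if r.status_code == 200:
            return r
        print("Attempt %d failed with status %d" % (attempt + 1, r.status_code))
        sleep(delay)
    return None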
#! python3
import requests, bs4, pyperclip, openpyxl, mechanize
from selectorlib import Extractor

clipboard = str(pyperclip.paste())
clipboard_list = clipboard.splitlines()

# initialize simulated chrome browser
chrome = mechanize.Browser()
chrome.set_handle_robots(False)
chrome.addheaders = [(
    'User-agent',
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36'
)]

# Create an Extractor by reading from the YAML file
e = Extractor.from_yaml_file('Amazon_Sellers.yml')

for asin in clipboard_list:
    asin_url = 'https://www.amazon.com/gp/offer-listing/' + asin + '/'
    res = chrome.open(asin_url)
    # mechanize responses expose raw bytes; decode before extracting
    html = res.read().decode('utf-8', errors='replace')
    print(e.extract(html))
from selectorlib import Extractor
import requests
from fake_useragent import UserAgent
import json
from time import sleep

# Create an Extractor by reading from the YAML file
e = Extractor.from_yaml_file('products.yml')


def scrape(url):
    ua = UserAgent()
    headers = {
        'dnt': '1',
        'upgrade-insecure-requests': '1',
        'user-agent': ua.random,
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '******',
        'sec-fetch-dest': 'document',
        'referer': 'https://www.amazon.com/',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }
    # Download the page using requests
from selectorlib import Extractor
import requests
import json
from time import sleep
from fake_useragent import UserAgent

# Create an Extractor by reading from the YAML file
e = Extractor.from_yaml_file('scrape_smartphone/search.yml')


def scrape(url):
    ua = UserAgent()
    headers = {
        'dnt': '1',
        'upgrade-insecure-requests': '1',
        'user-agent': ua.random,
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '******',
        'sec-fetch-dest': 'document',
        'referer': 'https://www.amazon.com/',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }
    # Download the page using requests
    print("Downloading %s" % url)
    r = requests.get(url, headers=headers)
from selectorlib import Extractor
import requests
import json
import time
import csv

e = Extractor.from_yaml_file(
    '/home/avi/Documents/work/price_tracker/Tracker/selector.yml')


def scrape_(url):
    headers = {
        'authority': 'www.amazon.com',
        'pragma': 'no-cache',
        'cache-control': 'no-cache',
        'dnt': '1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'none',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-dest': 'document',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }
    print("Downloading ", url)
    r = requests.get(url, headers=headers)
    if r.status_code > 500:
        if "To discuss automated access to Amazon data please contact" in r.text:
            print("Page %s was blocked by Amazon. Please try using better proxies\n" % url)
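Given the csv and json imports, this tracker presumably reads product URLs from a file and persists the extracted data; a sketch under that assumption (urls.csv, output.jsonl, and the 5-second delay are hypothetical, and scrape_ is assumed to return the extracted dict, since its tail is not shown above):

if __name__ == "__main__":
    with open("urls.csv") as urlfile, open("output.jsonl", "w") as outfile:
        for row in csv.reader(urlfile):
            data = scrape_(row[0])  # assumes scrape_ returns the extracted dict
            if data:
                json.dump(data, outfile)
                outfile.write("\n")
            time.sleep(5)  # pause between requests to stay polite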
def createExtractor(self):
    # Create an Extractor by reading from the YAML file
    e = Extractor.from_yaml_file('amazon-scraper/selectors.yml')
    return e
"Features", "Type", "URL", "URL Stays", "Beds", "Bedrooms", "Sleeps", "Map", "Description", "ID", "Best For", "Category" ] # Create an Extractor by reading from the YAML file e = Extractor.from_yaml_file('properties.yml') p = Extractor.from_yaml_file('property.yml') def scrape(url): headers = { 'Connection': 'keep-alive', 'Pragma': 'no-cache', 'Cache-Control': 'no-cache', 'DNT': '1', 'Upgrade-Insecure-Requests': '1', # You may want to change the user agent if you get blocked 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', 'Referer': 'https://www.booking.com/index.en-gb.html', 'Accept-Language': 'en-GB,en-US;q=0.9,en;q=0.8',
class AmazonScraper(WebshopScraper):
    name = "amazon_scraper"

    # Proxy
    API_KEY = "9244ba171bff5bb2139d5403c443ee87"
    scraper_proxy = "http://*****:*****@proxy-server.scraperapi.com:8001".format(API_KEY)

    # HTML extractor
    selector_file = 'webshop_scraper/amazon_selectors.yml'
    extractor = Extractor.from_yaml_file(selector_file)

    # HTML image regex
    image_pattern = re.compile(r'\'initial\': ' '(.*?)' r'\},\n', re.DOTALL)

    def __init__(self, n_pages=None, product_save_dir=None,
                 scraped_urls_file=None, include_variants=True):
        super().__init__(n_pages=n_pages,
                         product_save_dir=product_save_dir,
                         scraped_urls_file=scraped_urls_file,
                         include_variants=include_variants)

    def get_start_urls(self):
        urls = [
            # "https://www.amazon.co.uk/s?rh=n%3A468292%2Cp_72%3A4-&pf_rd_i=468292&pf_rd_p=d40c144e-45ba-5915-b01d-d92bd82e9a59&pf_rd_r=9AHN48N59BT4GF71E1G8&pf_rd_s=merchandised-search-11&pf_rd_t=BROWSE",  # toys
            # "https://www.amazon.co.uk/s?rh=n%3A117332031%2Cp_72%3A4-&pf_rd_i=117332031&pf_rd_p=4c8654cd-5980-5a4f-a532-3db1a3a6d579&pf_rd_r=AWW9M71158D9EAAAR8KB&pf_rd_s=merchandised-search-11&pf_rd_t=BROWSE",  # beauty
            # "https://www.amazon.co.uk/s?i=sports&rh=n%3A461182031%2Cp_72%3A184323031&pf_rd_i=461182031&pf_rd_p=e9bb2e37-191c-532b-9180-73d951e30279&pf_rd_r=8R8CEN6NH34W6VM2610X&pf_rd_s=merchandised-search-11&pf_rd_t=BROWSE",  # fit watches
            # "https://www.amazon.co.uk/s?rh=n%3A5866054031%2Cp_72%3A4-&pf_rd_i=5866054031&pf_rd_p=4ad30a04-262e-55f5-a315-4c86a63048cb&pf_rd_r=WTMSQNPE0Z6818GG7NMB&pf_rd_s=merchandised-search-11&pf_rd_t=BROWSE",  # utilities / science
            # "https://www.amazon.com/s?i=specialty-aps&bbn=16225013011&rh=n%3A%2116225013011%2Cn%3A2975312011&ref=nav_em_0_2_14_2__nav_desktop_sa_intl_dogs",  # dog
            # "https://www.amazon.co.uk/s?i=sports&rh=n%3A318949011%2Cp_72%3A184323031&pf_rd_i=318949011&pf_rd_p=b052e5ee-b3e8-5fa8-b467-a83cf0dcb513&pf_rd_r=JTXNEV8HM50Z5S18AEQ3&pf_rd_s=merchandised-search-11&pf_rd_t=BROWSE",  # sports equipment
        ]
        return urls

    def get_product_pages(self, response):
        products = response.xpath(
            "//span[@class='rush-component' and @data-component-type='s-product-image']"
        ).xpath("a").xpath("@href").getall()
        products = ["/".join(url.split("/")[:4]) for url in products]
        products = [response.urljoin(url) for url in products]
        return products

    def get_product_info(self, response):
        html = response.text
        return self.extractor.extract(html)

    def get_product_image_urls(self, response):
        html = response.text
        try:
            data = self.image_pattern.search(html)
            images = data.group(1)
        except AttributeError:
            # no image carousel in page
            return []
        images = json.loads(images)
        images = [img["hiRes"] for img in images]
        return list(filter(None, images))

    def get_next_page_url(self, response):
        next_page_url = response.xpath(
            "//ul[@class='a-pagination']/li[@class='a-last']/a"
        ).xpath("@href").get()
        next_page_url = response.urljoin(next_page_url)
        return next_page_url
"document", "Sec-Fetch-Mode": "navigate", "Sec-Fetch-Site": "none", "Sec-Fetch-User": "******", "Upgrade-Insecure-Requests": "1", "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36 OPR/71.0.3770.148" } ] # Create an Extractor by reading from the YAML file e = Extractor.from_yaml_file('selectors_review.yml') def scrape(url, counter): # If you want more you can loop through page numbers url = "https://www.amazon.com" + url.rstrip() + "&pageNumber=" + str( counter) headers = random.choice(headers_list) # Download the page using requests print("Downloading %s" % url) current_proxy = get_proxy() try: r = requests.get(url, headers=headers, proxies=current_proxy) except: print("Connection Refused") return None
from selectorlib import Extractor
import requests
import re

e = Extractor.from_yaml_file('scraper_app/static/selectors/selectors.yml')


def scrape(url):
    headers = {
        'dnt': '1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '******',
        'sec-fetch-dest': 'document',
        'referer': 'https://www.amazon.com/',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }
    r = requests.get(url, headers=headers)
    if r.status_code > 500:
        if "To discuss automated access to Amazon data please contact" in r.text:
            print("Page %s was blocked by Amazon. Please try using better proxies\n" % url)
from time import sleep
import pandas as pd
import requests
import json
import re
from pandas import json_normalize
from selectorlib import Extractor
from timeit import default_timer as timer
from datetime import timedelta
from datetime import datetime

# Create an Extractor by reading from the YAML file
e = Extractor.from_yaml_file('venv/data/search_results.yml')


def scrape(url):
    headers = {
        'dnt': '1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '******',
        'sec-fetch-dest': 'document',
        'referer': 'https://www.amazon.com/',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36", "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", "Sec-Fetch-Site": "same-origin", "Sec-Fetch-Mode": "navigate", "Sec-Fetch-User": "******", "Sec-Fetch-Dest": "document", "Referer": "https://www.google.com/", "Accept-Encoding": "gzip, deflate, br", "Accept-Language": "en-US,en;q=0.9" } ] # Create an Extractor by reading from the YAML file e = Extractor.from_yaml_file('selectors_product.yml') def scrape(url): # If you want more you can loop through page numbers url = "https://www.amazon.com" + url.rstrip() headers = random.choice(headers_list) # Download the page using requests print("Downloading %s" % url) r = requests.get(url, headers=headers) # Simple check to check if page was blocked (Usually 503) if r.status_code > 500: if "To discuss automated access to Amazon data please contact" in r.text: print( "Page %s was blocked by Amazon. Please try using better proxies\n" % url)
import requests
from selectorlib import Extractor


def amazonScrape(hashtag, key):
    tmp1 = []
    tmp3 = []
    # Create an Extractor by reading from the YAML file
    e = Extractor.from_yaml_file('scripts/search_results.yml')
    headers = {
        'dnt': '1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '******',
        'sec-fetch-dest': 'document',
        'referer': 'https://www.amazon.in/',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }
    # Download the page using requests
    url = "https://www.amazon.in/s?k=" + hashtag
    print("Downloading %s" % url)
    r = requests.get(url, headers=headers, timeout=28)
    # Simple check to see if the page was blocked (usually 503)
    if r.status_code > 500:
        if "To discuss automated access to Amazon data please contact" in r.text:
            print("Page %s was blocked by Amazon. Please try using better proxies\n" % url)
        else:
            print("Page %s must have been blocked by Amazon as the status code was %d" % (url, r.status_code))
        print(r.status_code)
        return tmp1, tmp3
    data = e.extract(r.text)
    if data is None:
        print("None!")
        return tmp1, tmp3
    if data['products'] is not None:
        productfeed = data['products']
        counter = 0
        for product in productfeed:
            if counter > 3:
                break
            if product['price'] is None:
                print("Price is NONE")
            else:
                print(counter, "price NOT NONE")
                # Strip the currency symbol and thousands separators
                price = int(float(product['price'][1:].replace(',', '')))
                print(price)
                if price >= 500:
                    product_url = "https://www.amazon.in" + product['url']
                    counter += 1
                    print(product_url)
                    tmp3.append(product_url)
                    tmp1.append(key)
    return tmp1, tmp3
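A hedged usage sketch for amazonScrape above; the search term and key are placeholders:

search_keys, product_urls = amazonScrape("smartphone", "smartphone-campaign")
for k, u in zip(search_keys, product_urls):
    print(k, u)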
from random import randint
from pathlib import Path
import time
from turtle import Screen, Turtle
from selectorlib import Extractor
import requests
import json
from playsound import playsound
from pygame import mixer

chosen_turtle = ""  # the turtle which the user bets on
bet_amount = 0  # the betting amount
guessed_steps = 0  # guessed number of steps of the winning turtle
finished_race_info = []  # info about the race: winning turtle, its colour, and the number of steps it took
turtle_step_counter = [0, 0, 0, 0]  # a list of step counts, one per turtle
e = Extractor.from_yaml_file('selectors.yml')  # Create an Extractor by reading from the YAML file
budget = 0
wins_and_losses = []  # a list of wins and losses of the user for each round
turtle_colours = ['green', 'purple', 'blue', 'orange']
rounds_counter = 0  # counts the number of rounds played

# set up the screen width and height
wh = Screen()
wh.setup(width=1.0, height=1.0)

# create an instance of turtle for writing messages
message = Turtle()
message.hideturtle()
message.speed('fastest')
message.color('black')
message.style = ('Courier', 30, 'italic')  # font tuple, presumably passed to write() later
import asyncio
import requests
from time import sleep
from flask import request, render_template
from selectorlib import Extractor

# headers1, headers2, and add_images_urls are defined elsewhere in the module


def scrape():
    e1 = Extractor.from_yaml_file('search_result.yml')
    e2 = Extractor.from_yaml_file('flip_results.yml')
    text = request.form.get("search_bar")
    print(text)
    url1 = "https://www.amazon.in/s?k={0}&rh=n%3A1375424031&ref=nb_sb_noss".format(text)
    url2 = "https://www.flipkart.com/search?q={0}&sid=6bo%2Cb5g&as=on&as-show=on&otracker=AS_QueryStore_HistoryAutoSuggest_1_7_na_na_na&otracker1=AS_QueryStore_HistoryAutoSuggest_1_7_na_na_na&as-pos=1&as-type=HISTORY&suggestionId=macbook+pro%7CLaptops&requestId=4b1460e8-fcf5-4369-a655-a2501be025a8&as-backfill=on".format(text)
    r1 = requests.get(url1, headers=headers1)
    r2 = requests.get(url2, headers=headers2)
    sleep(2)
    data1 = e1.extract(r1.text)
    data2 = e2.extract(r2.text)
    product_title1 = []
    product_price1 = []
    product_img1 = []
    product_url1 = []
    product_title2 = []
    product_price2 = []
    product_img2 = []
    product_url2 = []
    for product1 in data1.values():
        for item1 in product1:
            product_title1.append(item1['title'])
            product_price1.append(item1['price'])
            product_img1.append(item1['image'])
            new_url1 = 'https://www.amazon.in' + item1['url']
            product_url1.append(new_url1)
    # Resolve the Flipkart image URLs asynchronously on a fresh event loop
    asyncio.set_event_loop(asyncio.SelectorEventLoop())
    data3 = asyncio.get_event_loop().run_until_complete(
        add_images_urls(data2, url2))
    for product2 in data3.values():
        for item2 in product2:
            product_title2.append(item2['title'])
            product_price2.append(item2['price'])
            product_img2.append(item2['image'])
            product_url2.append(item2['url'])
    return render_template("index2.html",
                           title1=product_title1,
                           price1=product_price1,
                           img1=product_img1,
                           url1=product_url1,
                           title2=product_title2,
                           price2=product_price2,
                           img2=product_img2,
                           url2=product_url2)