from selectorlib import Extractor
import requests
import json
from time import sleep

# Create an Extractor by reading from the YAML file
e = Extractor.from_yaml_file('resources/search_results.yml')


def scrape(url):
    headers = {
        'authority': 'www.amazon.com',
        'pragma': 'no-cache',
        'cache-control': 'no-cache',
        'dnt': '1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'none',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-dest': 'document',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }
    # Download the page using requests
    print("Downloading %s" % url)
    r = requests.get(url, headers=headers)
    # Simple check to see if the page was blocked (usually 503)
    if r.status_code > 500:
        if "To discuss automated access to Amazon data please contact" in r.text:
            print("Page %s was blocked by Amazon. Please try using better proxies\n" % url)
        else:
            print("Page %s must have been blocked by Amazon as the status code was %d" % (url, r.status_code))
        return None
    # Pass the HTML of the page to the extractor and return the data
    return e.extract(r.text)
from selectorlib import Extractor
import requests


def amazonScrape(hashtag, key):
    tmp1 = []
    tmp3 = []
    # Create an Extractor by reading from the YAML file
    e = Extractor.from_yaml_file('scripts/search_results.yml')
    headers = {
        'dnt': '1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '******',
        'sec-fetch-dest': 'document',
        'referer': 'https://www.amazon.in/',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }
    # Download the page using requests
    url = "https://www.amazon.in/s?k=" + hashtag
    print("Downloading %s" % url)
    r = requests.get(url, headers=headers, timeout=28)
    # Simple check to see if the page was blocked (usually 503)
    if r.status_code > 500:
        if "To discuss automated access to Amazon data please contact" in r.text:
            print("Page %s was blocked by Amazon. Please try using better proxies\n" % url)
        else:
            print("Page %s must have been blocked by Amazon as the status code was %d" % (url, r.status_code))
        print(r.status_code)
        return tmp1, tmp3
    # Extract once and reuse the result instead of re-parsing the page three times
    data = e.extract(r.text)
    if data is None:
        print("None!")
        return tmp1, tmp3
    if data['products'] is not None:
        counter = 0
        for product in data['products']:
            # Keep at most four products
            if counter > 3:
                break
            if product['price'] is None:
                print("Price is NONE")
            else:
                print(counter, "price NOT NONE")
                price = int(float(product['price'][1:].replace(',', '')))
                print(price)
                if price >= 500:
                    product_url = "https://www.amazon.in" + product['url']
                    counter += 1
                    print(product_url)
                    tmp3.append(product_url)
                    tmp1.append(key)
    return tmp1, tmp3
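# A minimal usage sketch for amazonScrape: the two returned lists are parallel,
# so they zip into (key, product_url) pairs. The search term and key below are
# placeholder assumptions, not values from the original script.
keys, urls = amazonScrape("wireless+mouse", "mouse")
for k, u in zip(keys, urls):
    print(k, u)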
from selectorlib import Extractor
import requests
import json
import argparse

argparser = argparse.ArgumentParser()
argparser.add_argument('url', help='Amazon Product Details URL')

# Create an Extractor by reading from the YAML file
e = Extractor.from_yaml_file('selectors.yml')

user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'
headers = {'User-Agent': user_agent}

# Download the page using requests
args = argparser.parse_args()
r = requests.get(args.url, headers=headers)

# Pass the HTML of the page to the extractor and pull out the data
data = e.extract(r.text)

# Print the data
print(json.dumps(data, indent=True))
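# The scripts in this collection all load their selectors from YAML files that
# are not shown here. A minimal sketch of what such a file could contain,
# inlined via Extractor.from_yaml_string; the CSS selectors below are
# illustrative assumptions, not the originals:
example_extractor = Extractor.from_yaml_string("""
name:
    css: "#productTitle"
    type: Text
price:
    css: ".a-price .a-offscreen"
    type: Text
""")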
from selectorlib import Extractor
import requests
import json
import time
import csv

e = Extractor.from_yaml_file(
    '/home/avi/Documents/work/price_tracker/Tracker/selector.yml')


def scrape_(url):
    headers = {
        'authority': 'www.amazon.com',
        'pragma': 'no-cache',
        'cache-control': 'no-cache',
        'dnt': '1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'none',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-dest': 'document',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }
    print("Downloading ", url)
    r = requests.get(url, headers=headers)
    if r.status_code > 500:
        if "To discuss automated access to Amazon data please contact" in r.text:
            print("Page %s was blocked by Amazon. Please try using better proxies\n" % url)
"document", "Sec-Fetch-Mode": "navigate", "Sec-Fetch-Site": "none", "Sec-Fetch-User": "******", "Upgrade-Insecure-Requests": "1", "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36 OPR/71.0.3770.148" } ] # Create an Extractor by reading from the YAML file e = Extractor.from_yaml_file('selectors_review.yml') def scrape(url, counter): # If you want more you can loop through page numbers url = "https://www.amazon.com" + url.rstrip() + "&pageNumber=" + str( counter) headers = random.choice(headers_list) # Download the page using requests print("Downloading %s" % url) current_proxy = get_proxy() try: r = requests.get(url, headers=headers, proxies=current_proxy) except: print("Connection Refused") return None
import requests
from selectorlib import Extractor
from bs4 import BeautifulSoup

# Create an Extractor by reading from the YAML file
e_product = Extractor.from_yaml_file('selectors.yml')


def scrape_search(url):
    r = requests.get(url)
    # Simple check to see if the page was blocked (usually 503)
    if r.status_code > 500:
        return []
    # Pass the HTML of the page to BeautifulSoup and collect product links
    soup = BeautifulSoup(r.text, "html.parser")
    links = []
    for link in soup.select('a.a-link-normal.a-text-normal'):
        link_str = link.get('href')
        if link_str:
            links.append(link_str)
    return links


def scrape_product(url):
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
        'Accept-Language': 'en-US, en;q=0.5'
    }
    r = requests.get(url, headers=HEADERS)
    # Simple check to see if the page was blocked (usually 503)
    if r.status_code > 500:
        return None
    return e_product.extract(r.text)
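# A minimal usage sketch chaining the two helpers: fetch the search-result
# links, then extract details for the first product. The search URL is a
# placeholder assumption, and the links are assumed to be relative paths that
# need the domain prefixed, as in the other scripts in this collection.
search_links = scrape_search("https://www.amazon.com/s?k=headphones")
if search_links:
    first_product = scrape_product("https://www.amazon.com" + search_links[0])
    print(first_product)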
from selectorlib import Extractor
from fake_useragent import UserAgent
import requests
import json
from time import sleep

# Create an Extractor by reading from the YAML file
e = Extractor.from_yaml_file('products.yml')


def scrape(url):
    ua = UserAgent()
    headers = {
        'dnt': '1',
        'upgrade-insecure-requests': '1',
        'user-agent': ua.random,
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '******',
        'sec-fetch-dest': 'document',
        'referer': 'https://www.amazon.com/',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }
    # Download the page using requests
    print("Downloading %s" % url)
    r = requests.get(url, headers=headers)
from selectorlib import Extractor
import requests
import json
from time import sleep
from fake_useragent import UserAgent

# Create an Extractor by reading from the YAML file
e = Extractor.from_yaml_file('scrape_monitor/search.yml')


def scrape(url):
    ua = UserAgent()
    headers = {
        'dnt': '1',
        'upgrade-insecure-requests': '1',
        'user-agent': ua.random,
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '******',
        'sec-fetch-dest': 'document',
        'referer': 'https://www.amazon.com/',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }
    # Download the page using requests
    print("Downloading %s" % url)
    r = requests.get(url, headers=headers)
from selectorlib import Extractor
import requests
import json
import argparse

argparser = argparse.ArgumentParser()
argparser.add_argument('url', help='Amazon Product Details URL')

# Create an Extractor by reading from the YAML file
e = Extractor.from_yaml_file('echo_dot_sandstone.yml')

user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'
headers = {'User-Agent': user_agent}

# Download the page using requests
args = argparser.parse_args()
r = requests.get(args.url, headers=headers)

# Pass the HTML of the page to the extractor and pull out the data
data = e.extract(r.text)

# Print the data
print(json.dumps(data, indent=True))
import requests
import json
from selectorlib import Extractor
import argparse

argparser = argparse.ArgumentParser()
argparser.add_argument('url', help='Amazon Product Details URL')

# Create an Extractor by reading from the YAML file
extd = Extractor.from_yaml_file('amazon.yml')

browser = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36'
headers = {'User-Agent': browser}
args = argparser.parse_args()

# Download the webpage with requests
req = requests.get(args.url, headers=headers)

# Extract the data and print it as JSON
res = extd.extract(req.text)
print(json.dumps(res, indent=True))
from time import sleep
from datetime import datetime
import requests
import yaml
import re

# Create your views here.
from django.shortcuts import render  # needed by index() below
from django.http import HttpResponse, JsonResponse
from django.core.files.storage import FileSystemStorage
from django.template.loader import render_to_string
from django.urls import reverse
from django.contrib import messages
from selectorlib import Extractor
from polls.models import Output
from polls.forms import OutputForm

e = Extractor.from_yaml_file('polls/selectors.yml')


def index(request):
    return render(request, 'polls/base.html', {})


def is_valid_queryparam(param):
    return param != '' and param is not None


def output(request):
    if request.method == 'GET' and 's_bt' in request.GET:
        alldata = Output.objects.all()
        brand_name_query = request.GET.get('brand_name')
        cpu_exact_query = request.GET.get('cpu_exact')
from selectorlib import Extractor
import requests
import json
from time import sleep
from fake_useragent import UserAgent

# Create an Extractor by reading from the YAML file
e = Extractor.from_yaml_file('scrape_keyboard/search.yml')


def scrape(url):
    ua = UserAgent()
    headers = {
        'dnt': '1',
        'upgrade-insecure-requests': '1',
        'user-agent': ua.random,
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '******',
        'sec-fetch-dest': 'document',
        'referer': 'https://www.amazon.com/',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }
    # Download the page using requests
    print("Downloading %s" % url)
    r = requests.get(url, headers=headers)
from selectorlib import Extractor
import requests
import json
import argparse

argparser = argparse.ArgumentParser()
argparser.add_argument('url', help='Amazon Product Details URL')

# Create an Extractor by reading from the YAML file
e = Extractor.from_yaml_file('TribitSpeaker.txt')

user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246'
headers = {'User-Agent': user_agent}

# Download the page using requests
args = argparser.parse_args()
r = requests.get(args.url, headers=headers)

# Pass the HTML of the page to the extractor and pull out the data
data = e.extract(r.text)

# Print the data
# print(json.dumps(data, indent=True))
for item, detail in data.items():
    print(item, ":", detail)

# price = int(str(data['price'])[2:-3].replace(",", ""))
# print(price)
# if price < 8000:
from selectorlib import Extractor
import requests
import json
from json2html import *
from time import sleep
import streamlit as st
import pandas as pd
import streamlit_theme as stt

stt.set_theme({'primary': '#1b3388'})
st.title('Smart Price app')

# Create an Extractor by reading from the YAML file
e = Extractor.from_yaml_file('Testing.yml')
amazon_ext = Extractor.from_yaml_file('search_results.yml')
# pd.set_option('display.max_colwidth', None)


def scrape(url, query):
    headers = {
        'dnt': '1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '******',
        'sec-fetch-dest': 'document',
        'referer': 'https://www.amazon.com/',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }
def extractor_update(self):
    if not self._extractor_update_:
        self._extractor_update_ = Extractor.from_yaml_file(
            self.path_update)
    return self._extractor_update_
def scrape():
    e1 = Extractor.from_yaml_file('search_result.yml')
    e2 = Extractor.from_yaml_file('flip_results.yml')
    text = request.form.get("search_bar")
    print(text)
    url1 = "https://www.amazon.in/s?k={0}&rh=n%3A1375424031&ref=nb_sb_noss".format(text)
    url2 = "https://www.flipkart.com/search?q={0}&sid=6bo%2Cb5g&as=on&as-show=on&otracker=AS_QueryStore_HistoryAutoSuggest_1_7_na_na_na&otracker1=AS_QueryStore_HistoryAutoSuggest_1_7_na_na_na&as-pos=1&as-type=HISTORY&suggestionId=macbook+pro%7CLaptops&requestId=4b1460e8-fcf5-4369-a655-a2501be025a8&as-backfill=on".format(text)
    r1 = requests.get(url1, headers=headers1)
    r2 = requests.get(url2, headers=headers2)
    sleep(2)
    data1 = e1.extract(r1.text)
    data2 = e2.extract(r2.text)

    product_title1 = []
    product_price1 = []
    product_img1 = []
    product_url1 = []
    product_title2 = []
    product_price2 = []
    product_img2 = []
    product_url2 = []

    for product1 in data1.values():
        for item1 in product1:
            product_title1.append(item1['title'])
            product_price1.append(item1['price'])
            product_img1.append(item1['image'])
            new_url1 = 'https://www.amazon.in' + item1['url']
            product_url1.append(new_url1)

    asyncio.set_event_loop(asyncio.SelectorEventLoop())
    data3 = asyncio.get_event_loop().run_until_complete(
        add_images_urls(data2, url2))
    # data3 = asyncio.run(add_images_urls(data2, url2))
    # data3 = await add_images_urls(data2, url2)
    # data3 = loop.run_until_complete(add_images_urls(data2, url2))
    # data3 = add_images_urls(data2, url2)

    for product2 in data3.values():
        for item2 in product2:
            product_title2.append(item2['title'])
            product_price2.append(item2['price'])
            product_img2.append(item2['image'])
            product_url2.append(item2['url'])
            # new_url2 = 'https://www.flipkart.com' + item2['url']
            # product_url2.append(new_url2)

    # session = HTMLSession()
    # response = session.get(url2)
    # response.html.render(sleep=1, scrolldown=20)
    # # Container for each product being displayed
    # div = response.html.find('._1UoZlX')
    # for image in div:
    #     img = image.find('img', first=True)
    #     img_src = img.attrs['src']
    #     product_img2.append(img_src)

    return render_template("index2.html",
                           title1=product_title1,
                           price1=product_price1,
                           img1=product_img1,
                           url1=product_url1,
                           title2=product_title2,
                           price2=product_price2,
                           img2=product_img2,
                           url2=product_url2)
from selectorlib import Extractor
import requests
import json
from time import sleep
from fake_useragent import UserAgent

# Create an Extractor by reading from the YAML file
e = Extractor.from_yaml_file('C:/Users/deepa/Desktop/Beginning/search.yml')


def scrape(url):
    ua = UserAgent()
    headers = {
        'dnt': '1',
        'upgrade-insecure-requests': '1',
        'user-agent': ua.random,
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '******',
        'sec-fetch-dest': 'document',
        'referer': 'https://www.amazon.com/',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }
    # Download the page using requests
    print("Downloading %s" % url)
    r = requests.get(url, headers=headers)
from selectorlib import Extractor
from fake_useragent import UserAgent
import requests
import json
from time import sleep

# Create an Extractor by reading from the YAML file
e = Extractor.from_yaml_file('scrape_mouse/products.yml')


def scrape(url):
    ua = UserAgent()
    headers = {
        'dnt': '1',
        'upgrade-insecure-requests': '1',
        'user-agent': ua.random,
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '******',
        'sec-fetch-dest': 'document',
        'referer': 'https://www.amazon.com/',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }
    # Download the page using requests
    print("Downloading %s" % url)
    r = requests.get(url, headers=headers)
from selectorlib import Extractor
import requests
import json
from time import sleep

# Create an Extractor by reading from the YAML file
# e = Extractor.from_yaml_file('selectors.yml')
e = Extractor.from_yaml_file('feynman.yml')


def scrape(url):
    headers = {
        'authority': 'www.amazon.com',
        'pragma': 'no-cache',
        'cache-control': 'no-cache',
        'dnt': '1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'none',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-dest': 'document',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }
    # Download page using requests
    print("Downloading %s" % url)
    r = requests.get(url, headers=headers)
    # Quick check to see if page was blocked (usually 503)
    if r.status_code > 500:
        if "To discuss automated access to Amazon data please contact" in r.text:
            print("Page %s was blocked by Amazon. Please try using better proxies\n" % url)
        else:
            print("Page %s must have been blocked by Amazon as the status code was %d" % (url, r.status_code))
        return None
def model(request):
    with open('items.txt') as fhandle:
        text = fhandle.read()
    li = text.split(',')
    d = dict()
    d["data"] = li

    l = []
    model = request.GET.get('model')
    queryset = str(deviceDetails.objects.get(pk=model))
    l = queryset.split('|||')[3]

    data = Comments.objects.filter(
        mobile=get_object_or_404(deviceDetails, pk=model))
    names = []
    dates = []
    comment_text = []
    p_key = []
    vote_count = []
    up_voted = []
    down_voted = []
    delete_right = []
    for row in data:
        p_key.append(row.pk)
        temp = str(row).split('---')
        names.append(temp[-3])
        dates.append(temp[-1])
        comment_text.append(temp[-2].split("||||"))
        if int(temp[-4]) == 0:
            vote_count.append(0)
        elif int(temp[-4]) < 0:
            vote_count.append(temp[-4])
        else:
            vote_count.append('+' + temp[-4])
        if request.session.get('user_name', 0) != 0:
            if temp[-3] == request.session.get('user_name', 0):
                delete_right.append(1)
            else:
                delete_right.append(0)
            OB = Votes.objects.filter(
                username=get_object_or_404(
                    UserData, user_name=request.session['user_name']),
                comment=get_object_or_404(Comments, pk=row.pk))
            if len(OB):
                # There will always be exactly one row if they have already voted
                for vote_row in OB:
                    temp1 = str(vote_row).split('---')
                    if temp1[-2] == '1':
                        up_voted.append(1)
                        down_voted.append(0)
                    else:
                        down_voted.append(1)
                        up_voted.append(0)
            else:
                up_voted.append(0)
                down_voted.append(0)
        else:
            up_voted.append(0)
            down_voted.append(0)
            delete_right.append(0)
    result = zip(names, dates, comment_text, p_key, vote_count, up_voted,
                 down_voted, delete_right)

    mobile_name = queryset.split('|||')[1]
    url = "http://flipkart.com/search?q=" + '%20'.join(mobile_name.split())
    data = requests.get(url).text
    soup = BeautifulSoup(data, 'lxml')
    rom = [
        ', 8 GB', ', 16 GB', ', 32 GB', ', 64 GB', ', 128 GB', ', 256 GB',
        ', 512 GB', ', 1024 GB', ', 2048 GB'
    ]
    variant = []
    price = []
    flipkart_url = []
    status = []
    stars = []
    ratings = []
    reviews = []
    for item in soup.find_all('a', class_="_1fQZEK"):
        name = item.find('div', class_="_4rR01T").text
        temp = name
        if '(' in name:
            name = name[:name.index('(')]
        if name[:-1] == mobile_name:
            for gb in rom:
                if gb in temp:
                    rs = item.find('div', class_="_30jeq3").text
                    current_status = "available"
                    try:
                        current_status = item.find('div', class_="_3G6awp").text
                    except AttributeError:
                        pass
                    variant.append(temp)
                    price.append(rs[1:])
                    star = item.find('div', class_="_3LWZlK").text
                    stars.append(star)
                    rating = item.find('span', class_="_2_R_DZ").span.span.text
                    ratings.append(rating)
                    print(rating)
                    # The review count sits just before the word 'Reviews';
                    # walk backwards through the raw tag text to collect it
                    text = str(item.find('span', class_="_2_R_DZ").span)
                    i = text.find('Reviews') - 2
                    review = ""
                    while text[i] != '>':
                        review += text[i]
                        i -= 1
                    review = review[::-1] + ' Reviews'
                    review = review.strip()
                    reviews.append(review)
                    status.append(current_status)
                    flipkart_url.append("https://www.flipkart.com" + item['href'])
                    break
    result1 = zip(variant, price, status, flipkart_url, stars, ratings, reviews)

    with open("home/temp.txt", "r") as f:
        data = f.readlines()
    e = Extractor.from_yaml_file('home/Amazon_selector.yml')
    headers = {
        'dnt': '1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '******',
        'sec-fetch-dest': 'document',
        'referer': 'https://www.amazon.com/',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }
    prefix = 'https://www.amazon.in/s?k='
    device_name = mobile_name
    suffix = '&rh=n%3A1805560031&ref=nb_sb_noss'
    url = prefix + '+'.join(device_name.split()) + suffix
    print(url)
    r = requests.get(url, headers=headers)
    data1 = e.extract(r.text)
    # Retry until the search results actually come back
    while data1['result'] is None:
        r = requests.get(url, headers=headers)
        data1 = e.extract(r.text)
    temp = data1['result']
    amazon_names = []
    amazon_prices = []
    amazon_ratings = []
    amazon_totalratings = []
    amazon_urls = []
    for device in temp:
        try:
            if device_name in device['name'] and 'Case' not in device[
                    'name'] and 'case' not in device['name']:
                print(device['name'] + ' - ' + device['price'][1:] +
                      ' - https://amazon.in' + device['url'] + ' - ' +
                      device['rating'] + ' - ' + device['total_ratings'])
                amazon_names.append(device['name'])
                amazon_prices.append(device['price'][1:])
                amazon_ratings.append(device['rating'])
                amazon_totalratings.append(device['total_ratings'])
                amazon_urls.append('https://amazon.in' + device['url'])
        except (KeyError, TypeError):
            pass
    result2 = zip(amazon_names, amazon_prices, amazon_ratings,
                  amazon_totalratings, amazon_urls)

    if request.session.get('user_name', 0) != 0:
        username = get_object_or_404(UserData,
                                     user_name=request.session['user_name'])
        email_verified = username.email_verified
        return render(
            request, "home/view.html", {
                'title': 'Price Comparator | ' + device_name,
                'email_verified': email_verified,
                'list': dumps(d),
                'Amazon_result': result2,
                'Flipkart_result': result1,
                'count': len(names),
                'result': result,
                'pk': model,
                'login_flag': True,
                'user_name': request.session['user_name'],
                'name': queryset.split('|||')[1],
                'image_url': queryset.split('|||')[2],
                'spec': l.split('---')
            })
    else:
        return render(
            request, "home/view.html", {
                'title': 'Price Comparator | ' + device_name,
                'login_flag': False,
                'list': dumps(d),
                'Amazon_result': result2,
                'Flipkart_result': result1,
                'count': len(names),
                'result': result,
                'pk': model,
                'name': queryset.split('|||')[1],
                'image_url': queryset.split('|||')[2][:-1],
                'spec': l.split('---')
            })
from selectorlib import Extractor
import requests
import json
from time import sleep
from fake_useragent import UserAgent

# Create an Extractor by reading from the YAML file
e = Extractor.from_yaml_file('search.yml')


def scrape(url):
    ua = UserAgent()
    headers = {
        'dnt': '1',
        'upgrade-insecure-requests': '1',
        'user-agent': ua.random,
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '******',
        'sec-fetch-dest': 'document',
        'referer': 'https://www.amazon.com/',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }
    # Download the page using requests
    print("Downloading %s" % url)
    r = requests.get(url, headers=headers)
import os
import sys

C = os.path.abspath(os.path.dirname(__file__))

from selectorlib import Extractor
import requests
from time import sleep
import csv
from datetime import datetime, timedelta
from pprint import pprint

# Create an Extractor by reading from the YAML file
e = Extractor.from_yaml_file(os.path.join(C, 'booking.yml'))


def scrape(url):
    headers = {
        'Connection': 'keep-alive',
        'Pragma': 'no-cache',
        'Cache-Control': 'no-cache',
        'DNT': '1',
        'Upgrade-Insecure-Requests': '1',
        # You may want to change the user agent if you get blocked
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Referer': 'https://www.booking.com/index.en-gb.html',
        'Accept-Language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }
    # Download the page using requests
    print("Downloading %s" % url)
    r = requests.get(url, headers=headers)
"Features", "Type", "URL", "URL Stays", "Beds", "Bedrooms", "Sleeps", "Map", "Description", "ID", "Best For", "Category" ] # Create an Extractor by reading from the YAML file e = Extractor.from_yaml_file('properties.yml') p = Extractor.from_yaml_file('property.yml') def scrape(url): headers = { 'Connection': 'keep-alive', 'Pragma': 'no-cache', 'Cache-Control': 'no-cache', 'DNT': '1', 'Upgrade-Insecure-Requests': '1', # You may want to change the user agent if you get blocked 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', 'Referer': 'https://www.booking.com/index.en-gb.html', 'Accept-Language': 'en-GB,en-US;q=0.9,en;q=0.8',
def product(name):
    url = "https://www.amazon.in/s?k=" + name
    e = Extractor.from_yaml_file('search.yml')
    # headers is defined elsewhere in the module this fragment comes from
    r = requests.get(url, headers=headers)
    data = e.extract(r.text)
    return data
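# A minimal usage sketch for product(). The search term is a placeholder
# assumption; the shape of the returned dict depends on what search.yml defines.
import json

result = product("laptop")
print(json.dumps(result, indent=True))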
from time import sleep
from datetime import datetime, timedelta
from timeit import default_timer as timer
import pandas as pd
import requests
import json
import re
from pandas import json_normalize
from selectorlib import Extractor

# Create an Extractor by reading from the YAML file
e = Extractor.from_yaml_file('venv/data/search_results.yml')


def scrape(url):
    headers = {
        'dnt': '1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '******',
        'sec-fetch-dest': 'document',
        'referer': 'https://www.amazon.com/',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }
def extractor_skus(self):
    if not self._extractor_skus_:
        self._extractor_skus_ = Extractor.from_yaml_file(self.path_skus)
    return self._extractor_skus_
from selectorlib import Extractor
import requests
import json
from time import sleep

e = Extractor.from_yaml_file("selectors.yml")


def scrape(url):
    headers = {
        'authority': 'www.amazon.com',
        'pragma': 'no-cache',
        'cache-control': 'no-cache',
        'dnt': '1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'none',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-dest': 'document',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }
    # Download the page using requests
    print("Downloading %s" % url)
    r = requests.get(url, headers=headers)
    # Simple check to see if the page was blocked
    if r.status_code > 500:
        if "To discuss automated access to Amazon data please contact" in r.text:
            print("Page %s was blocked by Amazon. Please try using better proxies\n" % url)
def extractor_new(self):
    if not self._extractor_new_:
        self._extractor_new_ = Extractor.from_yaml_file(self.path_new)
    return self._extractor_new_
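# The three extractor_* methods in this collection share a lazy-initialization
# pattern: build the Extractor on first call, cache it on the instance, and
# return the cached object afterwards. A minimal sketch of the class state
# they rely on; the class name and constructor are assumptions, not from the
# original:
from selectorlib import Extractor


class ExtractorCache:
    def __init__(self, path_update, path_skus, path_new):
        # YAML selector files, one per extractor
        self.path_update = path_update
        self.path_skus = path_skus
        self.path_new = path_new
        # Caches start empty; the extractor_* methods fill them on first use
        self._extractor_update_ = None
        self._extractor_skus_ = None
        self._extractor_new_ = None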
else: print("Page %s must have been blocked by Amazon as the status code was %d"%(url,r.status_code)) return None # Pass the HTML of the page and create return e.extract(r.text) def write_to_file(): with open("search_results_urls.txt",'r') as urllist, open('search_results_output.json','w') as outfile: count_products = 0 individual_products = 1 for url in urllist.read().splitlines(): data = scrape(url) if data: count_products += len(data['products']) for product in data['products']: product['search_url'] = url if individual_products <= product_count: print("Saving Product: %s"%product['title'].encode('utf8')) json.dump(product,outfile) outfile.write("\n") # sleep(5) else: print("We are done taking {} number of products".format(individual_products)) return individual_products += 1 # Create an Extractor by reading from the YAML file e = Extractor.from_yaml_file('search_results.yml') # Write to output files write_to_file()
import requests
from time import sleep
from selectorlib import Extractor

# ... (fragment: the opening entries of the headers dict are truncated)
headers = {
    'cache-control': 'no-cache',
    'dnt': '1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'sec-fetch-site': 'none',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-dest': 'document',
    'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
}

# Nvidia URLs
nvidia_url = 'https://www.nvidia.com/en-us/geforce/graphics-cards/30-series/rtx-3080/'
nvidia_yml = Extractor.from_yaml_file('NvidiaChecker.yml')

# Newegg URLs
newegg_url = 'https://www.newegg.com/p/pl?d=rtx+3080&N=100007709&isdeptsrh=1'
newegg_yml = Extractor.from_yaml_file('NeweggChecker.yml')

# BestBuy URLs
bestbuy_url = 'https://www.bestbuy.com/site/nvidia-geforce-rtx-3080-10gb-gddr6x-pci-express-4-0-graphics-card-titanium-and-black/6429440.p?skuId=6429440'
bestbuy_yml = Extractor.from_yaml_file('BestBuyChecker.yml')


def nvidia_html_check():
    nvidia_html = requests.get(nvidia_url, headers=headers)
    t = 1
    if nvidia_html.status_code != 200:
        while nvidia_html.status_code != 200:
            # Assumed completion of the truncated retry loop: back off briefly,
            # then request the page again, counting attempts in t
            sleep(t)
            nvidia_html = requests.get(nvidia_url, headers=headers)
            t += 1