from selectorlib import Extractor
import requests
import json
from time import sleep

# Create an Extractor by reading from the YAML file
e = Extractor.from_yaml_file('resources/search_results.yml')


def scrape(url):
    headers = {
        'authority': 'www.amazon.com',
        'pragma': 'no-cache',
        'cache-control': 'no-cache',
        'dnt': '1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'none',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-dest': 'document',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }
    # Download the page using requests
    print("Downloading %s" % url)
    r = requests.get(url, headers=headers)
    # Simple check to see if the page was blocked (usually 503)
    if r.status_code > 500:
        if "To discuss automated access to Amazon data please contact" in r.text:
            print("Page %s was blocked by Amazon. Please try using better proxies\n" % url)
        else:
            print("Page %s must have been blocked by Amazon as the status code was %d" % (url, r.status_code))
        return None
    # Pass the HTML of the page to the extractor and return the data
    return e.extract(r.text)
from selectorlib import Extractor
import requests


def amazonScrape(hashtag, key):
    tmp1 = []
    tmp3 = []
    # Create an Extractor by reading from the YAML file
    e = Extractor.from_yaml_file('scripts/search_results.yml')
    headers = {
        'dnt': '1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '******',
        'sec-fetch-dest': 'document',
        'referer': 'https://www.amazon.in/',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }
    # Download the page using requests
    url = "https://www.amazon.in/s?k=" + hashtag
    print("Downloading %s" % url)
    r = requests.get(url, headers=headers, timeout=28)
    # Simple check to see if the page was blocked (usually 503)
    if r.status_code > 500:
        if "To discuss automated access to Amazon data please contact" in r.text:
            print("Page %s was blocked by Amazon. Please try using better proxies\n" % url)
        else:
            print("Page %s must have been blocked by Amazon as the status code was %d" % (url, r.status_code))
        print(r.status_code)
        return tmp1, tmp3
    # Extract once and reuse the result instead of re-parsing the page three times
    data = e.extract(r.text)
    if data is None:
        print("None!")
        return tmp1, tmp3
    if data['products'] is not None:
        counter = 0
        for product in data['products']:
            # Keep at most four products
            if counter > 3:
                break
            if product['price'] is None:
                print("Price is NONE")
            else:
                print(counter, "price NOT NONE")
                price = int(float(product['price'][1:].replace(',', '')))
                print(price)
                if price >= 500:
                    product_url = "https://www.amazon.in" + product['url']
                    counter += 1
                    print(product_url)
                    tmp3.append(product_url)
                    tmp1.append(key)
    return tmp1, tmp3
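# A minimal usage sketch for amazonScrape: the two returned lists are parallel,
# so they zip into (key, product_url) pairs. The search term and key below are
# placeholder assumptions, not values from the original script.
keys, urls = amazonScrape("wireless+mouse", "mouse")
for k, u in zip(keys, urls):
    print(k, u)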
from selectorlib import Extractor
import requests
import json
import argparse

argparser = argparse.ArgumentParser()
argparser.add_argument('url', help='Amazon Product Details URL')

# Create an Extractor by reading from the YAML file
e = Extractor.from_yaml_file('selectors.yml')

user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'
headers = {'User-Agent': user_agent}

# Download the page using requests
args = argparser.parse_args()
r = requests.get(args.url, headers=headers)

# Pass the HTML of the page to the extractor and pull out the data
data = e.extract(r.text)

# Print the data
print(json.dumps(data, indent=True))
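# The scripts in this collection all load their selectors from YAML files that
# are not shown here. A minimal sketch of what such a file could contain,
# inlined via Extractor.from_yaml_string; the CSS selectors below are
# illustrative assumptions, not the originals:
example_extractor = Extractor.from_yaml_string("""
name:
    css: "#productTitle"
    type: Text
price:
    css: ".a-price .a-offscreen"
    type: Text
""")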
from selectorlib import Extractor
import requests
import json
import time
import csv

e = Extractor.from_yaml_file(
    '/home/avi/Documents/work/price_tracker/Tracker/selector.yml')


def scrape_(url):
    headers = {
        'authority': 'www.amazon.com',
        'pragma': 'no-cache',
        'cache-control': 'no-cache',
        'dnt': '1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'none',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-dest': 'document',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }
    print("Downloading ", url)
    r = requests.get(url, headers=headers)
    if r.status_code > 500:
        if "To discuss automated access to Amazon data please contact" in r.text:
            print("Page %s was blocked by Amazon. Please try using better proxies\n" % url)
"document", "Sec-Fetch-Mode": "navigate", "Sec-Fetch-Site": "none", "Sec-Fetch-User": "******", "Upgrade-Insecure-Requests": "1", "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36 OPR/71.0.3770.148" } ] # Create an Extractor by reading from the YAML file e = Extractor.from_yaml_file('selectors_review.yml') def scrape(url, counter): # If you want more you can loop through page numbers url = "https://www.amazon.com" + url.rstrip() + "&pageNumber=" + str( counter) headers = random.choice(headers_list) # Download the page using requests print("Downloading %s" % url) current_proxy = get_proxy() try: r = requests.get(url, headers=headers, proxies=current_proxy) except: print("Connection Refused") return None
import requests
from selectorlib import Extractor
from bs4 import BeautifulSoup

# Create an Extractor by reading from the YAML file
e_product = Extractor.from_yaml_file('selectors.yml')


def scrape_search(url):
    r = requests.get(url)
    # Simple check to see if the page was blocked (usually 503)
    if r.status_code > 500:
        return []
    # Pass the HTML of the page to BeautifulSoup and collect product links
    soup = BeautifulSoup(r.text, "html.parser")
    links = []
    for link in soup.select('a.a-link-normal.a-text-normal'):
        link_str = link.get('href')
        if link_str:
            links.append(link_str)
    return links


def scrape_product(url):
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
        'Accept-Language': 'en-US, en;q=0.5'
    }
    r = requests.get(url, headers=HEADERS)
    # Simple check to see if the page was blocked (usually 503)
    if r.status_code > 500:
        return None
    return e_product.extract(r.text)
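# A minimal usage sketch chaining the two helpers: fetch the search-result
# links, then extract details for the first product. The search URL is a
# placeholder assumption, and the links are assumed to be relative paths that
# need the domain prefixed, as in the other scripts in this collection.
search_links = scrape_search("https://www.amazon.com/s?k=headphones")
if search_links:
    first_product = scrape_product("https://www.amazon.com" + search_links[0])
    print(first_product)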
from selectorlib import Extractor
from fake_useragent import UserAgent
import requests
import json
from time import sleep

# Create an Extractor by reading from the YAML file
e = Extractor.from_yaml_file('products.yml')


def scrape(url):
    ua = UserAgent()
    headers = {
        'dnt': '1',
        'upgrade-insecure-requests': '1',
        'user-agent': ua.random,
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '******',
        'sec-fetch-dest': 'document',
        'referer': 'https://www.amazon.com/',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }
    # Download the page using requests
    print("Downloading %s" % url)
    r = requests.get(url, headers=headers)
from selectorlib import Extractor
import requests
import json
from time import sleep
from fake_useragent import UserAgent

# Create an Extractor by reading from the YAML file
e = Extractor.from_yaml_file('scrape_monitor/search.yml')


def scrape(url):
    ua = UserAgent()
    headers = {
        'dnt': '1',
        'upgrade-insecure-requests': '1',
        'user-agent': ua.random,
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '******',
        'sec-fetch-dest': 'document',
        'referer': 'https://www.amazon.com/',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }
    # Download the page using requests
    print("Downloading %s" % url)
    r = requests.get(url, headers=headers)
from selectorlib import Extractor
import requests
import json
import argparse

argparser = argparse.ArgumentParser()
argparser.add_argument('url', help='Amazon Product Details URL')

# Create an Extractor by reading from the YAML file
e = Extractor.from_yaml_file('echo_dot_sandstone.yml')

user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'
headers = {'User-Agent': user_agent}

# Download the page using requests
args = argparser.parse_args()
r = requests.get(args.url, headers=headers)

# Pass the HTML of the page to the extractor and pull out the data
data = e.extract(r.text)

# Print the data
print(json.dumps(data, indent=True))
import requests
import json
from selectorlib import Extractor
import argparse

argparser = argparse.ArgumentParser()
argparser.add_argument('url', help='Amazon Product Details URL')

# Create an Extractor by reading from the YAML file
extd = Extractor.from_yaml_file('amazon.yml')

browser = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36'
headers = {'User-Agent': browser}
args = argparser.parse_args()

# Download the webpage with requests
req = requests.get(args.url, headers=headers)

# Extract the data and print it as JSON
res = extd.extract(req.text)
print(json.dumps(res, indent=True))
from time import sleep
from datetime import datetime
import requests
import yaml
import re

# Create your views here.
from django.shortcuts import render  # needed by index() below
from django.http import HttpResponse, JsonResponse
from django.core.files.storage import FileSystemStorage
from django.template.loader import render_to_string
from django.urls import reverse
from django.contrib import messages
from selectorlib import Extractor
from polls.models import Output
from polls.forms import OutputForm

e = Extractor.from_yaml_file('polls/selectors.yml')


def index(request):
    return render(request, 'polls/base.html', {})


def is_valid_queryparam(param):
    return param != '' and param is not None


def output(request):
    if request.method == 'GET' and 's_bt' in request.GET:
        alldata = Output.objects.all()
        brand_name_query = request.GET.get('brand_name')
        cpu_exact_query = request.GET.get('cpu_exact')
from selectorlib import Extractor
import requests
import json
from time import sleep
from fake_useragent import UserAgent

# Create an Extractor by reading from the YAML file
e = Extractor.from_yaml_file('scrape_keyboard/search.yml')


def scrape(url):
    ua = UserAgent()
    headers = {
        'dnt': '1',
        'upgrade-insecure-requests': '1',
        'user-agent': ua.random,
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '******',
        'sec-fetch-dest': 'document',
        'referer': 'https://www.amazon.com/',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }
    # Download the page using requests
    print("Downloading %s" % url)
    r = requests.get(url, headers=headers)
from selectorlib import Extractor
import requests
import json
import argparse

argparser = argparse.ArgumentParser()
argparser.add_argument('url', help='Amazon Product Details URL')

# Create an Extractor by reading from the YAML file
e = Extractor.from_yaml_file('TribitSpeaker.txt')

user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246'
headers = {'User-Agent': user_agent}

# Download the page using requests
args = argparser.parse_args()
r = requests.get(args.url, headers=headers)

# Pass the HTML of the page to the extractor and pull out the data
data = e.extract(r.text)

# Print the data
# print(json.dumps(data, indent=True))
for item, detail in data.items():
    print(item, ":", detail)

# price = int(str(data['price'])[2:-3].replace(",", ""))
# print(price)
# if price < 8000:
from selectorlib import Extractor
import requests
import json
from json2html import *
from time import sleep
import streamlit as st
import pandas as pd
import streamlit_theme as stt

stt.set_theme({'primary': '#1b3388'})
st.title('Smart Price app')

# Create an Extractor by reading from the YAML file
e = Extractor.from_yaml_file('Testing.yml')
amazon_ext = Extractor.from_yaml_file('search_results.yml')
# pd.set_option('display.max_colwidth', None)


def scrape(url, query):
    headers = {
        'dnt': '1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '******',
        'sec-fetch-dest': 'document',
        'referer': 'https://www.amazon.com/',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }
def extractor_update(self):
    if not self._extractor_update_:
        self._extractor_update_ = Extractor.from_yaml_file(
            self.path_update)
    return self._extractor_update_
def scrape():
    e1 = Extractor.from_yaml_file('search_result.yml')
    e2 = Extractor.from_yaml_file('flip_results.yml')
    text = request.form.get("search_bar")
    print(text)
    url1 = "https://www.amazon.in/s?k={0}&rh=n%3A1375424031&ref=nb_sb_noss".format(text)
    url2 = "https://www.flipkart.com/search?q={0}&sid=6bo%2Cb5g&as=on&as-show=on&otracker=AS_QueryStore_HistoryAutoSuggest_1_7_na_na_na&otracker1=AS_QueryStore_HistoryAutoSuggest_1_7_na_na_na&as-pos=1&as-type=HISTORY&suggestionId=macbook+pro%7CLaptops&requestId=4b1460e8-fcf5-4369-a655-a2501be025a8&as-backfill=on".format(text)
    r1 = requests.get(url1, headers=headers1)
    r2 = requests.get(url2, headers=headers2)
    sleep(2)
    data1 = e1.extract(r1.text)
    data2 = e2.extract(r2.text)

    product_title1 = []
    product_price1 = []
    product_img1 = []
    product_url1 = []
    product_title2 = []
    product_price2 = []
    product_img2 = []
    product_url2 = []

    for product1 in data1.values():
        for item1 in product1:
            product_title1.append(item1['title'])
            product_price1.append(item1['price'])
            product_img1.append(item1['image'])
            new_url1 = 'https://www.amazon.in' + item1['url']
            product_url1.append(new_url1)

    asyncio.set_event_loop(asyncio.SelectorEventLoop())
    data3 = asyncio.get_event_loop().run_until_complete(
        add_images_urls(data2, url2))
    # data3 = asyncio.run(add_images_urls(data2, url2))
    # data3 = await add_images_urls(data2, url2)
    # data3 = loop.run_until_complete(add_images_urls(data2, url2))
    # data3 = add_images_urls(data2, url2)

    for product2 in data3.values():
        for item2 in product2:
            product_title2.append(item2['title'])
            product_price2.append(item2['price'])
            product_img2.append(item2['image'])
            product_url2.append(item2['url'])
            # new_url2 = 'https://www.flipkart.com' + item2['url']
            # product_url2.append(new_url2)

    # session = HTMLSession()
    # response = session.get(url2)
    # response.html.render(sleep=1, scrolldown=20)
    # # Container for each product being displayed
    # div = response.html.find('._1UoZlX')
    # for image in div:
    #     img = image.find('img', first=True)
    #     img_src = img.attrs['src']
    #     product_img2.append(img_src)

    return render_template("index2.html",
                           title1=product_title1,
                           price1=product_price1,
                           img1=product_img1,
                           url1=product_url1,
                           title2=product_title2,
                           price2=product_price2,
                           img2=product_img2,
                           url2=product_url2)
from selectorlib import Extractor
import requests
import json
from time import sleep
from fake_useragent import UserAgent

# Create an Extractor by reading from the YAML file
e = Extractor.from_yaml_file('C:/Users/deepa/Desktop/Beginning/search.yml')


def scrape(url):
    ua = UserAgent()
    headers = {
        'dnt': '1',
        'upgrade-insecure-requests': '1',
        'user-agent': ua.random,
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '******',
        'sec-fetch-dest': 'document',
        'referer': 'https://www.amazon.com/',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }
    # Download the page using requests
    print("Downloading %s" % url)
    r = requests.get(url, headers=headers)
from selectorlib import Extractor
from fake_useragent import UserAgent
import requests
import json
from time import sleep

# Create an Extractor by reading from the YAML file
e = Extractor.from_yaml_file('scrape_mouse/products.yml')


def scrape(url):
    ua = UserAgent()
    headers = {
        'dnt': '1',
        'upgrade-insecure-requests': '1',
        'user-agent': ua.random,
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '******',
        'sec-fetch-dest': 'document',
        'referer': 'https://www.amazon.com/',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }
    # Download the page using requests
    print("Downloading %s" % url)
    r = requests.get(url, headers=headers)
from selectorlib import Extractor
import requests
import json
from time import sleep

# Create an Extractor by reading from the YAML file
# e = Extractor.from_yaml_file('selectors.yml')
e = Extractor.from_yaml_file('feynman.yml')


def scrape(url):
    headers = {
        'authority': 'www.amazon.com',
        'pragma': 'no-cache',
        'cache-control': 'no-cache',
        'dnt': '1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'none',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-dest': 'document',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }
    # Download page using requests
    print("Downloading %s" % url)
    r = requests.get(url, headers=headers)
    # Quick check to see if page was blocked (usually 503)
    if r.status_code > 500:
        if "To discuss automated access to Amazon data please contact" in r.text:
            print("Page %s was blocked by Amazon. Please try using better proxies\n" % url)
        else:
            print("Page %s must have been blocked by Amazon as the status code was %d" % (url, r.status_code))
        return None
def model(request):
    with open('items.txt') as fhandle:
        text = fhandle.read()
    li = text.split(',')
    d = dict()
    d["data"] = li

    l = []
    model = request.GET.get('model')
    queryset = str(deviceDetails.objects.get(pk=model))
    l = queryset.split('|||')[3]

    data = Comments.objects.filter(
        mobile=get_object_or_404(deviceDetails, pk=model))
    names = []
    dates = []
    comment_text = []
    p_key = []
    vote_count = []
    up_voted = []
    down_voted = []
    delete_right = []
    for row in data:
        p_key.append(row.pk)
        temp = str(row).split('---')
        names.append(temp[-3])
        dates.append(temp[-1])
        comment_text.append(temp[-2].split("||||"))
        if int(temp[-4]) == 0:
            vote_count.append(0)
        elif int(temp[-4]) < 0:
            vote_count.append(temp[-4])
        else:
            vote_count.append('+' + temp[-4])
        if request.session.get('user_name', 0) != 0:
            if temp[-3] == request.session.get('user_name', 0):
                delete_right.append(1)
            else:
                delete_right.append(0)
            OB = Votes.objects.filter(
                username=get_object_or_404(
                    UserData, user_name=request.session['user_name']),
                comment=get_object_or_404(Comments, pk=row.pk))
            if len(OB):
                # There will always be exactly one row if they have already voted
                for vote_row in OB:
                    temp1 = str(vote_row).split('---')
                    if temp1[-2] == '1':
                        up_voted.append(1)
                        down_voted.append(0)
                    else:
                        down_voted.append(1)
                        up_voted.append(0)
            else:
                up_voted.append(0)
                down_voted.append(0)
        else:
            up_voted.append(0)
            down_voted.append(0)
            delete_right.append(0)
    result = zip(names, dates, comment_text, p_key, vote_count, up_voted,
                 down_voted, delete_right)

    mobile_name = queryset.split('|||')[1]
    url = "http://flipkart.com/search?q=" + '%20'.join(mobile_name.split())
    data = requests.get(url).text
    soup = BeautifulSoup(data, 'lxml')
    rom = [
        ', 8 GB', ', 16 GB', ', 32 GB', ', 64 GB', ', 128 GB', ', 256 GB',
        ', 512 GB', ', 1024 GB', ', 2048 GB'
    ]
    variant = []
    price = []
    flipkart_url = []
    status = []
    stars = []
    ratings = []
    reviews = []
    for item in soup.find_all('a', class_="_1fQZEK"):
        name = item.find('div', class_="_4rR01T").text
        temp = name
        if '(' in name:
            name = name[:name.index('(')]
        if name[:-1] == mobile_name:
            for gb in rom:
                if gb in temp:
                    rs = item.find('div', class_="_30jeq3").text
                    current_status = "available"
                    try:
                        current_status = item.find('div', class_="_3G6awp").text
                    except AttributeError:
                        pass
                    variant.append(temp)
                    price.append(rs[1:])
                    star = item.find('div', class_="_3LWZlK").text
                    stars.append(star)
                    rating = item.find('span', class_="_2_R_DZ").span.span.text
                    ratings.append(rating)
                    print(rating)
                    # The review count sits just before the word 'Reviews';
                    # walk backwards through the raw tag text to collect it
                    text = str(item.find('span', class_="_2_R_DZ").span)
                    i = text.find('Reviews') - 2
                    review = ""
                    while text[i] != '>':
                        review += text[i]
                        i -= 1
                    review = review[::-1] + ' Reviews'
                    review = review.strip()
                    reviews.append(review)
                    status.append(current_status)
                    flipkart_url.append("https://www.flipkart.com" + item['href'])
                    break
    result1 = zip(variant, price, status, flipkart_url, stars, ratings, reviews)

    with open("home/temp.txt", "r") as f:
        data = f.readlines()
    e = Extractor.from_yaml_file('home/Amazon_selector.yml')
    headers = {
        'dnt': '1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '******',
        'sec-fetch-dest': 'document',
        'referer': 'https://www.amazon.com/',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }
    prefix = 'https://www.amazon.in/s?k='
    device_name = mobile_name
    suffix = '&rh=n%3A1805560031&ref=nb_sb_noss'
    url = prefix + '+'.join(device_name.split()) + suffix
    print(url)
    r = requests.get(url, headers=headers)
    data1 = e.extract(r.text)
    # Retry until the search results actually come back
    while data1['result'] is None:
        r = requests.get(url, headers=headers)
        data1 = e.extract(r.text)
    temp = data1['result']
    amazon_names = []
    amazon_prices = []
    amazon_ratings = []
    amazon_totalratings = []
    amazon_urls = []
    for device in temp:
        try:
            if device_name in device['name'] and 'Case' not in device[
                    'name'] and 'case' not in device['name']:
                print(device['name'] + ' - ' + device['price'][1:] +
                      ' - https://amazon.in' + device['url'] + ' - ' +
                      device['rating'] + ' - ' + device['total_ratings'])
                amazon_names.append(device['name'])
                amazon_prices.append(device['price'][1:])
                amazon_ratings.append(device['rating'])
                amazon_totalratings.append(device['total_ratings'])
                amazon_urls.append('https://amazon.in' + device['url'])
        except (KeyError, TypeError):
            pass
    result2 = zip(amazon_names, amazon_prices, amazon_ratings,
                  amazon_totalratings, amazon_urls)

    if request.session.get('user_name', 0) != 0:
        username = get_object_or_404(UserData,
                                     user_name=request.session['user_name'])
        email_verified = username.email_verified
        return render(
            request, "home/view.html", {
                'title': 'Price Comparator | ' + device_name,
                'email_verified': email_verified,
                'list': dumps(d),
                'Amazon_result': result2,
                'Flipkart_result': result1,
                'count': len(names),
                'result': result,
                'pk': model,
                'login_flag': True,
                'user_name': request.session['user_name'],
                'name': queryset.split('|||')[1],
                'image_url': queryset.split('|||')[2],
                'spec': l.split('---')
            })
    else:
        return render(
            request, "home/view.html", {
                'title': 'Price Comparator | ' + device_name,
                'login_flag': False,
                'list': dumps(d),
                'Amazon_result': result2,
                'Flipkart_result': result1,
                'count': len(names),
                'result': result,
                'pk': model,
                'name': queryset.split('|||')[1],
                'image_url': queryset.split('|||')[2][:-1],
                'spec': l.split('---')
            })
from selectorlib import Extractor
import requests
import json
from time import sleep
from fake_useragent import UserAgent

# Create an Extractor by reading from the YAML file
e = Extractor.from_yaml_file('search.yml')


def scrape(url):
    ua = UserAgent()
    headers = {
        'dnt': '1',
        'upgrade-insecure-requests': '1',
        'user-agent': ua.random,
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '******',
        'sec-fetch-dest': 'document',
        'referer': 'https://www.amazon.com/',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }
    # Download the page using requests
    print("Downloading %s" % url)
    r = requests.get(url, headers=headers)
import os
import sys

C = os.path.abspath(os.path.dirname(__file__))

from selectorlib import Extractor
import requests
from time import sleep
import csv
from datetime import datetime, timedelta
from pprint import pprint

# Create an Extractor by reading from the YAML file
e = Extractor.from_yaml_file(os.path.join(C, 'booking.yml'))


def scrape(url):
    headers = {
        'Connection': 'keep-alive',
        'Pragma': 'no-cache',
        'Cache-Control': 'no-cache',
        'DNT': '1',
        'Upgrade-Insecure-Requests': '1',
        # You may want to change the user agent if you get blocked
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Referer': 'https://www.booking.com/index.en-gb.html',
        'Accept-Language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }
    # Download the page using requests
    print("Downloading %s" % url)
    r = requests.get(url, headers=headers)
"Features", "Type", "URL", "URL Stays", "Beds", "Bedrooms", "Sleeps", "Map", "Description", "ID", "Best For", "Category" ] # Create an Extractor by reading from the YAML file e = Extractor.from_yaml_file('properties.yml') p = Extractor.from_yaml_file('property.yml') def scrape(url): headers = { 'Connection': 'keep-alive', 'Pragma': 'no-cache', 'Cache-Control': 'no-cache', 'DNT': '1', 'Upgrade-Insecure-Requests': '1', # You may want to change the user agent if you get blocked 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', 'Referer': 'https://www.booking.com/index.en-gb.html', 'Accept-Language': 'en-GB,en-US;q=0.9,en;q=0.8',
def product(name):
    url = "https://www.amazon.in/s?k=" + name
    e = Extractor.from_yaml_file('search.yml')
    # headers is defined elsewhere in the module this fragment comes from
    r = requests.get(url, headers=headers)
    data = e.extract(r.text)
    return data
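# A minimal usage sketch for product(). The search term is a placeholder
# assumption; the shape of the returned dict depends on what search.yml defines.
import json

result = product("laptop")
print(json.dumps(result, indent=True))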
from time import sleep
from datetime import datetime, timedelta
from timeit import default_timer as timer
import pandas as pd
import requests
import json
import re
from pandas import json_normalize
from selectorlib import Extractor

# Create an Extractor by reading from the YAML file
e = Extractor.from_yaml_file('venv/data/search_results.yml')


def scrape(url):
    headers = {
        'dnt': '1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '******',
        'sec-fetch-dest': 'document',
        'referer': 'https://www.amazon.com/',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }
def extractor_skus(self):
    if not self._extractor_skus_:
        self._extractor_skus_ = Extractor.from_yaml_file(self.path_skus)
    return self._extractor_skus_
from selectorlib import Extractor
import requests
import json
from time import sleep

e = Extractor.from_yaml_file("selectors.yml")


def scrape(url):
    headers = {
        'authority': 'www.amazon.com',
        'pragma': 'no-cache',
        'cache-control': 'no-cache',
        'dnt': '1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'none',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-dest': 'document',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }
    # Download the page using requests
    print("Downloading %s" % url)
    r = requests.get(url, headers=headers)
    # Simple check to see if the page was blocked
    if r.status_code > 500:
        if "To discuss automated access to Amazon data please contact" in r.text:
            print("Page %s was blocked by Amazon. Please try using better proxies\n" % url)
def extractor_new(self):
    if not self._extractor_new_:
        self._extractor_new_ = Extractor.from_yaml_file(self.path_new)
    return self._extractor_new_
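# The three extractor_* methods in this collection share a lazy-initialization
# pattern: build the Extractor on first call, cache it on the instance, and
# return the cached object afterwards. A minimal sketch of the class state
# they rely on; the class name and constructor are assumptions, not from the
# original:
from selectorlib import Extractor


class ExtractorCache:
    def __init__(self, path_update, path_skus, path_new):
        # YAML selector files, one per extractor
        self.path_update = path_update
        self.path_skus = path_skus
        self.path_new = path_new
        # Caches start empty; the extractor_* methods fill them on first use
        self._extractor_update_ = None
        self._extractor_skus_ = None
        self._extractor_new_ = None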
else: print("Page %s must have been blocked by Amazon as the status code was %d"%(url,r.status_code)) return None # Pass the HTML of the page and create return e.extract(r.text) def write_to_file(): with open("search_results_urls.txt",'r') as urllist, open('search_results_output.json','w') as outfile: count_products = 0 individual_products = 1 for url in urllist.read().splitlines(): data = scrape(url) if data: count_products += len(data['products']) for product in data['products']: product['search_url'] = url if individual_products <= product_count: print("Saving Product: %s"%product['title'].encode('utf8')) json.dump(product,outfile) outfile.write("\n") # sleep(5) else: print("We are done taking {} number of products".format(individual_products)) return individual_products += 1 # Create an Extractor by reading from the YAML file e = Extractor.from_yaml_file('search_results.yml') # Write to output files write_to_file()
import requests
from time import sleep
from selectorlib import Extractor

# ... (fragment: the opening entries of the headers dict are truncated)
headers = {
    'cache-control': 'no-cache',
    'dnt': '1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'sec-fetch-site': 'none',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-dest': 'document',
    'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
}

# Nvidia URLs
nvidia_url = 'https://www.nvidia.com/en-us/geforce/graphics-cards/30-series/rtx-3080/'
nvidia_yml = Extractor.from_yaml_file('NvidiaChecker.yml')

# Newegg URLs
newegg_url = 'https://www.newegg.com/p/pl?d=rtx+3080&N=100007709&isdeptsrh=1'
newegg_yml = Extractor.from_yaml_file('NeweggChecker.yml')

# BestBuy URLs
bestbuy_url = 'https://www.bestbuy.com/site/nvidia-geforce-rtx-3080-10gb-gddr6x-pci-express-4-0-graphics-card-titanium-and-black/6429440.p?skuId=6429440'
bestbuy_yml = Extractor.from_yaml_file('BestBuyChecker.yml')


def nvidia_html_check():
    nvidia_html = requests.get(nvidia_url, headers=headers)
    t = 1
    if nvidia_html.status_code != 200:
        while nvidia_html.status_code != 200:
            # Assumed completion of the truncated retry loop: back off briefly,
            # then request the page again, counting attempts in t
            sleep(t)
            nvidia_html = requests.get(nvidia_url, headers=headers)
            t += 1