Code Example #1
from selectorlib import Extractor
from fake_useragent import UserAgent
import requests
import json
from time import sleep

# Create an Extractor by reading from the YAML file
e = Extractor.from_yaml_file('scrape_mouse/products.yml')

def scrape(url):

    ua = UserAgent()


    headers = {
        'dnt': '1',
        'upgrade-insecure-requests': '1',
        'user-agent': ua.random,
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '******',
        'sec-fetch-dest': 'document',
        'referer': 'https://www.amazon.com/',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }

    # Download the page using requests
    print("Downloading %s"%url)
Code Example #2
File: search.py Project: shekharbiswas/ScrapeAmazon
import selectorlib
from selectorlib import Extractor
import requests
import json
from time import sleep
from fake_useragent import UserAgent


# Create an Extractor by reading from the YAML file
e = Extractor.from_yaml_file('search.yml')


def scrape(url):
    ua = UserAgent()

    headers = {
        'dnt': '1',
        'upgrade-insecure-requests': '1',
        'user-agent': ua.random,
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '******',
        'sec-fetch-dest': 'document',
        'referer': 'https://www.amazon.com/',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }

    # Download the page using requests
    print("Downloading %s" % url)
    r = requests.get(url, headers=headers)
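The otherwise unused json and sleep imports hint at the usual driver around scrape(): read search URLs from a text file, extract each page, and append the results as JSON lines with a pause between requests. A sketch of such a driver (the file names, and the assumption that scrape() returns the extracted dict, are illustrative rather than taken from the original project):

if __name__ == '__main__':
    # hypothetical input/output files
    with open('search_results_urls.txt') as urllist, open('search_results_output.jsonl', 'w') as outfile:
        for url in urllist.read().splitlines():
            data = scrape(url)
            if data:
                json.dump(data, outfile)
                outfile.write('\n')
            # be polite to the site between requests
            sleep(5)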
Code Example #3
import requests
from selectorlib import Extractor

# Create an Extractor by reading from the YAML file
e = Extractor.from_yaml_file("booking.yml")


def create_url(people, place, datein, dateout, offset):

    url_list = []

    for i in range(1, offset, 25):

        url = (
            "https://www.booking.com/searchresults.en-gb.html?checkin_month={in_month}"
            "&checkin_monthday={in_day}&checkin_year={in_year}&checkout_month={out_month}"
            "&checkout_monthday={out_day}&checkout_year={out_year}&group_adults={people}"
            "&group_children=0&order=review_score_and_price&ss={place}&offset={offset}"
            .format(
                in_month=str(datein.month),
                in_day=str(datein.day),
                in_year=str(datein.year),
                out_month=str(dateout.month),
                out_day=str(dateout.day),
                out_year=str(dateout.year),
                people=people,
                place=place,
                offset=i,
            ))

        url_list.append(url)
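The snippet stops before the function ends; presumably it closes with return url_list. Booking.com paginates search results in steps of 25, which is why the loop advances the offset by 25. A small usage sketch (the argument values are made up):

from datetime import date

# two adults in London, check-in 10 Jan, check-out 12 Jan, result pages up to offset 100
urls = create_url(2, 'London', date(2026, 1, 10), date(2026, 1, 12), 100)
# range(1, 100, 25) yields offsets 1, 26, 51 and 76, i.e. four result pages
for u in urls:
    print(u)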
Code Example #4
def format_response(response):
    formatters = Formatter.get_all()
    extractor = Extractor.from_yaml_file('./scrapper.yaml',
                                         formatters=formatters)
    data = extractor.extract(response.text)
    return data
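Example #4 passes formatters into the Extractor. In selectorlib, formatters are small classes that post-process extracted text: they are declared by subclassing Formatter, collected with Formatter.get_all(), and referenced by class name from the YAML file. A sketch of what likely sits elsewhere in that module (the Price formatter is an invented illustration; depending on the selectorlib version, Formatter may need to be imported from selectorlib.formatter instead):

from selectorlib import Extractor, Formatter

class Price(Formatter):
    # e.g. '$1,299.00' -> 1299.0
    def format(self, text):
        return float(text.replace('$', '').replace(',', ''))

formatters = Formatter.get_all()
extractor = Extractor.from_yaml_file('./scrapper.yaml', formatters=formatters)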
Code Example #5
from selectorlib import Extractor
import requests 
import json 
from time import sleep


# Create an Extractor by reading from the YAML file
e = Extractor.from_yaml_file('selectors.yml')

def scrape(url):  

    headers = {
        'dnt': '1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '******',
        'sec-fetch-dest': 'document',
        'referer': 'https://www.amazon.com/',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }

    # Download the page using requests
    # print("Downloading %s"%url)
    r = requests.get(url, headers=headers)
    # Simple check to check if page was blocked (Usually 503)
    if r.status_code > 500:
        if "To discuss automated access to Amazon data please contact" in r.text:
            print("Page %s was blocked by Amazon. Please try using better proxies\n"%url)
Code Example #6
def scrape(url):
    e = Extractor.from_yaml_file("selectors.yml")
    ua = UserAgent()

    prod_id_regex = re.compile(r".*/([a-zA-Z0-9]{10})(?:[/?]|$).*")
    product_id = prod_id_regex.match(url).group(1)

    headers = {
        "authority": "www.amazon.com",
        "pragma": "no-cache",
        "cache-control": "no-cache",
        "dnt": "1",
        "upgrade-insecure-requests": "1",
        "user-agent": ua.random,
        "accept":
        "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "sec-fetch-site": "none",
        "sec-fetch-mode": "navigate",
        "sec-fetch-dest": "document",
        "accept-language": "en-GB,en-US;q=0.9,en;q=0.8",
    }

    r = requests.get(url, headers=headers)
    retries = 0
    while "captcha" in r.text and retries < 10:
        print("user agent failed, trying new one")
        headers["user-agent"] = ua.random
        r = requests.get(url, headers=headers)
        retries += 1

    if r.status_code > 500:
        if "To discuss automated access to Amazon data please contact" in r.text:
            print(
                "Page %s was blocked by Amazon. Please try using better proxies\n"
                % url)
        else:
            print(
                "Page %s must have been blocked by Amazon as the status code was %d"
                % (url, r.status_code))
        return

    # Pass the HTML of the page to the extractor and create the data dictionary
    data = e.extract(r.text)

    category = data["product_category"]
    images = data["product_images"][1:-1].split("],")
    images = [x.split(":[")[0][1:-1] for x in images]
    out_reviews = []

    for review in data["reviews"]:
        r = {}
        r["rating"] = float(review["rating"][:3])
        r["product_category"] = category
        r["verified"] = "N" if review["verified"] is None else "Y"
        r["review_text"] = review["content"]
        out_reviews.append(r)

    out_data = {}
    out_data["title"] = data["product_title"]
    out_data["id"] = product_id
    out_data["price"] = data["product_price"]
    out_data["image"] = images[-1]

    return out_reviews, out_data
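The regular expression at the top of Example #6 pulls the 10-character ASIN out of the product URL. A quick illustration with a made-up URL:

import re

prod_id_regex = re.compile(r".*/([a-zA-Z0-9]{10})(?:[/?]|$).*")
url = 'https://www.amazon.com/dp/B00EXAMPLE?ref=nb_sb_noss'  # hypothetical product URL
print(prod_id_regex.match(url).group(1))  # -> B00EXAMPLE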
Code Example #7
File: searchresults.py Project: bmwwebdesign/ama2
from selectorlib import Extractor
import requests
import json
from time import sleep

# Create an Extractor by reading from the YAML file
e = Extractor.from_yaml_file('search_results.yml')


def scrape(url):

    headers = {
        'dnt': '1',
        'upgrade-insecure-requests': '1',
        'user-agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
        'accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '******',
        'sec-fetch-dest': 'document',
        'referer': 'https://www.amazon.com/',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }

    # Download the page using requests
    print("Downloading %s" % url)
    r = requests.get(url, headers=headers)
    # Simple check to check if page was blocked (Usually 503)
    if r.status_code > 500:
Code Example #8
 def extractor_update(self):
     if not self._extractor_update_:
         self._extractor_update_ = Extractor.from_yaml_file(
             self.path_update)
     return self._extractor_update_
Code Example #9
File: search.py Project: DeepakMishraDA/Web-scraping
import selectorlib
from selectorlib import Extractor
import requests
import json
from time import sleep
from fake_useragent import UserAgent

# Create an Extractor by reading from the YAML file
e = Extractor.from_yaml_file('C:/Users/deepa/Desktop/Beginning/search.yml')


def scrape(url):
    ua = UserAgent()

    headers = {
        'dnt': '1',
        'upgrade-insecure-requests': '1',
        'user-agent': ua.random,
        'accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '******',
        'sec-fetch-dest': 'document',
        'referer': 'https://www.amazon.com/',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }

    # Download the page using requests
    print("Downloading %s" % url)
    r = requests.get(url, headers=headers)
Code Example #10
 def extractor_skus(self):
     if not self._extractor_skus_:
         self._extractor_skus_ = Extractor.from_yaml_file(self.path_skus)
     return self._extractor_skus_
Code Example #11
 def extractor_new(self):
     if not self._extractor_new_:
         self._extractor_new_ = Extractor.from_yaml_file(self.path_new)
     return self._extractor_new_
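Examples #8, #10 and #11 read like sibling methods of one class that lazily build and cache a separate Extractor per YAML file, so each file is parsed at most once. Assembled into context, the pattern would look roughly like this (the class name, __init__ and the @property decorators are assumptions; the snippets themselves do not show them):

from selectorlib import Extractor

class PageExtractors:
    def __init__(self, path_update, path_skus, path_new):
        self.path_update = path_update
        self.path_skus = path_skus
        self.path_new = path_new
        # caches start empty and are filled on first access
        self._extractor_update_ = None
        self._extractor_skus_ = None
        self._extractor_new_ = None

    @property
    def extractor_update(self):
        if not self._extractor_update_:
            self._extractor_update_ = Extractor.from_yaml_file(self.path_update)
        return self._extractor_update_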
Code Example #12
def product(name):
    url = "https://www.amazon.in/s?k=" + name
    e = Extractor.from_yaml_file('search.yml')
    r = requests.get(url, headers=headers)
    data = e.extract(r.text)
    return data
Code Example #13
import sys, os

C = os.path.abspath(os.path.dirname(__file__))

from selectorlib import Extractor
import requests 
from time import sleep
import csv
from datetime import datetime, timedelta
from pprint import pprint

# Create an Extractor by reading from the YAML file
e = Extractor.from_yaml_file(os.path.join(C, 'booking.yml'))

def scrape(url):    
    headers = {
        'Connection': 'keep-alive',
        'Pragma': 'no-cache',
        'Cache-Control': 'no-cache',
        'DNT': '1',
        'Upgrade-Insecure-Requests': '1',
        # You may want to change the user agent if you get blocked
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',

        'Referer': 'https://www.booking.com/index.en-gb.html',
        'Accept-Language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }

    # Download the page using requests
    print("Downloading %s"%url)
Code Example #14
def model(request):
    fhandle = open('items.txt')
    text = fhandle.read()
    fhandle.close()
    li = text.split(',')
    d = dict()
    d["data"] = li
    # print(request.GET.get('model'))
    l = []
    # result = []
    # print(request.GET.get('model'))
    model = request.GET.get('model')
    queryset = str(deviceDetails.objects.get(pk=model))
    # print("Queryset")
    # print(queryset)
    l = queryset.split('|||')[3]
    # result.append(queryset.split('---')[1])
    # result.append(l.split(','))
    # result.append(queryset.split('---')[3])
    # result = zip(queryset.split('---')[1],l.split(','),queryset.split('---')[3])

    data = Comments.objects.filter(
        mobile=get_object_or_404(deviceDetails, pk=model))
    names = []
    dates = []
    comment_text = []
    p_key = []
    vote_count = []
    up_voted = []
    down_voted = []
    delete_right = []

    for row in data:
        # print(row.pk)
        p_key.append(row.pk)
        temp = str(row).split('---')
        # print(temp)
        names.append(temp[-3])
        dates.append(temp[-1])
        comment_text.append(temp[-2].split("||||"))
        if int(temp[-4]) == 0:
            vote_count.append(0)
        elif int(temp[-4]) < 0:
            vote_count.append(temp[-4])
        else:
            vote_count.append('+' + temp[-4])

        if request.session.get('user_name', 0) != 0:
            if (temp[-3] == request.session.get('user_name', 0)):
                delete_right.append(1)
            else:
                delete_right.append(0)
            OB = Votes.objects.filter(username=get_object_or_404(
                UserData, user_name=request.session['user_name']),
                                      comment=get_object_or_404(Comments,
                                                                pk=row.pk))
            if len(OB):
                # there will always be exactly one row if he/she has already voted
                for vote_row in OB:
                    temp1 = str(vote_row).split('---')
                    if temp1[-2] == '1':
                        up_voted.append(1)
                        down_voted.append(0)
                    else:
                        down_voted.append(1)
                        up_voted.append(0)
            else:
                up_voted.append(0)
                down_voted.append(0)
        else:
            up_voted.append(0)
            down_voted.append(0)
            delete_right.append(0)

    result = zip(names, dates, comment_text, p_key, vote_count, up_voted,
                 down_voted, delete_right)

    mobile_name = queryset.split('|||')[1]
    # print(mobile_name)
    url = "http://flipkart.com/search?q=" + '%20'.join(mobile_name.split())
    # print(url)
    data = requests.get(url).text
    soup = BeautifulSoup(data, 'lxml')

    rom = [
        ', 8 GB', ', 16 GB', ', 32 GB', ', 64 GB', ', 128 GB', ', 256 GB',
        ', 512 GB', ', 1024 GB', ', 2048 GB'
    ]
    varient = []
    price = []
    flipkart_url = []
    status = []
    stars = []
    ratings = []
    reviews = []
    for item in soup.find_all('a', class_="_1fQZEK"):
        name = item.find('div', class_="_4rR01T").text
        temp = name
        if '(' in name:
            # print(name+'---')
            name = name[:name.index('(')]
            if name[:-1] == mobile_name:
                for gb in rom:
                    if gb in temp:
                        rs = item.find('div', class_="_30jeq3").text
                        current_status = "available"
                        try:
                            current_status = item.find('div',
                                                       class_="_3G6awp").text
                        except:
                            pass
                        varient.append(temp)
                        price.append(rs[1:])
                        star = item.find('div', class_="_3LWZlK").text
                        stars.append(star)
                        # print(item.find('div',class_="_3LWZlK").text)
                        rating = item.find('span',
                                           class_="_2_R_DZ").span.span.text
                        ratings.append(rating)
                        print(rating)
                        text = str(item.find('span', class_="_2_R_DZ").span)
                        # print(item.find('span',class_="_2_R_DZ").span)
                        i = text.find('Reviews') - 2
                        review = ""
                        while text[i] != '>':
                            review += text[i]
                            i -= 1
                        review = review[::-1] + ' Reviews'
                        review = review.strip()
                        reviews.append(review)
                        # print(reviews)
                        status.append(current_status)
                        flipkart_url.append("https://www.flipkart.com" +
                                            item['href'])
                        # print("https://www.flipkart.com"+item['href'])
                        # print(temp,gb,rs[1:],status,"https://www.flipkart.com"+item['href'])
                        break
    # for name,pr,st,link in zip(varient,price,status,flipkart_url):
    #     print(name+" - "+pr+' - '+st+' - '+link)

    result1 = zip(varient, price, status, flipkart_url, stars, ratings,
                  reviews)

    f = open("home/temp.txt", "r")
    data = f.readlines()

    e = Extractor.from_yaml_file('home/Amazon_selector.yml')
    headers = {
        'dnt': '1',
        'upgrade-insecure-requests': '1',
        'user-agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
        'accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '******',
        'sec-fetch-dest': 'document',
        'referer': 'https://www.amazon.com/',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }

    prefix = 'https://www.amazon.in/s?k='
    device_name = mobile_name
    suffix = '&rh=n%3A1805560031&ref=nb_sb_noss'

    url = prefix + '+'.join(device_name.split()) + suffix

    print(url)

    r = requests.get(url, headers=headers)
    data1 = e.extract(r.text)

    # print(data1)

    while data1['result'] is None:
        r = requests.get(url, headers=headers)
        data1 = e.extract(r.text)

    temp = data1['result']

    # print(temp)

    amazon_names = []
    amazon_prices = []
    amazon_ratings = []
    amazon_totalratings = []
    amazon_urls = []
    for device in temp:
        try:
            if device_name in device['name'] and 'Case' not in device[
                    'name'] and 'case' not in device['name']:
                print(device['name'] + ' - ' + device['price'][1:] +
                      ' - https://amazon.in' + device['url'] + ' - ' +
                      device['rating'] + ' - ' + device['total_ratings'])
                amazon_names.append(device['name'])
                amazon_prices.append(device['price'][1:])
                amazon_ratings.append(device['rating'])
                amazon_totalratings.append(device['total_ratings'])
                amazon_urls.append('https://amazon.in' + device['url'])
        except:
            pass

    result2 = zip(amazon_names, amazon_prices, amazon_ratings,
                  amazon_totalratings, amazon_urls)
    # print(amazon_names)
    # print(amazon_prices)
    # print(amazon_ratings)
    # print(amazon_totalratings)
    # print(amazon_urls)

    if request.session.get('user_name', 0) != 0:
        username = get_object_or_404(UserData,
                                     user_name=request.session['user_name'])
        email_verified = username.email_verified
        return render(
            request, "home/view.html", {
                'title': 'Price Comparator | ' + device_name,
                'email_verified': email_verified,
                'list': dumps(d),
                'Amazon_result': result2,
                'Flipkart_result': result1,
                'count': len(names),
                'result': result,
                'pk': model,
                'login_flag': True,
                'user_name': request.session['user_name'],
                'name': queryset.split('|||')[1],
                'image_url': queryset.split('|||')[2],
                'spec': l.split('---')
            })
    else:
        return render(
            request, "home/view.html", {
                'title': 'Price Comparator | ' + device_name,
                'login_flag': False,
                'list': dumps(d),
                'Amazon_result': result2,
                'Flipkart_result': result1,
                'count': len(names),
                'result': result,
                'pk': model,
                'name': queryset.split('|||')[1],
                'image_url': queryset.split('|||')[2][:-1],
                'spec': l.split('---')
            })
Code Example #15
from selectorlib import Extractor
import requests
import json
from time import sleep

e = Extractor.from_yaml_file("selectors.yml")


def scrape(url):
    headers = {
        'authority': 'www.amazon.com',
        'pragma': 'no-cache',
        'cache-control': 'no-cache',
        'dnt': '1',
        'upgrade-insecure-requests': '1',
        'user-agent':
        'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36',
        'accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'none',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-dest': 'document',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }

    # Download the page using requests
    print("Downloading %s" % url)
    r = requests.get(url, headers=headers)
    # Simple check to see if the page was blocked (usually 503)
    if r.status_code > 500:
        if "To discuss automated access to Amazon data please contact" in r.text:
Code Example #16
File: amazon.py Project: aleague888/azn-request
from selectorlib import Extractor
import requests
import json
from time import sleep

# Create an Extractor by reading from the YAML file
#e = Extractor.from_yaml_file('selectors.yml')
e = Extractor.from_yaml_file('feynman.yml')


def scrape(url):
    headers = {
        'authority': 'www.amazon.com',
        'pragma': 'no-cache',
        'cache-control': 'no-cache',
        'dnt': '1',
        'upgrade-insecure-requests': '1',
        'user-agent':
        'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36',
        'accept':
        'text/html, application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'none',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-dest': 'document',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }
    # Download page using requests
    print("Downloading %s" % url)
    r = requests.get(url, headers=headers)
    # Quick check to see if page was blocked (usually 503)
    if r.status_code > 500:
Code Example #17
#! python3
import requests, bs4, pyperclip, openpyxl, mechanize

from selectorlib import Extractor

clipboard = str(pyperclip.paste())
clipboard_list = clipboard.splitlines()

# initialize simulated chrome browser
chrome = mechanize.Browser()
chrome.set_handle_robots(False)
chrome.addheaders = [(
    'User-agent',
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36'
)]

# Create an Extractor by reading from the YAML file
e = Extractor.from_yaml_file('Amazon_Sellers.yml')

for asin in clipboard_list:
    asin_url = 'https://www.amazon.com/gp/offer-listing/' + asin + '/'
    # Yurl = 'http://api.scraperapi.com/?api_key=f0759471ae73755feab08ef9ac6e299d&url=' + asin_url

    res = chrome.open(asin_url)
    # selectorlib's Extractor expects an HTML string, so read and decode the mechanize response first
    html = res.read().decode('utf-8', errors='ignore')

    #asin_html = bs4.BeautifulSoup(html, 'html.parser')
    print(e.extract(html))
Code Example #18
from selectorlib import Extractor
from fake_useragent import UserAgent
import requests
import json
from time import sleep

# Create an Extractor by reading from the YAML file
e = Extractor.from_yaml_file('products.yml')


def scrape(url):

    ua = UserAgent()

    headers = {
        'dnt': '1',
        'upgrade-insecure-requests': '1',
        'user-agent': ua.random,
        'accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '******',
        'sec-fetch-dest': 'document',
        'referer': 'https://www.amazon.com/',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }

    # Download the page using requests
Code Example #19
import selectorlib
from selectorlib import Extractor
import requests
import json
from time import sleep
from fake_useragent import UserAgent

# Create an Extractor by reading from the YAML file
e = Extractor.from_yaml_file('scrape_smartphone/search.yml')


def scrape(url):
    ua = UserAgent()

    headers = {
        'dnt': '1',
        'upgrade-insecure-requests': '1',
        'user-agent': ua.random,
        'accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '******',
        'sec-fetch-dest': 'document',
        'referer': 'https://www.amazon.com/',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }

    # Download the page using requests
    print("Downloading %s" % url)
    r = requests.get(url, headers=headers)
Code Example #20
File: amazon.py Project: avitomar12/price_tracker
from selectorlib import Extractor
import requests
import json
import time
import csv
e = Extractor.from_yaml_file(
    '/home/avi/Documents/work/price_tracker/Tracker/selector.yml')


def scrape_(url):
    headers = {
        'authority': 'www.amazon.com',
        'pragma': 'no-cache',
        'cache-control': 'no-cache',
        'dnt': '1',
        'upgrade-insecure-requests': '1',
        'user-agent':
        'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36',
        'accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'none',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-dest': 'document',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }
    print("Downloading  ", url)
    r = requests.get(url, headers=headers)
    if r.status_code > 500:
        if "To discuss automated access to Amazon data please contact" in r.text:
            print(
                "Page %s was blocked by Amazon. Please try using better proxies\n"
Code Example #21
 def createExtractor(self):
     # Create an Extractor by reading from the YAML file
     e = Extractor.from_yaml_file('amazon-scraper/selectors.yml')
     return e
Code Example #22
File: scrape.py Project: inimist/scraper
    "Features",
    "Type",
    "URL",
    "URL Stays",
    "Beds",
    "Bedrooms",
    "Sleeps",
    "Map",
    "Description",
    "ID",
    "Best For",
    "Category"
]

# Create an Extractor by reading from the YAML file
e = Extractor.from_yaml_file('properties.yml')
p = Extractor.from_yaml_file('property.yml')

def scrape(url):
    headers = {
        'Connection': 'keep-alive',
        'Pragma': 'no-cache',
        'Cache-Control': 'no-cache',
        'DNT': '1',
        'Upgrade-Insecure-Requests': '1',
        # You may want to change the user agent if you get blocked
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',

        'Referer': 'https://www.booking.com/index.en-gb.html',
        'Accept-Language': 'en-GB,en-US;q=0.9,en;q=0.8',
Code Example #23
class AmazonScraper(WebshopScraper):
    name = "amazon_scraper"

    # Proxy
    API_KEY = "9244ba171bff5bb2139d5403c443ee87"
    scraper_proxy = "http://*****:*****@proxy-server.scraperapi.com:8001".format(
        API_KEY)

    # HTML extractor
    selector_file = 'webshop_scraper/amazon_selectors.yml'
    extractor = Extractor.from_yaml_file(selector_file)

    # HTML image regex
    image_pattern = re.compile(r'\'initial\': ' '(.*?)' r'\},\n', re.DOTALL)

    def __init__(self,
                 n_pages=None,
                 product_save_dir=None,
                 scraped_urls_file=None,
                 include_variants=True):
        super().__init__(n_pages=n_pages,
                         product_save_dir=product_save_dir,
                         scraped_urls_file=scraped_urls_file,
                         include_variants=include_variants)

    def get_start_urls(self):
        urls = [
            # "https://www.amazon.co.uk/s?rh=n%3A468292%2Cp_72%3A4-&pf_rd_i=468292&pf_rd_p=d40c144e-45ba-5915-b01d-d92bd82e9a59&pf_rd_r=9AHN48N59BT4GF71E1G8&pf_rd_s=merchandised-search-11&pf_rd_t=BROWSE", # toys
            # "https://www.amazon.co.uk/s?rh=n%3A117332031%2Cp_72%3A4-&pf_rd_i=117332031&pf_rd_p=4c8654cd-5980-5a4f-a532-3db1a3a6d579&pf_rd_r=AWW9M71158D9EAAAR8KB&pf_rd_s=merchandised-search-11&pf_rd_t=BROWSE", # beauty
            # "https://www.amazon.co.uk/s?i=sports&rh=n%3A461182031%2Cp_72%3A184323031&pf_rd_i=461182031&pf_rd_p=e9bb2e37-191c-532b-9180-73d951e30279&pf_rd_r=8R8CEN6NH34W6VM2610X&pf_rd_s=merchandised-search-11&pf_rd_t=BROWSE", # fit watches
            # "https://www.amazon.co.uk/s?rh=n%3A5866054031%2Cp_72%3A4-&pf_rd_i=5866054031&pf_rd_p=4ad30a04-262e-55f5-a315-4c86a63048cb&pf_rd_r=WTMSQNPE0Z6818GG7NMB&pf_rd_s=merchandised-search-11&pf_rd_t=BROWSE", # utilities / science
            # "https://www.amazon.com/s?i=specialty-aps&bbn=16225013011&rh=n%3A%2116225013011%2Cn%3A2975312011&ref=nav_em_0_2_14_2__nav_desktop_sa_intl_dogs", # dog
            # "https://www.amazon.co.uk/s?i=sports&rh=n%3A318949011%2Cp_72%3A184323031&pf_rd_i=318949011&pf_rd_p=b052e5ee-b3e8-5fa8-b467-a83cf0dcb513&pf_rd_r=JTXNEV8HM50Z5S18AEQ3&pf_rd_s=merchandised-search-11&pf_rd_t=BROWSE", # sports equipment
        ]

        return urls

    def get_product_pages(self, response):
        products = response.xpath(
            "//span[@class='rush-component' and @data-component-type='s-product-image']"
        ).xpath("a").xpath("@href").getall()
        products = ["/".join(url.split("/")[:4]) for url in products]
        products = [response.urljoin(url) for url in products]
        return products

    def get_product_info(self, response):
        html = response.text
        return self.extractor.extract(html)

    def get_product_image_urls(self, response):
        html = response.text
        try:
            data = self.image_pattern.search(html)
            images = data.group(1)
        except AttributeError:
            # no image carousel in page
            return []

        images = json.loads(images)
        images = [img["hiRes"] for img in images]
        return list(filter(None, images))

    def get_next_page_url(self, response):
        next_page_url = response.xpath(
            "//ul[@class='a-pagination']/li[@class='a-last']/a").xpath(
                "@href").get()
        next_page_url = response.urljoin(next_page_url)
        return next_page_url
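The image_pattern regex in Example #23 targets the inline JavaScript block on the product page that carries the image carousel data: it lazily captures whatever follows 'initial': up to the closing brace, and that capture is then parsed as JSON to collect the hiRes URLs. A self-contained illustration with a made-up HTML fragment (the page structure shown is only an assumption about what the regex expects):

import json
import re

image_pattern = re.compile(r"'initial': (.*?)\},\n", re.DOTALL)
# hypothetical fragment of the page's inline JavaScript holding the carousel data
html = "page js ... 'initial': [{\"hiRes\": \"https://m.media-amazon.com/images/I/example.jpg\", \"thumb\": null}]},\n more js"
images = json.loads(image_pattern.search(html).group(1))
print([img["hiRes"] for img in images])  # -> ['https://m.media-amazon.com/images/I/example.jpg']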
Code Example #24
        "document",
        "Sec-Fetch-Mode":
        "navigate",
        "Sec-Fetch-Site":
        "none",
        "Sec-Fetch-User":
        "******",
        "Upgrade-Insecure-Requests":
        "1",
        "User-Agent":
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36 OPR/71.0.3770.148"
    }
]

# Create an Extractor by reading from the YAML file
e = Extractor.from_yaml_file('selectors_review.yml')


def scrape(url, counter):
    # If you want more you can loop through page numbers
    url = "https://www.amazon.com" + url.rstrip() + "&pageNumber=" + str(
        counter)
    headers = random.choice(headers_list)
    # Download the page using requests
    print("Downloading %s" % url)
    current_proxy = get_proxy()
    try:
        r = requests.get(url, headers=headers, proxies=current_proxy)
    except requests.exceptions.RequestException:
        print("Connection Refused")
        return None
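Example #24 starts mid-file: headers_list, random and the get_proxy() helper are defined above the portion shown here. For requests, get_proxy() would be expected to return a proxies mapping; a sketch of such a helper (the proxy addresses are documentation placeholders, not real endpoints):

import random

def get_proxy():
    # hypothetical helper: pick a proxy and return it in the format requests expects
    proxy_pool = ['http://203.0.113.10:8080', 'http://203.0.113.11:8080']
    proxy = random.choice(proxy_pool)
    return {'http': proxy, 'https': proxy}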
Code Example #25
from selectorlib import Extractor
import requests
import re

e = Extractor.from_yaml_file('scraper_app/static/selectors/selectors.yml')


def scrape(url):

    headers = {
        'dnt': '1',
        'upgrade-insecure-requests': '1',
        'user-agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
        'accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '******',
        'sec-fetch-dest': 'document',
        'referer': 'https://www.amazon.com/',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }

    r = requests.get(url, headers=headers)

    if r.status_code > 500:
        if "To discuss automated access to Amazon data please contact" in r.text:
            print(
                "Page %s was blocked by Amazon. Please try using better proxies\n"
                % url)
Code Example #26
File: main.py Project: cesarirnan/uoc-TCVD-PRA1
from time import sleep

import pandas as pd
import requests
import json
import re

from pandas import json_normalize
from selectorlib import Extractor
from timeit import default_timer as timer
from datetime import timedelta
from datetime import datetime

# Create an Extractor by reading from the YAML file
e = Extractor.from_yaml_file('venv/data/search_results.yml')


def scrape(url):
    headers = {
        'dnt': '1',
        'upgrade-insecure-requests': '1',
        'user-agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
        'accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '******',
        'sec-fetch-dest': 'document',
        'referer': 'https://www.amazon.com/',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
Code Example #27
        "User-Agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
        "Accept":
        "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "Sec-Fetch-Site": "same-origin",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-User": "******",
        "Sec-Fetch-Dest": "document",
        "Referer": "https://www.google.com/",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-US,en;q=0.9"
    }
]

# Create an Extractor by reading from the YAML file
e = Extractor.from_yaml_file('selectors_product.yml')


def scrape(url):
    # If you want more you can loop through page numbers
    url = "https://www.amazon.com" + url.rstrip()
    headers = random.choice(headers_list)
    # Download the page using requests
    print("Downloading %s" % url)
    r = requests.get(url, headers=headers)
    # Simple check to check if page was blocked (Usually 503)
    if r.status_code > 500:
        if "To discuss automated access to Amazon data please contact" in r.text:
            print(
                "Page %s was blocked by Amazon. Please try using better proxies\n"
                % url)
Code Example #28
def amazonScrape(hashtag, key):
    tmp1 = []
    tmp3 = []

    # Create an Extractor by reading from the YAML file
    e = Extractor.from_yaml_file('scripts/search_results.yml')
    headers = {
        'dnt': '1',
        'upgrade-insecure-requests': '1',
        'user-agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
        'accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '******',
        'sec-fetch-dest': 'document',
        'referer': 'https://www.amazon.in/',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }

    # Download the page using requests
    url = "https://www.amazon.in/s?k=" + hashtag
    print("Downloading %s" % url)
    r = requests.get(url, headers=headers, timeout=28)
    # Simple check to check if page was blocked (Usually 503)
    if r.status_code > 500:
        if "To discuss automated access to Amazon data please contact" in r.text:
            print(
                "Page %s was blocked by Amazon. Please try using better proxies\n"
                % url)
        else:
            print(
                "Page %s must have been blocked by Amazon as the status code was %d"
                % (url, r.status_code))
        print(r.status_code)
        return tmp1, tmp3

    data = e.extract(r.text)
    if data is None:
        print("None!")
        return tmp1, tmp3

    else:
        if data['products'] is not None:
            productfeed = data['products']
            counter = 0
            for product in productfeed:
                if counter > 3: break
                if product['price'] is None: print("Price is NONE")
                else:
                    print(counter, "price NOT NONE")
                    price = int(float(product['price'][1:].replace(',', '')))
                    print(price)
                    if price >= 500:
                        product_url = "https://www.amazon.in" + product['url']
                        counter += 1
                        print(product_url)
                        tmp3.append(product_url)
                        tmp1.append(key)

    return tmp1, tmp3
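The price filter in Example #28 assumes Amazon India price strings coming back from the extractor: the first character (the rupee sign) is dropped, the thousands separator is removed, and the remainder is converted to an integer before the >= 500 check. A quick worked example with a made-up value:

price_text = '₹1,999.00'                               # hypothetical value from product['price']
price = int(float(price_text[1:].replace(',', '')))    # '1,999.00' -> '1999.00' -> 1999
print(price >= 500)                                    # True, so this product URL would be kept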
Code Example #29
File: bet_turtle.py Project: ramiz11/bet_turtle
from random import randint
from pathlib import Path
import time
from turtle import Screen, Turtle, onscreenclick  # needed for the Screen()/Turtle() calls below
from selectorlib import Extractor
import requests
import json
from playsound import playsound
from pygame import mixer

chosen_turtle = ""  # the turtle which the user bets on
bet_amount = 0  # the betting amount
guessed_steps = 0  # guessed number of steps of the winning turtle
finished_race_info = [
]  # info about the race : winning turtle, colour of winning turtle, and the number of steps it took
turtle_step_counter = [0, 0, 0, 0]  # create a list of steps of turtles
e = Extractor.from_yaml_file(
    'selectors.yml')  # Create an Extractor by reading from the YAML file
budget = 0
wins_and_losses = []  # a list of wins and losses of user for each round
turtle_colours = ['green', 'purple', 'blue', 'orange']
rounds_counter = 0  # counts the number of rounds played

#set up the screen width and height
wh = Screen()
wh.setup(width=1.0, height=1.0)
onscreenclick
#create an instance of turtle for writing messages
message = Turtle()
message.hideturtle()
message.speed('fastest')
message.color('black')
message.style = ('Courier', 30, 'italic')
Code Example #30
File: results.py Project: jay380/Sdp
def scrape():
    e1 = Extractor.from_yaml_file('search_result.yml')
    e2 = Extractor.from_yaml_file('flip_results.yml')
    text = request.form.get("search_bar")
    print(text)
    url1 = "https://www.amazon.in/s?k={0}&rh=n%3A1375424031&ref=nb_sb_noss".format(
        text)
    url2 = "https://www.flipkart.com/search?q={0}&sid=6bo%2Cb5g&as=on&as-show=on&otracker=AS_QueryStore_HistoryAutoSuggest_1_7_na_na_na&otracker1=AS_QueryStore_HistoryAutoSuggest_1_7_na_na_na&as-pos=1&as-type=HISTORY&suggestionId=macbook+pro%7CLaptops&requestId=4b1460e8-fcf5-4369-a655-a2501be025a8&as-backfill=on".format(
        text)
    r1 = requests.get(url1, headers=headers1)
    r2 = requests.get(url2, headers=headers2)
    sleep(2)
    data1 = e1.extract(r1.text)
    data2 = e2.extract(r2.text)
    product_title1 = []
    product_price1 = []
    product_img1 = []
    product_url1 = []
    product_title2 = []
    product_price2 = []
    product_img2 = []
    product_url2 = []
    i = 0

    for product1 in data1.values():
        for item1 in product1:
            product_title1.append(item1['title'])
            product_price1.append(item1['price'])
            product_img1.append(item1['image'])
            new_url1 = 'https://www.amazon.in' + item1['url']
            product_url1.append(new_url1)

    asyncio.set_event_loop(asyncio.SelectorEventLoop())
    data3 = asyncio.get_event_loop().run_until_complete(
        add_images_urls(data2, url2))
    # data3 = asyncio.run(add_images_urls(data2, url2))
    # data3 = await add_images_urls(data2, url2)
    # data3 = loop.run_until_complete(add_images_urls(data2, url2))
    # data3 = add_images_urls(data2, url2)
    for product2 in data3.values():
        for item2 in product2:
            product_title2.append(item2['title'])
            product_price2.append(item2['price'])
            product_img2.append(item2['image'])
            product_url2.append(item2['url'])
            # new_url2 = 'https://www.flipkart.com' + item2['url']
            # product_url2.append(new_url2)

    # session = HTMLSession()
    # response = session.get(url2)
    # response.html.render(sleep=1, scrolldown=20)
    # # Container for each product being displayed
    # div = response.html.find('._1UoZlX')
    # for image in div:
    #     img = image.find('img', first=True)
    #     img_src = img.attrs['src']
    #     product_img2.append(img_src)

    return render_template("index2.html",
                           title1=product_title1,
                           price1=product_price1,
                           img1=product_img1,
                           url1=product_url1,
                           title2=product_title2,
                           price2=product_price2,
                           img2=product_img2,
                           url2=product_url2)
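The asyncio block in Example #30 runs the add_images_urls coroutine from a synchronous Flask view by installing a fresh SelectorEventLoop and driving it to completion; the commented-out asyncio.run(...) line is the simpler modern equivalent. A minimal sketch of the same pattern with a stand-in coroutine (fetch_images is an invented placeholder, not the project's function):

import asyncio

async def fetch_images(data, url):
    # stand-in for add_images_urls: pretend to enrich the extracted data with image URLs
    await asyncio.sleep(0)
    return data

data2 = {'products': []}
# roughly what the view above does with asyncio.set_event_loop / get_event_loop
loop = asyncio.SelectorEventLoop()
asyncio.set_event_loop(loop)
data3 = loop.run_until_complete(fetch_images(data2, 'https://www.flipkart.com/search?q=example'))
loop.close()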