import logging

from scrapy.exceptions import NotConfigured
from scraper_api import ScraperAPIClient

log = logging.getLogger(__name__)


class ScraperApiProxyMiddleware(object):

    def __init__(self, settings):
        if not settings.getbool('SCRAPERAPI_ENABLED', True):
            raise NotConfigured
        self.SCRAPERAPI_KEY = settings.get('SCRAPERAPI_KEY', '')
        self.SCRAPERAPI_RENDER = settings.get('SCRAPERAPI_RENDER', False)
        self.SCRAPERAPI_PREMIUM = settings.get('SCRAPERAPI_PREMIUM', False)
        self.SCRAPERAPI_COUNTRY_CODE = settings.get('SCRAPERAPI_COUNTRY_CODE', '')
        try:
            self.SCRAPERAPI_CLIENT = ScraperAPIClient(self.SCRAPERAPI_KEY)
        except Exception:
            # A missing or invalid key means the middleware cannot work.
            raise NotConfigured

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings)

    def process_request(self, request, spider):
        # Rewrite every outgoing request to go through the ScraperAPI proxy,
        # unless it already targets api.scraperapi.com.
        if 'api.scraperapi.com' not in request.url:
            log.info("Process request...")
            new_url = self.SCRAPERAPI_CLIENT.scrapyGet(
                url=request.url,
                render=self.SCRAPERAPI_RENDER,
                country_code=self.SCRAPERAPI_COUNTRY_CODE,
                premium=self.SCRAPERAPI_PREMIUM)
            log.info("New url: {}".format(new_url))
            return request.replace(url=new_url)
        return None
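To activate the middleware, the settings it reads have to be defined and the middleware registered in Scrapy's DOWNLOADER_MIDDLEWARES. A minimal settings.py sketch follows; the setting names come from the middleware above, but the module path 'myproject.middlewares' and the priority 610 are assumptions to adjust to your project layout:

# settings.py -- minimal sketch; module path and priority are assumptions.
SCRAPERAPI_ENABLED = True
SCRAPERAPI_KEY = 'YOUR_SCRAPERAPI_KEY'   # placeholder, not a real key
SCRAPERAPI_RENDER = False
SCRAPERAPI_PREMIUM = False
SCRAPERAPI_COUNTRY_CODE = ''

DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.ScraperApiProxyMiddleware': 610,
}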
def start_requests(self):
    secret = self.properties.properties['Scraper_secret']
    print("Properties:", self.properties.properties)
    scrape_url = "Scraper_Vomar_Scrape_url"
    client = ScraperAPIClient(secret)
    for key in self.properties.properties.keys():
        if str(key).startswith(scrape_url):
            # Property values look like '<url>' or '<url>;<category>'.
            property_values = self.properties.properties[key].split(";")
            url = property_values[0]
            priority = key.split("_")[4]
            try:
                category = property_values[1]
            except IndexError:
                # No explicit category: derive it from the last URL segment.
                category = property_values[0].split("/")[-1].replace(
                    "?sort=relevancy+asc", "")
            print("Category:", category)
            print("Calling yield function for url: ", url)
            yield scrapy.Request(client.scrapyGet(url=url), self.parse,
                                 dont_filter=True, priority=int(priority),
                                 meta={'category': category})
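The parse callback itself is not shown here. Below is a minimal sketch of how the category passed through meta could be consumed; the CSS selectors and item fields are placeholders, not Vomar's real markup:

# Minimal parse sketch, assuming items are yielded as plain dicts;
# 'div.product' and 'h2::text' are placeholder selectors.
def parse(self, response):
    category = response.meta['category']  # set in start_requests above
    for product in response.css('div.product'):
        yield {
            'category': category,
            'name': product.css('h2::text').get(),
        }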
def start_requests(self):
    secret = self.properties.properties['Scraper_secret']
    scrape_url = "Scraper_Deen_scrape_url_1"
    client = ScraperAPIClient(secret)
    # Ask for up to 6000 items so the catalogue comes back in one response.
    url = self.properties.properties[scrape_url] + '?items=6000'
    print("URL is:", url)
    yield scrapy.Request(client.scrapyGet(url=url), self.parse, dont_filter=True)
import argparse
import json

import requests
from scraper_api import ScraperAPIClient


def main():
    parser = argparse.ArgumentParser(description='Parses command line arguments')
    parser.add_argument('--scraper_api_key', type=str, required=True)
    args = parser.parse_args()

    client = ScraperAPIClient(args.scraper_api_key)

    # Verify the proxy works by asking httpbin which IP address it sees.
    result = json.loads(client.get(url='http://httpbin.org/ip').text)
    print('Rotated proxy IP address = ' + result['origin'])

    urls = [
        client.scrapyGet(url='http://quotes.toscrape.com/page/1/'),
        client.scrapyGet(url='http://quotes.toscrape.com/page/2/'),
    ]
    for url in urls:
        r = requests.get(url)
        # add parsing logic here
        print(r.status_code)


if __name__ == '__main__':
    main()
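The script is then run from the command line, passing the key as an argument; the file name scraper_api_demo.py is an assumption:

python scraper_api_demo.py --scraper_api_key YOUR_API_KEY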
def start_requests(self):
    secret = self.properties.properties['Scraper_secret']
    print("Properties:", self.properties.properties)
    scrape_url = "Scraper_Dirk_Scrape_url"
    client = ScraperAPIClient(secret)
    for key in self.properties.properties.keys():
        if str(key).startswith(scrape_url):
            url = self.properties.properties[key]
            property_value = self.properties.properties[key].split("/")
            priority = key.split("_")[4]
            category = property_value[4]
            print("Calling yield function for url: ", url)
            yield scrapy.Request(client.scrapyGet(url=url), self.parse,
                                 dont_filter=True, priority=int(priority),
                                 meta={'category': category})
def start_requests(self):
    secret = self.properties.properties['Scraper_secret']
    print("Properties:", self.properties.properties)
    scrape_url = "Scraper_AH_scrape_url"
    client = ScraperAPIClient(secret)
    start_urls = list()
    priorities = list()
    for key in self.properties.properties.keys():
        if str(key).startswith(scrape_url):
            start_urls.append(self.properties.properties[key])
            priorities.append(key.split("_")[4])
    print("Key start urls are:", start_urls, "Priorities:", priorities)
    for priority, url in zip(priorities, start_urls):
        # Append a fixed page parameter (page 100) to each start URL.
        paged_url = url + '?page=100'
        print("Calling yield function for url: ", paged_url)
        yield scrapy.Request(client.scrapyGet(url=paged_url), self.parse,
                             dont_filter=True, priority=int(priority))
def start_requests(self):
    secret = self.properties.properties['Scraper_secret']
    print("Properties:", self.properties.properties)
    scrape_url = "Scraper_Coop_scrape_url"
    client = ScraperAPIClient(secret)
    start_urls = list()
    priorities = list()
    categories = list()
    for key in self.properties.properties.keys():
        if str(key).startswith(scrape_url):
            property_value = self.properties.properties[key].split(";")
            start_urls.append(property_value[0])
            priorities.append(key.split("_")[4])
            categories.append(property_value[1])
    # print("Key start urls are:", start_urls, "Priorities:", priorities)
    for priority, url, category in zip(priorities, start_urls, categories):
        print("Calling yield function for url: ", url)
        yield scrapy.Request(client.scrapyGet(url=url), self.parse,
                             dont_filter=True, priority=int(priority),
                             meta={'category': category})
import scrapy
from bs4 import BeautifulSoup
from scraper_api import ScraperAPIClient

client = ScraperAPIClient('********************')

REVIEWS_URL = ('https://www.google.com/maps/contrib/113811103574182320069/reviews/'
               '@46.3276522,3.1595249,6z/data=!3m1!4b1!4m3!8m2!3m1!1e1')

# Fetch the JavaScript-rendered reviews page once through the proxy.
result = client.get(url=REVIEWS_URL, render=True).text
print("Starting to fetch information from the web ...")

start_urls = [client.scrapyGet(url=REVIEWS_URL, render=True)]

# Meant to be a Scrapy spider callback: re-queue the rendered page.
def parse(self, response):
    yield scrapy.Request(client.scrapyGet(url=REVIEWS_URL, render=True), self.parse)

soup = BeautifulSoup(result, 'html.parser')
Adresse_text = []
Adresse_source = soup.find('div', attrs={'class': 'section-review-subtitle section-review-subtitle-nowrap'})
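The excerpt stops right after locating the first address node. A plausible continuation, assuming Google Maps still serves the same class names, would collect the text of every matching node:

# Hedged continuation sketch: gather the text of each review subtitle
# (the class names are taken from the excerpt above and may change).
for node in soup.find_all('div', attrs={'class': 'section-review-subtitle section-review-subtitle-nowrap'}):
    Adresse_text.append(node.get_text(strip=True))
print("Collected {} entries".format(len(Adresse_text)))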