Code Example #1
import logging

from scrapy.exceptions import NotConfigured
from scraper_api import ScraperAPIClient

log = logging.getLogger(__name__)


class ScraperApiProxyMiddleware(object):
    def __init__(self, settings):
        if not settings.getbool('SCRAPERAPI_ENABLED', True):
            raise NotConfigured

        self.SCRAPERAPI_KEY = settings.get('SCRAPERAPI_KEY', '')
        self.SCRAPERAPI_RENDER = settings.getbool('SCRAPERAPI_RENDER', False)
        self.SCRAPERAPI_PREMIUM = settings.getbool('SCRAPERAPI_PREMIUM', False)
        self.SCRAPERAPI_COUNTRY_CODE = settings.get('SCRAPERAPI_COUNTRY_CODE',
                                                    '')

        try:
            self.SCRAPERAPI_CLIENT = ScraperAPIClient(self.SCRAPERAPI_KEY)
        except Exception:
            raise NotConfigured

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings)

    def process_request(self, request, spider):
        # Rewrite every outgoing request to go through the ScraperAPI
        # proxy endpoint, unless it already does.
        if 'api.scraperapi.com' not in request.url:
            log.info("Process request...")
            new_url = self.SCRAPERAPI_CLIENT.scrapyGet(
                url=request.url,
                render=self.SCRAPERAPI_RENDER,
                country_code=self.SCRAPERAPI_COUNTRY_CODE,
                premium=self.SCRAPERAPI_PREMIUM)

            log.info("New url: {}".format(new_url))
            return request.replace(url=new_url)
        return None
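
To activate the middleware from Example #1, it must be registered in the project's Scrapy settings. A minimal sketch, assuming the class lives in a hypothetical myproject/middlewares.py (adjust the module path and priority to your project):

# settings.py -- the module path 'myproject.middlewares' is an assumption;
# point it at wherever ScraperApiProxyMiddleware is actually defined.
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.ScraperApiProxyMiddleware': 610,
}

SCRAPERAPI_ENABLED = True
SCRAPERAPI_KEY = 'YOUR_API_KEY'   # placeholder
SCRAPERAPI_RENDER = False         # set True for JavaScript-heavy pages
SCRAPERAPI_PREMIUM = False
SCRAPERAPI_COUNTRY_CODE = ''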
Code Example #2
    def start_requests(self):
        secret = self.properties.properties['Scraper_secret']
        print("Properties:", self.properties.properties)
        scrape_url = "Scraper_Vomar_Scrape_url"
        client = ScraperAPIClient(secret)

        for i in self.properties.properties:
            if str(i).startswith(scrape_url):
                # Values look like "<url>;<category>"; the request priority
                # is encoded in the fifth segment of the property key.
                property_values = self.properties.properties[i].split(";")
                url = property_values[0]  # the URL part, without the category suffix
                priority = i.split("_")[4]
                try:
                    category = property_values[1]
                except IndexError:
                    # No explicit category: fall back to the last URL segment.
                    category = property_values[0].split("/")[-1].replace(
                        "?sort=relevancy+asc", "")

                print("Category:", category)
                print("Calling yield function for url: ", url)
                yield scrapy.Request(client.scrapyGet(url=url),
                                     self.parse,
                                     dont_filter=True,
                                     priority=int(priority),
                                     meta={'category': category})
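
Examples #2, #5, #6 and #7 all read their targets from a self.properties.properties dictionary whose source is not shown. A hypothetical layout, reverse-engineered from the parsing above (keys encode the priority in their fifth underscore-separated segment; values hold an optional semicolon-separated category):

# Hypothetical properties dict; the real keys and values come from a
# config file that is not part of these snippets.
properties = {
    'Scraper_secret': 'YOUR_API_KEY',  # placeholder ScraperAPI key
    # "<url>;<category>" -- category given explicitly
    'Scraper_Vomar_Scrape_url_1': 'https://example.com/producten/zuivel;zuivel',
    # no ";<category>" part: the category falls back to the last URL segment
    'Scraper_Vomar_Scrape_url_2': 'https://example.com/producten/brood?sort=relevancy+asc',
}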
Code Example #3
    def start_requests(self):
        secret = self.properties.properties['Scraper_secret']
        scrape_url = "Scraper_Deen_scrape_url_1"
        client = ScraperAPIClient(secret)
        # Ask for up to 6000 items so one response covers the whole listing.
        url = self.properties.properties[scrape_url] + '?items=6000'
        print("URL is:", url)
        yield scrapy.Request(client.scrapyGet(url=url),
                             self.parse,
                             dont_filter=True)
Code Example #4
import argparse
import json

import requests
from scraper_api import ScraperAPIClient


def main():
    parser = argparse.ArgumentParser(description='Parses command line arguments')
    parser.add_argument('--scraper_api_key', type=str, required=True)
    args = parser.parse_args()

    client = ScraperAPIClient(args.scraper_api_key)
    # Verify the proxy works: httpbin echoes back the IP address it sees.
    result = json.loads(client.get(url='http://httpbin.org/ip').text)
    print('Rotated proxy IP address = ' + result['origin'])

    # scrapyGet only builds the proxied URL; fetch it with any HTTP client.
    urls = [
        client.scrapyGet(url='http://quotes.toscrape.com/page/1/'),
        client.scrapyGet(url='http://quotes.toscrape.com/page/2/'),
    ]

    for url in urls:
        r = requests.get(url)
        # add parsing logic here
        print(r.status_code)


if __name__ == '__main__':
    main()
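
scrapyGet performs no request of its own; it returns the ScraperAPI endpoint URL with the API key and target URL encoded as query parameters, which is why Example #4 can feed its result straight into requests.get. A sketch of the equivalent URL assembled by hand, assuming the api.scraperapi.com endpoint that Example #1's middleware also checks for:

from urllib.parse import urlencode

# Hand-built equivalent of client.scrapyGet(url=...); parameters beyond
# api_key and url are omitted here.
params = {
    'api_key': 'YOUR_API_KEY',  # placeholder
    'url': 'http://quotes.toscrape.com/page/1/',
}
proxied_url = 'http://api.scraperapi.com/?' + urlencode(params)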
Code Example #5
    def start_requests(self):
        secret = self.properties.properties['Scraper_secret']
        print("Properties:", self.properties.properties)
        scrape_url = "Scraper_Dirk_Scrape_url"
        client = ScraperAPIClient(secret)

        for i in self.properties.properties:
            if str(i).startswith(scrape_url):
                url = self.properties.properties[i]
                # The category is the second path segment of the URL; the
                # priority is the fifth segment of the property key.
                property_value = self.properties.properties[i].split("/")
                priority = i.split("_")[4]
                category = property_value[4]
                print("Calling yield function for url: ", url)
                yield scrapy.Request(client.scrapyGet(url=url),
                                     self.parse,
                                     dont_filter=True,
                                     priority=int(priority),
                                     meta={'category': category})
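
The property_value[4] lookup in Example #5 relies on how split("/") treats an absolute URL: the scheme and an empty string precede the host, so index 4 is the second path segment. A quick illustration with a made-up URL:

# Indexes: 0='https:', 1='', 2=host, 3=first path segment, 4=second.
parts = 'https://example.com/boodschappen/zuivel'.split('/')
assert parts[4] == 'zuivel'  # the category used in meta={'category': ...}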
Code Example #6
    def start_requests(self):
        secret = self.properties.properties['Scraper_secret']
        print("Properties:", self.properties.properties)
        scrape_url = "Scraper_AH_scrape_url"
        client = ScraperAPIClient(secret)

        start_urls = list()
        priorities = list()
        for i in self.properties.properties:
            if str(i).startswith(scrape_url):
                start_urls.append(self.properties.properties[i])
                priorities.append(i.split("_")[4])
        print("Key start urls are:", start_urls, "Priorities:", priorities)

        for priority, url in zip(priorities, start_urls):
            # Build the paginated URL once and reuse it.
            page_url = url + '?page=100'
            print("Calling yield function for url: ", page_url)
            yield scrapy.Request(
                client.scrapyGet(url=page_url),
                self.parse, dont_filter=True, priority=int(priority))
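
Example #6 issues one request per start URL with ?page=100 appended. If the target site pages its results instead of returning one cumulative listing, the final loop could walk an explicit page range; a hypothetical variant (the upper bound of 100 is an assumption):

        # Hypothetical variant of the final loop: one request per page,
        # assuming the site serves pages individually up to page 100.
        for priority, url in zip(priorities, start_urls):
            for page in range(1, 101):
                page_url = url + '?page=' + str(page)
                yield scrapy.Request(client.scrapyGet(url=page_url),
                                     self.parse, dont_filter=True,
                                     priority=int(priority))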
Code Example #7
    def start_requests(self):
        secret = self.properties.properties['Scraper_secret']
        print("Properties:", self.properties.properties)
        scrape_url = "Scraper_Coop_scrape_url"
        client = ScraperAPIClient(secret)

        start_urls = list()
        priorities = list()
        categories = list()
        # Values look like "<url>;<category>"; collect the pieces first so
        # the three lists stay index-aligned.
        for i in self.properties.properties:
            if str(i).startswith(scrape_url):
                property_value = self.properties.properties[i].split(";")
                start_urls.append(property_value[0])
                priorities.append(i.split("_")[4])
                categories.append(property_value[1])

        for priority, url, category in zip(priorities, start_urls, categories):
            print("Calling yield function for url: ", url)
            yield scrapy.Request(client.scrapyGet(url=url),
                                 self.parse,
                                 dont_filter=True,
                                 priority=int(priority),
                                 meta={'category': category})
Code Example #8
from bs4 import BeautifulSoup
from scraper_api import ScraperAPIClient

# API key redacted in the original source.
client = ScraperAPIClient('********************')

MAPS_URL = ('https://www.google.com/maps/contrib/113811103574182320069/reviews/'
            '@46.3276522,3.1595249,6z/data=!3m1!4b1!4m3!8m2!3m1!1e1')

# render=True asks ScraperAPI to execute JavaScript before returning the
# HTML, which Google Maps requires.
result = client.get(url=MAPS_URL, render=True).text
print("starting getting information from the web ...")

# Inside a Scrapy spider the same page would be requested via scrapyGet:
#     start_urls = [client.scrapyGet(url=MAPS_URL, render=True)]
#     def parse(self, response):
#         yield scrapy.Request(client.scrapyGet(url=MAPS_URL, render=True),
#                              self.parse)

soup = BeautifulSoup(result, 'html.parser')

Adresse_text = []

Adresse_source = soup.find('div', attrs={'class': 'section-review-subtitle section-review-subtitle-nowrap'})
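
soup.find returns only the first matching element. To fill Adresse_text with every review subtitle, iterating with find_all is the usual next step; a sketch reusing the class name from the snippet above (whether that class still matches Google Maps' current markup cannot be verified here):

# Collect the text of all matching subtitle divs; the class name is copied
# from the snippet above and may no longer match Google Maps' live markup.
for node in soup.find_all('div', attrs={'class': 'section-review-subtitle section-review-subtitle-nowrap'}):
    Adresse_text.append(node.get_text(strip=True))
print(Adresse_text)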