Ejemplo n.º 1
0
    def __init__(self, *args, **kwargs):
        """Create one PhantomJS browser per proxy profile and preload today's
        already-collected result rows from the CSV cache, if present.

        Populates:
            self._browsers: list of dicts with 'webdriver', 'useragent', 'proxy'.
            self._today_result_ids: identifier -> CSV row for rows scraped today.
        """
        super(GoogleSpider, self).__init__(*args, **kwargs)
        dispatcher.connect(self.spider_closed, signals.spider_closed)

        self._browsers = []

        # NOTE(review): proxy credentials are hard-coded; consider moving them
        # to settings/config instead of the source file.
        browser_profiles = ({
            'proxy': '77.75.105.70:22955',
            'proxy-type': 'http',
            'proxy-auth': 'pp-dampssno:poekensi'
        }, {
            'proxy': '80.83.124.85:48008',
            'proxy-type': 'http',
            'proxy-auth': 'pp-nobfizze:hathapic'
        }, {
            'proxy': '194.242.113.229:30230',
            'proxy-type': 'http',
            'proxy-auth': 'pp-dawnyrou:dupradin'
        }, {
            'proxy': '118.127.29.47:10858',
            'proxy-type': 'http',
            'proxy-auth': 'pp-eyakarpe:rmsaingr'
        })

        for profile in browser_profiles:
            if profile['proxy']:
                proxy = {
                    'host': profile['proxy'],
                    'type': profile['proxy-type'],
                }
                if profile['proxy-auth']:
                    proxy['auth'] = profile['proxy-auth']
            else:
                proxy = None
            browser = PhantomJS.create_browser(proxy=proxy)
            user_agent = browser.desired_capabilities[
                u'phantomjs.page.settings.userAgent']
            # BUG FIX: the original called PhantomJS.create_browser() a second
            # time here, leaking the first browser (never stored, never quit)
            # and storing a user agent read from a different instance.  Reuse
            # the browser created above, as the sibling spider does.
            self._browsers.append({
                'webdriver': browser,
                'useragent': user_agent,
                'proxy': profile['proxy'],
            })

        self._today_result_ids = {}
        file_last_results = os.path.join(HERE, self.F_LAST_RESULTS)
        if os.path.exists(file_last_results):
            today = time.gmtime().tm_yday
            last_day = time.gmtime(os.path.getctime(file_last_results)).tm_yday
            # Only reuse the cache if it was created today (same day-of-year);
            # back it up before reading so a crash cannot destroy it.
            if last_day == today:
                shutil.copy(file_last_results, '%s.bak' % file_last_results)
                with open(file_last_results) as f_today:
                    reader = csv.DictReader(f_today)
                    for row in reader:
                        self._today_result_ids[row['identifier']] = row
Ejemplo n.º 2
0
    def __init__(self, *args, **kwargs):
        """Spin up one PhantomJS browser per proxy (cycling user agents) and
        load today's previously-scraped result rows from the CSV cache.

        Populates:
            self._browsers: list of dicts with 'webdriver', 'useragent', 'proxy'.
            self._today_result_ids: identifier -> CSV row for rows scraped today.
        """
        super(GoogleSpider, self).__init__(*args, **kwargs)
        dispatcher.connect(self.spider_closed, signals.spider_closed)

        self._browsers = []

        proxies = [
            '23.19.154.246:3128', '23.19.154.247:3128', '23.19.154.248:3128',
            '23.19.154.249:3128', '23.19.154.250:3128', '23.19.188.246:3128',
            '23.19.188.247:3128', '23.19.188.248:3128', '23.19.188.249:3128',
            '23.19.188.250:3128'
        ]

        # Endless rotation of desktop user-agent strings, one per browser.
        user_agents = cycle([
            'Mozilla/5.0 (Windows NT 5.1; rv:24.0) Gecko/20100101 Firefox/24.0',
            'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0',
            'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:24.0) Gecko/20100101 Firefox/24.0',
            'Mozilla/5.0 (X11; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0'
        ])

        # All proxies share the same shape: plain HTTP, no authentication.
        browser_profiles = [{
            'proxy': ip_port,
            'proxy-type': 'http',
            'proxy-auth': None
        } for ip_port in proxies]

        for profile in browser_profiles:
            proxy = None
            if profile['proxy']:
                proxy = {
                    'host': profile['proxy'],
                    'type': profile['proxy-type'],
                }
                if profile['proxy-auth']:
                    proxy['auth'] = profile['proxy-auth']
            browser = PhantomJS.create_browser(proxy=proxy,
                                               user_agent=user_agents.next())
            # Record the user agent the browser actually ended up with.
            effective_ua = browser.desired_capabilities[
                u'phantomjs.page.settings.userAgent']
            self._browsers.append({
                'webdriver': browser,
                'useragent': effective_ua,
                'proxy': profile['proxy']
            })

        self._today_result_ids = {}
        file_last_results = os.path.join(HERE, self.F_LAST_RESULTS)
        if os.path.exists(file_last_results):
            today = time.gmtime().tm_yday
            last_day = time.gmtime(os.path.getctime(file_last_results)).tm_yday
            # Reuse the cache only when it was written today; back it up first.
            if last_day == today:
                shutil.copy(file_last_results, '%s.bak' % file_last_results)
                with open(file_last_results) as f_today:
                    for row in csv.DictReader(f_today):
                        self._today_result_ids[row['identifier']] = row
Ejemplo n.º 3
0
    def __init__(self, *args, **kwargs):
        """Hook the spider-closed signal and start a single headless
        PhantomJS browser with generous page-load and script timeouts."""
        super(RubbermaidSpider, self).__init__(*args, **kwargs)
        dispatcher.connect(self.spider_closed, signals.spider_closed)

        self._browser = PhantomJS.create_browser()

        # Give slow pages up to a minute for both navigation and scripts.
        timeout_seconds = 60
        self._browser.set_page_load_timeout(timeout_seconds)
        self._browser.set_script_timeout(timeout_seconds)
Ejemplo n.º 4
0
    def parse_product(self, response):
        """Extract one product item from a product page.

        Falls back to a headless PhantomJS render when the identifier is
        missing or 'n/a' in the static HTML (the field is JS-populated on
        some pages).

        Yields:
            A loaded Product item.
        """
        hxs = HtmlXPathSelector(response)
        categories = hxs.select('//div[@id="breadcrumb_container"]//a/text()').extract()
        # Deepest breadcrumb entry is taken as the product category.
        category = categories[-1] if categories else None

        name = hxs.select('//div[@id="overview_tab_content"]/h2/text()').extract()
        if name:
            name = name[0].strip()
        if not name:
            # Fall back to the <title>, dropping the trailing "- site" part.
            name = hxs.select('//title/text()').extract()[0].split('-')
            name = name[0:-1] if len(name) > 1 else name
            name = '-'.join(name).strip()

        sku = hxs.select('//span[@id="product_reference"]/text()').extract()

        loader = ProductLoader(item=Product(), selector=hxs)
        loader.add_xpath('identifier', '//span[@id="product_reference"]/text()')
        loader.add_value('url', response.url)
        loader.add_value('name', name.strip())
        if sku:
            loader.add_value('sku', sku[0].replace(' ', ''))
        price = hxs.select('//div[@id="product_price"]//span[@id="product_price_sale"]'
                           '//span[@class="price"]//span[@class="ex"]//span[@class="GBP"]/text()').extract()
        # Keep only digits and the decimal point.
        price = re.sub(r'[^\d\.]', u'', price[0].strip())
        loader.add_value('price', price)
        if category:
            loader.add_value('category', category)

        img = hxs.select('//img[@id="product_medium_image"]/@src').extract()
        if img:
            loader.add_value('image_url', urljoin_rfc(get_base_url(response), img[0]))
        brand = hxs.select('//div[@id="product_page_brand"]/a/@title').extract()
        brand = brand[0] if brand else ''
        loader.add_value('brand', brand)

        item = loader.load_item()

        if not item['identifier'] or item['identifier'].strip() == 'n/a':
            # Identifier is filled in by JavaScript on some pages: render with
            # PhantomJS and re-extract from the rendered DOM.
            browser = PhantomJS.create_browser()
            try:
                # BUG FIX: the original logged the literal string
                # 'response.url' instead of interpolating the URL.
                self.log('>>> BROWSER: GET => %s' % response.url)
                browser.get(response.url)
                self.log('BROWSER: OK!')
                # BUG FIX: page_source must be passed as text=, not as the
                # positional response argument (see the sibling spider).
                hxs = HtmlXPathSelector(text=browser.page_source)
            finally:
                # Always release the browser, even if the GET fails.
                browser.quit()

            identifier = hxs.select('//span[@id="product_reference"]/text()').extract()
            if identifier:
                item['identifier'] = identifier[0].strip()

        yield item
Ejemplo n.º 5
0
    def __init__(self, *args, **kwargs):
        """Hook the spider-closed signal and start the shared headless
        PhantomJS browser used for JS-rendered pages."""
        super(UkFlooringDirectSpider, self).__init__(*args, **kwargs)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        self._browser = PhantomJS.create_browser()
Ejemplo n.º 6
0
    def parse_product(self, response):
        """Extract a product from the embedded `walPP.variantDataRawArr`
        JSON blob, falling back to a PhantomJS render for the price, then
        request the Bazaarvoice reviews for it.

        Yields:
            Either the Product directly (no SKU available) or a Request for
            the product's Bazaarvoice reviews carrying the product in meta.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        try:
            product_data = json.loads(
                hxs.select(
                    '//script[contains(text(), "walPP.variantDataRawArr")]/text()'
                ).re(r'walPP.variantDataRawArr = (\[.*\])')[0])[0]
        # BUG FIX: bare `except:` also swallowed SystemExit/KeyboardInterrupt.
        # Only the regex miss (IndexError) or malformed JSON (ValueError) are
        # expected here.
        except (IndexError, ValueError):
            self.errors.append('WARNING: No product data in %s' % response.url)
            return

        price = product_data.get(u'price_store_price', None)
        if not price:
            # Price absent from the JSON blob: render the page with PhantomJS
            # and scrape it from the DOM instead.
            browser = PhantomJS.create_browser()
            self.log('>>> BROWSER: GET => %s' % response.url)
            browser.get(response.url)
            self.log('>>> BROWSER: OK')
            time.sleep(5)  # let the client-side pricing scripts finish

            hxs = HtmlXPathSelector(text=browser.page_source)

            browser.quit()

            # Monitor all products even without a price (as requested in #248)
            price = '.'.join(
                hxs.select(
                    '//div[@id="pricing"]/div[@class="price-main"]//text()').
                re(r'(\d+)')).strip()
            if not price:
                price_elem = hxs.select(
                    '//span[@id="store-price"][1]/text()').extract()
                if price_elem:
                    price = price_elem[0]
            if not price:
                store_prices = hxs.select(
                    '//div[contains(@id, "store-")]//div[@class="price"]//text()'
                ).extract()
                try:
                    price = '.'.join(
                        re.findall(r'(\d+)', '.'.join(store_prices[:3])))
                # Best-effort: any failure means "no price found".
                except Exception:
                    price = '0.00'
        else:
            price = price[0]

        product_loader = ProductLoader(item=Product(), response=response)
        product_loader.add_value('category', product_data[u'Category'])
        product_loader.add_value('name', product_data[u'prod_name_en'])
        product_loader.add_value('sku', product_data[u'P_RollupKey'])
        product_loader.add_value('price', price.replace(',', ''))
        product_loader.add_value('identifier', product_data[u'P_UniqueKey'])

        product_loader.add_value('url', response.url)
        product_loader.add_value('brand',
                                 response.meta['brand'].strip().lower())
        product = product_loader.load_item()

        metadata = KeterMeta()
        metadata['brand'] = response.meta['brand']
        metadata['reviews'] = []
        product['metadata'] = metadata
        response.meta['product'] = product

        # the same as canadiantire.ca
        # http://www.canadiantire.ca/AST/browse/2/OutdoorLiving/3/OutdoorStorage/Sheds/PRD~0600292P/Keter+Rattan+Vertical+Shed.jsp?locale=en
        # http://canadiantire.ugc.bazaarvoice.com/9045/0600292P/reviews.djs?format=embeddedhtml
        # <script language="JavaScript" src="http://canadiantire.ugc.bazaarvoice.com/static/9045/bvapi.js" type="text/javascript"></script>
        try:
            part2 = product['sku']
        # BUG FIX: narrowed from bare `except:` — a missing field on the
        # loaded item surfaces as KeyError.
        except KeyError:
            self.errors.append('WARNING: No sku in %s' % response.url)
            yield product
        else:
            if not part2:
                self.errors.append('WARNING: No sku in %s' % response.url)
                yield product
            else:
                reviews_url = 'http://api.bazaarvoice.com/data/batch.json?passkey=e6wzzmz844l2kk3v6v7igfl6i&apiversion=5.4&displaycode=2036-en_ca&resource.q2=reviews&filter.q2=isratingsonly%3Aeq%3Afalse&filter.q2=productid%3Aeq%3A' + part2
                yield Request(reviews_url,
                              meta=response.meta,
                              callback=self.parse_reviews)