Ejemplo n.º 1
0
    def __init__(self, captcha_retries='10', *args, **kwargs):
        """Initialise the spider and attach its helper objects.

        ``captcha_retries`` (default: the string ``'10'``) is converted with
        ``int()`` and stored on the instance. All remaining positional and
        keyword arguments are forwarded untouched to the parent initialiser.
        """
        super(AmazonProductsSpider, self).__init__(*args, **kwargs)

        retry_setting = int(captcha_retries)
        self.captcha_retries = retry_setting

        # The marketplace helper receives this spider instance.
        marketplace_helper = Amazon_marketplace(self)
        self.mtp_class = marketplace_helper

        # Captcha breaker wrapper instance owned by this spider.
        self._cbw = CaptchaBreakerWrapper()
Ejemplo n.º 2
0
    def __init__(self, outfile=None, test_category=None):
        """Store crawl options and reset the crawl bookkeeping state.

        outfile -- output file name; when it is not given but
            ``test_category`` is, a dedicated test file name is used.
        test_category -- if set, only this category (a level 2/1 category
            name) is crawled; intended for testing.
        """
        # A test run without an explicit output file writes to a special
        # "test" file name.
        if test_category and not outfile:
            outfile = "amazon_categories_test.jl"
        self.outfile = outfile
        self.test_category = test_category

        # Level considered to contain departments.
        self.DEPARTMENT_LEVEL = 2

        # Level at which crawling stops (subcategories below it are not
        # extracted).
        self.LEVEL_BARRIER = -2

        # Maximum number of retries when presented with a captcha form.
        self.MAX_CAPTCHA_RETRY = 10

        # Manually collected URLs for top-level (level 1 and 2) categories.
        # They replace/supplement the sitemap links for the same categories
        # because these pages expose more product-count information.
        # Keys are category names as found in the sitemap; values are the
        # URLs that replace/supplement the sitemap links.
        self.EXTRA_TOPLEVEL_CATEGORIES_URLS = {
            "Baby": "http://www.amazon.com/s/ref=lp_166835011_ex_n_1?rh=n%3A165796011&bbn=165796011&ie=UTF8&qid=1393338541",
            "Electronics & Computers": "http://www.amazon.com/s/ref=lp_172659_ex_n_1?rh=n%3A172282&bbn=172282&ie=UTF8&qid=1393338741",
            "Home, Garden & Tools": "http://www.amazon.com/s/ref=lp_284507_ex_n_1?rh=n%3A1055398&bbn=1055398&ie=UTF8&qid=1393338782",
            "Kindle E-readers & Books": "http://www.amazon.com/s/ref=lp_154606011_ex_n_1?rh=n%3A133140011&bbn=133140011&ie=UTF8&qid=1395704970",
            "Apps & Games": "http://www.amazon.com/b/ref=sd_allcat_fire_apps_games?ie=UTF8&node=3427287011",
            "Movies & TV": "http://www.amazon.com/action-adventure-dvd-bluray/b/ref=MoviesHPBB_Genres_Action?ie=UTF8&node=2650363011&pf_rd_m=ATVPDKIKX0DER&pf_rd_s=merchandised-search-left-2&pf_rd_r=0GAWFEZ3EXP8PEYCM6X3&pf_rd_t=101&pf_rd_p=1753817742&pf_rd_i=2625373011",
            "All Beauty": "http://www.amazon.com/s/ref=lp_11059031_ex_n_1?rh=n%3A3760911&bbn=3760911&ie=UTF8&qid=1395793680",
            "Health, Household & Baby Care": "http://www.amazon.com/s/ref=lp_6183682011_ex_n_1?rh=n%3A3760901&bbn=3760901&ie=UTF8&qid=1395822180",
            "Tires & Wheels": "http://www.amazon.com/s/ref=lp_353609011_ex_n_1?rh=n%3A15684181%2Cn%3A%2115690151%2Cn%3A15706571&bbn=15706571&ie=UTF8&qid=1395824546",
            "Motorcycle & Powersports": "http://www.amazon.com/s/ref=sr_ex_n_1?rh=n%3A15684181%2Cn%3A%2115690151%2Cn%3A346333011&bbn=346333011&ie=UTF8&qid=1395824599",
            # Partial: "Automotive and industrial" also contains the
            # "Industrial & Scientific" categories, which can be found in
            # the sitemap.
            "Automotive & Industrial": "http://www.amazon.com/s/ref=sr_ex_n_1?rh=n%3A15684181&bbn=15684181",
        }

        # Whether overall product counts are computed in the pipelines phase
        # for this spider. When on, the 'catid' and 'parent_catid' fields
        # need to be implemented.
        self.compute_nrproducts = True

        # Auto-increment counters: one for department ids, one for
        # category ids.
        self.department_count = 0
        self.catid = 0

        # Lookups keyed by department name, used to set parent category
        # info for level 1 categories.
        # department name -> department id
        self.departments_ids = {}
        # department name -> url (only available for extra categories)
        self.department_urls = {}
        # department name -> category id
        self.departments_cat_ids = {}

        # Captcha breaker.
        self.CB = CaptchaBreakerWrapper()
Ejemplo n.º 3
0
    def __init__(self, *args, **kwargs):
        """Prepare shelf-page scraping state and run the parent initialiser.

        Reads ``product_url`` from ``kwargs`` (a missing key raises
        ``KeyError``) and ``num_pages`` (optional, default 1, converted with
        ``int()``). All arguments are also forwarded to the parent class.
        """
        # Variant data is kept on: for an unknown reason Amazon fails to
        # scrape most data when variants are turned off.
        self.ignore_variant_data = False
        self.product_url = kwargs['product_url']

        # See https://bugzilla.contentanalyticsinc.com/show_bug.cgi?id=3313#c0
        requested_pages = kwargs.get('num_pages', 1)
        self.num_pages = int(requested_pages)

        # NOTE: handling of a 'scrape_variants_with_extra_requests' kwarg
        # used to live here and is disabled; variants are switched off by
        # default, see Bugzilla 3982#c11.

        # Default price currency (code and display symbol).
        self.price_currency, self.price_currency_view = 'USD', '$'

        # Locale
        self.locale = 'en-US'

        marketplace_helper = Amazon_marketplace(self)
        self.mtp_class = marketplace_helper
        self._cbw = CaptchaBreakerWrapper()

        # NOTE: a 'total_items_scraped' backup counter (for when total
        # matches cannot be scraped) and a 'ranking_override' value are
        # currently disabled.
        self.total_matches_re = r'of\s([\d\,]+)\s'

        # The attributes above are assigned before the parent initialiser
        # runs; keep that order.
        super(AmazonShelfPagesSpider, self).__init__(*args, **kwargs)
        self._setup_class_compatibility()
        # NOTE: 'self.remaining = self.quantity' is disabled.

        # State for goldbox deals; each attribute gets its own fresh list.
        self.deal_response_json_list = list()
        self.deal_product_url_list = list()
        self.sorted_goldbox_deals_ids = list()
Ejemplo n.º 4
0
    def __init__(self,
                 limit='100',
                 service_url=None,
                 captcha_retries='10',
                 *args,
                 **kwargs):
        """Initialise the spider and queue the URL-service request.

        limit -- number of queued URLs to request; stored as given on
            ``self.limit`` and converted with ``int()`` for the query string.
        service_url -- base URL of the URL service; required.
        captcha_retries -- converted with ``int()`` and stored on the
            instance.

        Remaining positional and keyword arguments go to the parent
        initialiser.

        Raises AssertionError when ``service_url`` is None, and ValueError
        when ``limit`` or ``captcha_retries`` cannot be converted to int.
        """
        super(UrlServiceSpider, self).__init__(*args, **kwargs)

        if service_url is None:
            raise AssertionError("Service URL is not optional.")

        self.limit = limit
        self.captcha_retries = int(captcha_retries)
        self.service_url = service_url

        self._cbw = CaptchaBreakerWrapper()

        # Fill in the query parameters BEFORE joining. Formatting the joined
        # URL would also interpret any '%' contained in service_url itself
        # (e.g. a percent-encoded path) as a format directive.
        queue_path = 'get_queued_urls/?limit=%d&block=%d' % (int(limit), 0)
        queue_url = urlparse.urljoin(self.service_url, queue_path)
        self.log("Fetching URLs with '%s'." % queue_url, level=DEBUG)
        self.start_urls.append(queue_url)
Ejemplo n.º 5
0
    def __init__(self,
                 url_formatter=None,
                 client_url=None,
                 file_name=None,
                 product_asins=None,
                 captcha_retries='10',
                 *args, **kwargs):
        """Initialise the spider from a client URL and a JSON list of ASINs.

        url_formatter -- object stored on ``self.url_formatter``; a plain
            ``string.Formatter()`` is used when omitted.
        client_url -- stored on ``self.SEARCH_URL`` before the parent
            initialiser runs.
        file_name -- stored on ``self.file_name`` only when truthy.
        product_asins -- JSON text of an object with an ``'asins'`` key;
            ``None`` (the default) is treated as "no ASINs".
        captcha_retries -- converted with ``int()`` and stored on the
            instance.

        Remaining positional and keyword arguments go to the parent
        initialiser.
        """
        self.SEARCH_URL = client_url
        super(AmazonSpider, self).__init__(*args, **kwargs)

        if file_name:
            self.file_name = file_name

        if url_formatter is None:
            self.url_formatter = string.Formatter()
        else:
            self.url_formatter = url_formatter

        if product_asins is None:
            # The signature declares this argument optional, but
            # json.loads(None) raises TypeError; fall back to an empty list.
            self.product_asins = []
        else:
            self.product_asins = json.loads(product_asins)['asins']

        self.captcha_retries = int(captcha_retries)
        self._cbw = CaptchaBreakerWrapper()
Ejemplo n.º 6
0
    print str(e)

try:
    from captcha_solver import CaptchaBreakerWrapper
except Exception as e:
    # Any failure while importing the solver is reported on stdout and a
    # stand-in class with the same name is defined instead.
    print('!!!!!!!!Captcha breaker is not available due to: %s' % e)

    class CaptchaBreakerWrapper(object):
        """Stand-in used when ``captcha_solver`` could not be imported."""

        @staticmethod
        def solve_captcha(url):
            """Log a warning that names ``url`` and return ``None``."""
            warning_text = "CaptchaBreaker in not available for url: %s" % url
            msg(warning_text, level=WARNING)
            return None


# Module-level wrapper instance: the imported implementation when available,
# otherwise the stand-in defined above.
_cbw = CaptchaBreakerWrapper()


def _has_captcha(response):
    return '.images-amazon.com/captcha/' in response.content


def _solve_captcha(response):
    """Find the captcha image in ``response`` and pass its URL to the solver.

    The response body is parsed with BeautifulSoup and searched for elements
    carrying ``itemprop="image"``; exactly one match is required. The ``src``
    attribute of that element is handed to ``_cbw.solve_captcha`` and the
    solver's result is returned unchanged.

    Raises AssertionError when the page contains zero or several matching
    elements.
    """
    soup = BeautifulSoup(response.content, "html.parser")
    images = soup.findAll(itemprop="image")
    # Raise explicitly instead of using `assert`, so the check survives
    # `python -O` and the message reports the real count (including zero).
    if len(images) != 1:
        raise AssertionError(
            "Expected exactly one captcha image, found %d." % len(images))

    captcha_img = images[0]['src']

    return _cbw.solve_captcha(captcha_img)
Ejemplo n.º 7
0
    def __init__(self,
                 url_formatter=None,
                 quantity=None,
                 page=None,
                 searchterms_str='laptop',
                 searchterms_fn=None,
                 site_name=None,
                 product_url=None,
                 user_agent=None,
                 captcha_retries='10',
                 *args,
                 **kwargs):
        """Initialise the spider from its command-line style arguments.

        url_formatter -- stored on ``self.url_formatter``; a plain
            ``string.Formatter()`` is used when omitted.
        quantity -- converted with ``int()``; when None the platform's
            largest int is stored on ``self.quantity``.
        page -- handled like ``quantity`` and stored on ``self.page``.
        searchterms_str -- comma-separated search terms (a UTF-8 byte string
            or an already-decoded text string).
        searchterms_fn -- path of a UTF-8 file with one search term per
            line; only consulted when ``searchterms_str`` is None.
        site_name -- when None, taken from the single entry of
            ``self.allowed_domains``.
        product_url -- stored as given.
        user_agent -- key into ``self.USER_AGENTS``; unknown or missing keys
            fall back to ``'default'``.
        captcha_retries -- converted with ``int()`` and stored on the
            instance.

        Remaining positional and keyword arguments go to the parent
        initialiser.
        """
        # Function-scope imports. io.open accepts `encoding` on Python 2 as
        # well as Python 3; the Python 2 builtin open() does not.
        import io
        import sys

        # sys.maxint only exists on Python 2; fall back to sys.maxsize.
        unlimited = getattr(sys, 'maxint', sys.maxsize)

        if user_agent is None or user_agent not in self.USER_AGENTS:
            self.log(
                "Not available user agent type or it wasn't set."
                " Default user agent will be used.", INFO)
            user_agent = 'default'

        # user_agent is a valid key at this point.
        self.user_agent = self.USER_AGENTS[user_agent]
        self.user_agent_key = user_agent

        super(AmazonSpider, self).__init__(*args, **kwargs)

        if site_name is None:
            assert len(self.allowed_domains) == 1, \
                "A single allowed domain is required to auto-detect site name."
            self.site_name = self.allowed_domains[0]
        else:
            self.site_name = site_name

        if url_formatter is None:
            self.url_formatter = string.Formatter()
        else:
            self.url_formatter = url_formatter

        if quantity is None:
            self.log("No quantity specified. Will retrieve all products.",
                     INFO)
            self.quantity = unlimited
        else:
            self.quantity = int(quantity)

        if page is None:
            self.log("No page specified. Will retrieve all products.", INFO)
            self.page = unlimited
        else:
            self.page = int(page)

        self.product_url = product_url

        self.searchterms = []
        if searchterms_str is not None:
            # Decode byte strings only; calling .decode() on text that is
            # already unicode can fail for non-ASCII terms on Python 2.
            if isinstance(searchterms_str, bytes):
                searchterms_str = searchterms_str.decode('utf-8')
            self.searchterms = searchterms_str.split(',')
        elif searchterms_fn is not None:
            with io.open(searchterms_fn, encoding='utf-8') as f:
                # Strip line terminators so they do not become part of the
                # search terms, and skip blank lines.
                self.searchterms = [
                    line.rstrip('\r\n') for line in f if line.strip()]
        else:
            self.log("No search terms provided!", ERROR)

        self.log(
            "Created for %s with %d search terms." %
            (self.site_name, len(self.searchterms)), INFO)

        self.captcha_retries = int(captcha_retries)
        self._cbw = CaptchaBreakerWrapper()
Ejemplo n.º 8
0
    def __init__(self, captcha_retries='10', *args, **kwargs):
        """Run the parent initialiser, then store the captcha settings.

        ``captcha_retries`` (default: the string ``'10'``) is converted with
        ``int()``; every other argument is forwarded to the parent class.
        """
        super(AmazonBaseClass, self).__init__(*args, **kwargs)

        retry_setting = int(captcha_retries)
        self.captcha_retries = retry_setting

        # Captcha breaker wrapper instance owned by this object.
        self._cbw = CaptchaBreakerWrapper()