Code example #1
    def closed(self, reason):
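        # Called when the spider closes: report crawl statistics, drop duplicate rows and push the collected dataframes to BigQuery.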
        try:
            ip_dict = {i:self.ip_addresses.count(i) for i in self.ip_addresses}
            ip_addr_str=""
            for ip, count in ip_dict.items():
                ip_addr_str = "{}{}: {}\n".format(ip_addr_str, ip, count)
            proxy_str=""
            for proxy, data in self.was_banned.items():
                proxy_str = "{}{}: {}\n".format(proxy_str, proxy, data[0])
            #ip_addresses_str = "\n".join(list(set(self.ip_addresses)))
            print("Used ip addresses: \n{}".format(ip_addr_str))
            print( "Ban count proxies: \n{}".format(proxy_str))
            print( "Captcha response count: {}".format(self.captcha_count))
            #send_msg(self.target, "Used ip addresses: \n{}".format(ip_addr_str), self.api_key)
            #send_msg(self.target, "Ban count proxies: \n{}".format(proxy_str), self.api_key)
            #send_msg(self.target, "Captcha response count: {}".format(self.captcha_count), self.api_key)
        except:
            pass

        self.drop_asins_already_crawled()

        send_msg(self.target, "Finished scraper {} with {} new products {} new images {} pages and reason: {}".format(self.name, len(self.df_products), len(self.df_mba_images), self.page_count, reason), self.api_key)
        
        # change types to fit with big query datatypes
        self.df_products['timestamp'] = self.df_products['timestamp'].astype('datetime64[ns]')
        self.df_mba_images['timestamp'] = self.df_mba_images['timestamp'].astype('datetime64[ns]')
        self.df_mba_relevance['timestamp'] = self.df_mba_relevance['timestamp'].astype('datetime64[ns]')
        self.df_mba_relevance['number'] = self.df_mba_relevance['number'].astype('int')
        
        # drop duplicates by asin
        self.df_products = self.df_products.drop_duplicates(["asin"])
        self.df_mba_images = self.df_mba_images.drop_duplicates(["asin"])

 
        try:
            self.df_products.to_gbq("mba_" + self.marketplace + ".products",project_id="mba-pipeline", if_exists="append")
        except:
            time.sleep(10)
            try:
                self.df_products.to_gbq("mba_" + self.marketplace + ".products",project_id="mba-pipeline", if_exists="append")
            except:
                self.store_df()

        try:
            self.df_mba_images.to_gbq("mba_" + self.marketplace + ".products_mba_images",project_id="mba-pipeline", if_exists="append")
        except:
            time.sleep(10)
            try:
                self.df_mba_images.to_gbq("mba_" + self.marketplace + ".products_mba_images",project_id="mba-pipeline", if_exists="append")
            except:
                self.store_df()

        try:
            self.df_mba_relevance.to_gbq("mba_" + self.marketplace + ".products_mba_relevance",project_id="mba-pipeline", if_exists="append")
        except:
            time.sleep(10)
            try:
                self.df_mba_relevance.to_gbq("mba_" + self.marketplace + ".products_mba_relevance",project_id="mba-pipeline", if_exists="append")
            except:
                self.store_df()
Code example #2
    def start_requests(self):
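        # Build the search URLs (from a CSV of search terms or from a single keyword) and schedule one request per result page.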
        urls_mba = []
        headers = get_random_headers(self.marketplace)
        # case: use a CSV with search terms
        if not self.df_search_terms.empty:
            for i, df_row in self.df_search_terms.iterrows():
                search_term = df_row["search_term"]
                url_mba = url_creator.main([search_term, self.marketplace, self.pod_product, self.sort])
                url_mba_page = url_mba + "&page=1"#+"&ref=sr_pg_"+str(page_number)
                urls_mba.append(url_mba_page)
        else:
            url_mba = url_creator.main([self.keyword, self.marketplace, self.pod_product, self.sort])
            send_msg(self.target, "Start scraper {} marketplace {} with {} pages and start page {} and sort {}".format(self.name, self.marketplace, self.pages, self.start_page, self.sort), self.api_key)
            # if start_page is other than one, the crawler should start from a different page
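            # the crawl is capped at 400 result pages (see the <= 400 check below), so 401 is used as the exclusive upper bound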
            until_page = 401

            if self.pages != 0:
                until_page = self.start_page + self.pages
            for page_number in np.arange(self.start_page, until_page, 1):
                if page_number <= 400:
                    url_mba_page = url_mba + "&page="+str(page_number)#+"&ref=sr_pg_"+str(page_number)
                    urls_mba.append(url_mba_page)
        for i, url_mba in enumerate(urls_mba):
            page = i + self.start_page
            # if self.marketplace == "com": 
            #     url_change_zip_code = "https://www.amazon.com/gp/delivery/ajax/address-change.html"
            #     yield scrapy.http.JsonRequest(url=url_change_zip_code, callback=self.change_zip_code, headers=headers, priority=i, data=self.change_zip_code_post_data,
            #                         errback=self.errback_httpbin, meta={"max_proxies_to_try": 30, 'page': page, "url": url_mba, "headers": headers})
            # else:
            yield scrapy.Request(url=url_mba, callback=self.parse, headers=headers, priority=i,
                                    errback=self.errback_httpbin, meta={"max_proxies_to_try": 30, 'page': page, "url": url_mba, "headers": headers})
Code example #3
    def closed(self, reason):
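        # Cancel the scheduled ban-reset call (assumed to be a timer started elsewhere) before reporting statistics.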
        try:
            self.reset_ban.cancel()
        except Exception as e:
            send_msg(self.target, "Could not cancel ban reset function",
                     self.api_key)
            print("Could not cancel ban reset function", str(e))
        try:
            ip_dict = {
                i: self.ip_addresses.count(i)
                for i in self.ip_addresses
            }
            ip_addr_str = ""
            for ip, count in ip_dict.items():
                ip_addr_str = "{}{}: {}\n".format(ip_addr_str, ip, count)
            proxy_str = ""
            for proxy, data in self.was_banned.items():
                proxy_str = "{}{}: {}\n".format(proxy_str, proxy, data[0])
            ip_addresses_str = "\n".join(list(set(self.ip_addresses)))
            #send_msg(self.target, "Used ip addresses: \n{}".format(ip_addr_str), self.api_key)
            #send_msg(self.target, "Ban count proxies: \n{}".format(proxy_str), self.api_key)
            print("Used ip addresses: \n{}".format(ip_addr_str))
            print("Ban count proxies: \n{}".format(proxy_str))
            #send_msg(self.target, "Captcha response count: {}".format(self.captcha_count), self.api_key)
        except:
            pass

        self.drop_asins_already_crawled()

        self.df_mba_images['timestamp'] = self.df_mba_images[
            'timestamp'].astype('datetime64[ns]')
        self.df_mba_images = self.df_mba_images.drop_duplicates(["asin"])

        try:
            self.df_mba_images.to_gbq("mba_" + self.marketplace +
                                      ".products_mba_images",
                                      project_id="mba-pipeline",
                                      if_exists="append")
        except:
            time.sleep(10)
            try:
                self.df_mba_images.to_gbq("mba_" + self.marketplace +
                                          ".products_mba_images",
                                          project_id="mba-pipeline",
                                          if_exists="append")
            except:
                self.store_df()
Code example #4
    def parse(self, response):
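        # Parse a product detail page: record the response IP, bail out on captcha pages, then extract price, BSR and customer review data.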
        self.ip_addresses.append(response.ip_address.compressed)
        asin = response.meta["asin"]
        if self.is_captcha_required(response):
            send_msg(self.target, "Captcha required" + " | asin: " + asin,
                     self.api_key)
            self.captcha_count = self.captcha_count + 1
            # add download delay if a captcha happens
            self.settings.attributes[
                "DOWNLOAD_DELAY"].value = self.settings.attributes[
                    "DOWNLOAD_DELAY"].value + 3
            if self.captcha_count > self.settings.attributes[
                    "MAX_CAPTCHA_NUMBER"].value:
                raise CloseSpider(reason='Too many captchas received')
            raise Exception("Captcha required")

        try:
            price_str, price = self.get_price(response)
        except Exception as e:
            self.save_content(response, asin)
            send_msg(self.target, str(e) + " | asin: " + asin, self.api_key)
            raise e
        try:
            mba_bsr_str, mba_bsr, array_mba_bsr, array_mba_bsr_categorie = self.get_bsr(
                response)
        except Exception as e:
            self.save_content(response, asin)
            send_msg(self.target, str(e) + " | asin: " + asin, self.api_key)
            raise e
        try:
            customer_review_score_mean, customer_review_score, customer_review_count = self.get_customer_review(
                response)
        except Exception as e:
            self.save_content(response, asin)
            send_msg(self.target, str(e) + " | asin: " + asin, self.api_key)
            raise e

        crawlingdate = datetime.datetime.now()
        df = pd.DataFrame(
            data={
                "asin": [asin],
                "price": [price],
                "price_str": [price_str],
                "bsr": [mba_bsr],
                "bsr_str": [mba_bsr_str],
                "array_bsr": [array_mba_bsr],
                "array_bsr_categorie": [array_mba_bsr_categorie],
                "customer_review_score_mean": [customer_review_score_mean],
                "customer_review_score": [customer_review_score],
                "customer_review_count": [customer_review_count],
                "timestamp": [crawlingdate]
            })
        self.df_products_details = self.df_products_details.append(df)

        if self.captcha_count > self.settings.attributes[
                "MAX_CAPTCHA_NUMBER"].value:
            raise CloseSpider(reason='Too many captchas received')
Code example #5
 def start_requests(self):
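     # Load product URLs and ASINs from the daily DE url list and schedule one request per product.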
     urls = pd.read_csv(
         "mba_crawler/url_data/urls_mba_daily_de.csv")["url"].tolist()
     asins = pd.read_csv(
         "mba_crawler/url_data/urls_mba_daily_de.csv")["asin"].tolist()
     send_msg(
         self.target,
         "Start scraper {} with {} products".format(self.name, len(urls)),
         self.api_key)
     for i, url in enumerate(urls):
         #proxies = proxy_handler.get_random_proxy_url_dict()
         headers = get_random_headers(self.marketplace)
         asin = asins[i]
         yield scrapy.Request(url=url,
                              callback=self.parse,
                              headers=headers,
                              errback=self.errback_httpbin,
                              meta={"asin":
                                    asin})  # "proxy": proxies["http"],
Code example #6
 def closed(self, reason):
     try:
         ip_dict = {
             i: self.ip_addresses.count(i)
             for i in self.ip_addresses
         }
         ip_addr_str = ""
         for ip, count in ip_dict.items():
             ip_addr_str = "{}{}: {}\n".format(ip_addr_str, ip, count)
         #ip_addresses_str = "\n".join(list(set(self.ip_addresses)))
         send_msg(self.target,
                  "Used ip addresses: \n{}".format(ip_addr_str),
                  self.api_key)
     except:
         pass
     send_msg(
         self.target,
         "Finished scraper {} with {} products and reason: {}".format(
             self.name, len(self.df_products_details),
             reason), self.api_key)
     self.df_products_details['timestamp'] = self.df_products_details[
         'timestamp'].astype('datetime64[ns]')
     self.df_products_details['bsr'] = self.df_products_details[
         'bsr'].astype('int')
     self.df_products_details[
         'customer_review_count'] = self.df_products_details[
             'customer_review_count'].astype('int')
     # update data in bigquery if batch is finished
     #'''
     try:
         self.df_products_details.to_gbq("mba_" + self.marketplace +
                                         ".products_details_daily",
                                         project_id="mba-pipeline",
                                         if_exists="append")
     except:
         time.sleep(10)
         try:
             self.df_products_details.to_gbq("mba_" + self.marketplace +
                                             ".products_details_daily",
                                             project_id="mba-pipeline",
                                             if_exists="append")
         except:
             self.store_df()
Code example #7
 def start_requests(self):
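     # Start the hourly proxy-ban reset, then schedule one prioritized request per URL/ASIN pair from the configured url file.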
     self.reset_was_banned_every_hour()
     urls = pd.read_csv(self.url_data_path)["url"].tolist()
     asins = pd.read_csv(self.url_data_path)["asin"].tolist()
     send_msg(
         self.target, "Start scraper {} daily {} with {} products".format(
             self.name, self.daily, len(urls)), self.api_key)
     for i, url in enumerate(urls):
         #proxies = proxy_handler.get_random_proxy_url_dict()
         headers = get_random_headers(self.marketplace)
         asin = asins[i]
         yield scrapy.Request(url=url,
                              callback=self.parse,
                              headers=headers,
                              priority=1,
                              errback=self.errback_httpbin,
                              meta={
                                  "asin": asin,
                                  "max_proxies_to_try": 20
                              })  # "proxy": proxies["http"],
Code example #8
    def errback_httpbin(self, failure):
        # log all errback failures,
        # in case you want to do something special for some errors,
        # you may need the failure's type
        self.logger.error(repr(failure))

        #if isinstance(failure.value, HttpError):
        if failure.check(HttpError):
            # you can get the response
            response = failure.value.response
            try:
                # if 404, update BigQuery
                if response.status == 404:
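                    # record sentinel 404 values for this ASIN so the missing product is still persisted with the rest of the results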
                    crawlingdate = datetime.datetime.now()
                    df = pd.DataFrame(
                        data={
                            "asin": [response.meta["asin"]],
                            "price": [404.0],
                            "price_str": ["404"],
                            "bsr": [404],
                            "bsr_str": ["404"],
                            "array_bsr": [["404"]],
                            "array_bsr_categorie": [["404"]],
                            "customer_review_score_mean": [404.0],
                            "customer_review_score": ["404"],
                            "customer_review_count": [404],
                            "timestamp": [crawlingdate]
                        })
                    self.df_products_details = self.df_products_details.append(
                        df)
                    print(
                        "HttpError on asin: {} | status_code: {} | ip address: {}"
                        .format(response.meta["asin"], response.status,
                                response.ip_address.compressed))
                else:
                    send_msg(
                        self.target,
                        "HttpError on asin: {} | status_code: {} | ip address: {}"
                        .format(response.meta["asin"], response.status,
                                response.ip_address.compressed), self.api_key)
            except:
                pass
            self.logger.error('HttpError on %s', response.url)

        #elif isinstance(failure.value, DNSLookupError):
        elif failure.check(DNSLookupError):
            # this is the original request
            request = failure.request
            send_msg(self.target,
                     "DNSLookupError on url: {}".format(request.url),
                     self.api_key)
            self.logger.error('DNSLookupError on %s', request.url)

        #elif isinstance(failure.value, TimeoutError):
        elif failure.check(TimeoutError):
            request = failure.request
            send_msg(self.target,
                     "TimeoutError on url: {}".format(request.url),
                     self.api_key)
            self.logger.error('TimeoutError on %s', request.url)
Code example #9
    def closed(self, reason):
        try:
            self.reset_ban.cancel()
        except Exception as e:
            send_msg(self.target, "Could not cancel ban reset function",
                     self.api_key)
            print("Could not cancel ban reset function", str(e))
        try:
            ip_dict = {
                i: self.ip_addresses.count(i)
                for i in self.ip_addresses
            }
            ip_addr_str = ""
            for ip, count in ip_dict.items():
                ip_addr_str = "{}{}: {}\n".format(ip_addr_str, ip, count)
            proxy_str = ""
            for proxy, data in self.was_banned.items():
                proxy_str = "{}{}: {}\n".format(proxy_str, proxy, data[0])
            ip_addresses_str = "\n".join(list(set(self.ip_addresses)))
            #send_msg(self.target, "Used ip addresses: \n{}".format(ip_addr_str), self.api_key)
            #send_msg(self.target, "Ban count proxies: \n{}".format(proxy_str), self.api_key)
            print("Used ip addresses: \n{}".format(ip_addr_str))
            print("Ban count proxies: \n{}".format(proxy_str))
            #send_msg(self.target, "Captcha response count: {}".format(self.captcha_count), self.api_key)
        except:
            pass

        client = bigquery.Client()

        # delete all asins in asin_list_remove_from_blacklist from black list
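        # build a quoted, comma-separated value list for the SQL IN clause, e.g. ('ASIN1','ASIN2')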
        SQL_IS_IN = "({})".format(",".join(
            ["'%s'" % v for v in self.asin_list_remove_from_blacklist]))
        query_job = client.query(
            """DELETE FROM `mba-pipeline.mba_de.products_no_mba_shirt` WHERE asin in {}
            """.format(SQL_IS_IN))
        results = query_job.result()
Code example #10
    def parse(self, response):
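        # Parse a search result page: on a captcha or missing .com results, retry with a fresh request; otherwise extract every shirt and queue its images.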
        proxy = self.get_proxy(response)
        url = response.url
        page = response.meta["page"]
        image_urls = []
        asins = []
        url_mba_lowqs = []

        #self.get_zip_code_location(response)
        #self.get_count_results(response)

        if self.is_captcha_required(response):
            #self.response_is_ban(request, response, is_ban=True)
            print("Captcha required for proxy: " + proxy)
            self.captcha_count = self.captcha_count + 1
            self.update_ban_count(proxy)            
            headers = get_random_headers(self.marketplace)
            # send new request with high priority
            request = scrapy.Request(url=url, callback=self.parse, headers=headers, priority=0, dont_filter=True,
                                    errback=self.errback_httpbin, meta={"max_proxies_to_try": 30, "page": page})
            yield request
        else:
            
            if self.should_zip_code_be_changed(response):
                print("Proxy does not get all .com results: " + proxy)
                self.update_ban_count(proxy)   
                headers = get_random_headers(self.marketplace)
                # send new request with high priority
                request = scrapy.Request(url=url, callback=self.parse, headers=headers, priority=0, dont_filter=True,
                                        errback=self.errback_httpbin, meta={"max_proxies_to_try": 30, "page": page})
                yield request
                # change zip code
                # meta_dict = {"max_proxies_to_try": 30, 'page': page, "url": url, "headers": response.meta["headers"]}
                # url_change_zip_code = "https://www.amazon.com/gp/delivery/ajax/address-change.html"
                # if self.is_perfect_privacy_proxy(response):
                #     proxy = "http://*****:*****@" + response.meta["download_slot"] + ":3128"
                # meta_dict.update({"proxy": proxy, "_rotating_proxy": False})
                # yield scrapy.http.JsonRequest(url=url_change_zip_code, callback=self.change_zip_code, headers=response.meta["headers"], priority=0, data=self.change_zip_code_post_data,
                #                     errback=self.errback_httpbin, meta=meta_dict, dont_filter=True)
            else:
                self.ip_addresses.append(response.ip_address.compressed)
                shirts = response.css('div.sg-col-inner')
                shirt_number_page = 0
                for i, shirt in enumerate(shirts):
                    if not self.is_shirt(shirt):
                        continue
                    shirt_number_page = shirt_number_page + 1
                    try:
                        price = self.get_price(shirt)
                    except Exception as e:
                        self.save_content(response, url)
                        send_msg(self.target, str(e) + " | url: " + url, self.api_key)
                        raise e
                    try:
                        title = self.get_title(shirt)
                    except Exception as e:
                        self.save_content(response, url)
                        send_msg(self.target, str(e) + " | url: " + url, self.api_key)
                        raise e
                    try:
                        brand = self.get_brand(shirt)
                    except Exception as e:
                        print("Could not get brand of shirt: ",title)
                        brand = None
                        # it's possible that Amazon does not show the brand on the overview page, therefore a raise is not necessary
                        #self.save_content(response, url)
                        #send_msg(self.target, str(e) + " | url: " + url, self.api_key)
                        #raise e
                    try:
                        url_product = self.get_url_product(shirt, url)
                    except Exception as e:
                        self.save_content(response, url)
                        send_msg(self.target, str(e) + " | url: " + url, self.api_key)
                        raise e
                    try:
                        url_image_lowq,url_image_q2,url_image_q3,url_image_q4,url_image_hq = self.get_img_urls(shirt)
                    except Exception as e:
                        self.save_content(response, url)
                        send_msg(self.target, str(e) + " | url: " + url, self.api_key)
                        raise e
                    try:
                        asin = self.get_asin(shirt)
                    except Exception as e:
                        self.save_content(response, url)
                        send_msg(self.target, str(e) + " | url: " + url, self.api_key)
                        raise e
                    try:
                        uuid = self.get_uuid(shirt)
                    except Exception as e:
                        self.save_content(response, url)
                        send_msg(self.target, str(e) + " | url: " + url, self.api_key)
                        raise e
                        
                    crawlingdate = datetime.datetime.now()
                    # append to general crawler
                    df_products = pd.DataFrame(data={"title":[title],"brand":[brand],"url_product":[url_product],"url_image_lowq":[url_image_lowq],"url_image_hq":[url_image_hq],"price":[price],"asin":[asin],"uuid":[uuid], "timestamp":[crawlingdate]})
                    df_mba_images = pd.DataFrame(data={"asin":[asin],"url_image_lowq":[url_image_lowq],"url_image_q2":[url_image_q2], "url_image_q3":[url_image_q3], "url_image_q4":[url_image_q4],"url_image_hq":[url_image_hq], "timestamp":[crawlingdate]})
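                    # overall rank of the shirt across all result pages (position on this page + completed pages * shirts per page)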
                    shirt_number = int(shirt_number_page + ((int(page)-1)*self.shirts_per_page))
                    df_mba_relevance = pd.DataFrame(data={"asin":[asin],"sort":[self.sort],"number":[shirt_number],"timestamp":[crawlingdate]})

                    self.df_products = self.df_products.append(df_products)
                    self.df_mba_images = self.df_mba_images.append(df_mba_images)
                    self.df_mba_relevance = self.df_mba_relevance.append(df_mba_relevance)

                    # only crawl the image if it has not been downloaded already
                    if asin not in self.products_images_already_downloaded:
                        image_urls.append(url_image_hq)
                        asins.append(asin)
                        url_mba_lowqs.append(url_image_lowq)

                # crawl images
                image_item = MbaCrawlerItem()
                image_item["image_urls"] = image_urls
                image_item["asins"] = asins
                image_item["url_mba_lowqs"] = url_mba_lowqs
                image_item["marketplace"] = self.marketplace
                if self.marketplace in ["com", "de"]:
                    yield image_item
                
                self.page_count = self.page_count + 1
                self.status_update()


                #url_next = "/".join(url.split("/")[0:3]) + response.css("ul.a-pagination li.a-last a::attr(href)").get()
                
                '''
Code example #11
    def closed(self, reason):
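        # Report statistics, cast columns to BigQuery-compatible dtypes and upload the general, daily, no-MBA-shirt and no-BSR dataframes.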
        try:
            self.reset_ban.cancel()
        except Exception as e:
            send_msg(self.target, "Could not cancel ban reset function",
                     self.api_key)
            print("Could not cancel ban reset function", str(e))
        try:
            ip_dict = {
                i: self.ip_addresses.count(i)
                for i in self.ip_addresses
            }
            ip_addr_str = ""
            for ip, count in ip_dict.items():
                ip_addr_str = "{}{}: {}\n".format(ip_addr_str, ip, count)
            proxy_str = ""
            for proxy, data in self.was_banned.items():
                proxy_str = "{}{}: {}\n".format(proxy_str, proxy, data[0])
            ip_addresses_str = "\n".join(list(set(self.ip_addresses)))
            #send_msg(self.target, "Used ip addresses: \n{}".format(ip_addr_str), self.api_key)
            #send_msg(self.target, "Ban count proxies: \n{}".format(proxy_str), self.api_key)
            print("Used ip addresses: \n{}".format(ip_addr_str))
            print("Ban count proxies: \n{}".format(proxy_str))
            #send_msg(self.target, "Captcha response count: {}".format(self.captcha_count), self.api_key)
        except:
            pass
        send_msg(
            self.target,
            "Finished scraper {} daily {} with {} products and reason: {}".
            format(self.name, self.daily, len(self.df_products_details_daily),
                   reason), self.api_key)
        print("Finished scraper {} daily {} with {} products and reason: {}".
              format(self.name, self.daily,
                     len(self.df_products_details_daily), reason))
        if not self.daily:
            # change types to fit with big query datatypes
            self.df_products_details['color_count'] = self.df_products_details[
                'color_count'].astype('int')
            self.df_products_details['timestamp'] = self.df_products_details[
                'timestamp'].astype('datetime64[ns]')
            self.df_products_details['upload_date'] = self.df_products_details[
                'upload_date'].astype('datetime64[ns]')
            self.df_products_details[
                'customer_review_count'] = self.df_products_details[
                    'customer_review_count'].astype('int')

        # change types of daily dataframe
        self.df_products_details_daily[
            'timestamp'] = self.df_products_details_daily['timestamp'].astype(
                'datetime64[ns]')
        self.df_products_details_daily['bsr'] = self.df_products_details_daily[
            'bsr'].astype('int')
        self.df_products_details_daily[
            'customer_review_count'] = self.df_products_details_daily[
                'customer_review_count'].astype('int')

        # update data in bigquery if batch is finished
        if not self.daily:
            try:
                self.df_products_details.to_gbq("mba_" + self.marketplace +
                                                ".products_details",
                                                project_id="mba-pipeline",
                                                if_exists="append")
            except:
                time.sleep(10)
                try:
                    self.df_products_details.to_gbq("mba_" + self.marketplace +
                                                    ".products_details",
                                                    project_id="mba-pipeline",
                                                    if_exists="append")
                except:
                    self.store_df()

        try:
            self.df_products_details_daily.to_gbq("mba_" + self.marketplace +
                                                  ".products_details_daily",
                                                  project_id="mba-pipeline",
                                                  if_exists="append")
        except:
            time.sleep(10)
            try:
                self.df_products_details_daily.to_gbq(
                    "mba_" + self.marketplace + ".products_details_daily",
                    project_id="mba-pipeline",
                    if_exists="append")
            except:
                self.store_df()

        if not self.df_products_no_mba_shirt.empty:
            self.df_products_no_mba_shirt.to_gbq("mba_" + self.marketplace +
                                                 ".products_no_mba_shirt",
                                                 project_id="mba-pipeline",
                                                 if_exists="append")

        print(self.df_products_no_bsr)
        if not self.df_products_no_bsr.empty:
            self.df_products_no_bsr.to_gbq("mba_" + self.marketplace +
                                           ".products_no_bsr",
                                           project_id="mba-pipeline",
                                           if_exists="append")
Code example #12
    def parse(self, response):
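        # Parse a product detail page: retry on captcha, record non-MBA shirts separately, otherwise extract daily (and, for full crawls, general) product data.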
        asin = response.meta["asin"]
        proxy = self.get_proxy(response)

        url = response.url
        #send_msg(self.target, "Response catched: {} with proxy {}".format(url,proxy), self.api_key)
        if self.is_captcha_required(response):
            #self.response_is_ban(request, response, is_ban=True)
            print("Captcha required for proxy: " + proxy)
            self.captcha_count = self.captcha_count + 1
            self.update_ban_count(proxy)
            #send_msg(self.target, "Captcha: " + url, self.api_key)

            headers = get_random_headers(self.marketplace)
            # send new request with high priority
            request = scrapy.Request(url=url,
                                     callback=self.parse,
                                     headers=headers,
                                     priority=0,
                                     dont_filter=True,
                                     errback=self.errback_httpbin,
                                     meta={"asin": asin})
            yield request
            '''
            raise Exception("Captcha required")
            send_msg(self.target, "Captcha required" + " | asin: " + asin, self.api_key)
            self.captcha_count = self.captcha_count + 1
            # add download delay if a captcha happens
            self.settings.attributes["DOWNLOAD_DELAY"].value = self.settings.attributes["DOWNLOAD_DELAY"].value + 3
            if self.captcha_count > self.settings.attributes["MAX_CAPTCHA_NUMBER"].value:
                raise CloseSpider(reason='Too many captchas received')
            raise Exception("Captcha required")
            '''
        # do not proceed if it's not an MBA shirt
        elif not self.is_mba_shirt(response):
            self.df_products_no_mba_shirt = self.df_products_no_mba_shirt.append(
                pd.DataFrame(
                    data={
                        "asin": [asin],
                        "url": [url],
                        "timestamp": [datetime.datetime.now()]
                    }))
        else:
            self.ip_addresses.append(response.ip_address.compressed)
            try:
                price_str, price = self.get_price(response)
            except Exception as e:
                #self.save_content(response, asin)
                #send_msg(self.target, str(e) + " | asin: " + asin, self.api_key)
                price_str, price = "", 0.0
            try:
                mba_bsr_str, mba_bsr, array_mba_bsr, array_mba_bsr_categorie = self.get_bsr(
                    response)
            except Exception as e:
                self.save_content(response, asin)
                #send_msg(self.target, str(e) + " | asin: " + asin, self.api_key)
                if "no bsr" in str(e):
                    self.df_products_no_bsr = self.df_products_no_bsr.append(
                        pd.DataFrame(
                            data={
                                "asin": [asin],
                                "url": [url],
                                "timestamp": [datetime.datetime.now()]
                            }))
                if self.daily:
                    raise e
                else:
                    # Cases exist like https://www.amazon.com/dp/B0855BCBZ6, which should have a BSR but do not contain it in the HTML.
                    # Therefore, we want to crawl it just once (if not a daily crawl).
                    mba_bsr_str, mba_bsr, array_mba_bsr, array_mba_bsr_categorie = "", 0, [], []
            try:
                customer_review_score_mean, customer_review_score, customer_review_count = self.get_customer_review(
                    response)
            except Exception as e:
                self.save_content(response, asin)
                send_msg(self.target,
                         str(e) + " | asin: " + asin, self.api_key)
                raise e
            # if not the daily crawler, more data from the website needs to be crawled
            if not self.daily:
                try:
                    title = self.get_title(response)
                except Exception as e:
                    self.save_content(response, asin)
                    send_msg(self.target,
                             str(e) + " | asin: " + asin, self.api_key)
                    raise e
                try:
                    brand, url_brand = self.get_brand_infos(response)
                except Exception as e:
                    self.save_content(response, asin)
                    send_msg(self.target,
                             str(e) + " | asin: " + asin, self.api_key)
                    raise e
                try:
                    fit_types = self.get_fit_types(response)
                except Exception as e:
                    self.save_content(response, asin)
                    send_msg(self.target,
                             str(e) + " | asin: " + asin, self.api_key)
                    raise e
                try:
                    array_color_names, color_count = self.get_color_infos(
                        response)
                except Exception as e:
                    self.save_content(response, asin)
                    send_msg(self.target,
                             str(e) + " | asin: " + asin, self.api_key)
                    raise e
                try:
                    array_product_feature = self.get_product_features(response)
                except Exception as e:
                    self.save_content(response, asin)
                    send_msg(self.target,
                             str(e) + " | asin: " + asin, self.api_key)
                    raise e
                try:
                    description = self.get_description(response)
                except Exception as e:
                    #self.save_content(response, asin)
                    #send_msg(self.target, str(e) + "| asin: " + asin, self.api_key)
                    #raise e
                    description = ""
                try:
                    weight = self.get_weight(response)
                except Exception as e:
                    weight = "not found"
                    self.save_content(response, asin)
                    send_msg(self.target,
                             str(e) + " | asin: " + asin, self.api_key)
                try:
                    upload_date_str, upload_date = self.get_upload_date(
                        response)
                except Exception as e:
                    self.save_content(response, asin)
                    send_msg(self.target,
                             str(e) + " | asin: " + asin, self.api_key)
                    raise e

            crawlingdate = datetime.datetime.now()
            if not self.daily:
                # append to general crawler
                df = pd.DataFrame(
                    data={
                        "asin": [asin],
                        "title": [title],
                        "brand": [brand],
                        "url_brand": [url_brand],
                        "price": [price_str],
                        "fit_types": [fit_types],
                        "color_names": [array_color_names],
                        "color_count": [color_count],
                        "product_features": [array_product_feature],
                        "description": [description],
                        "weight": [weight],
                        "upload_date_str": [upload_date_str],
                        "upload_date": [upload_date],
                        "customer_review_score": [customer_review_score],
                        "customer_review_count": [customer_review_count],
                        "mba_bsr_str": [mba_bsr_str],
                        "mba_bsr": [array_mba_bsr],
                        "mba_bsr_categorie": [array_mba_bsr_categorie],
                        "timestamp": [crawlingdate]
                    })
                self.df_products_details = self.df_products_details.append(df)

            # append to daily crawler
            df = pd.DataFrame(
                data={
                    "asin": [asin],
                    "price": [price],
                    "price_str": [price_str],
                    "bsr": [mba_bsr],
                    "bsr_str": [mba_bsr_str],
                    "array_bsr": [array_mba_bsr],
                    "array_bsr_categorie": [array_mba_bsr_categorie],
                    "customer_review_score_mean": [customer_review_score_mean],
                    "customer_review_score": [customer_review_score],
                    "customer_review_count": [customer_review_count],
                    "timestamp": [crawlingdate]
                })
            self.df_products_details_daily = self.df_products_details_daily.append(
                df)

            self.status_update()