def closed(self, reason):
    try:
        ip_dict = {i: self.ip_addresses.count(i) for i in self.ip_addresses}
        ip_addr_str = ""
        for ip, count in ip_dict.items():
            ip_addr_str = "{}{}: {}\n".format(ip_addr_str, ip, count)
        proxy_str = ""
        for proxy, data in self.was_banned.items():
            proxy_str = "{}{}: {}\n".format(proxy_str, proxy, data[0])
        #ip_addresses_str = "\n".join(list(set(self.ip_addresses)))
        print("Used ip addresses: \n{}".format(ip_addr_str))
        print("Ban count proxies: \n{}".format(proxy_str))
        print("Captcha response count: {}".format(self.captcha_count))
        #send_msg(self.target, "Used ip addresses: \n{}".format(ip_addr_str), self.api_key)
        #send_msg(self.target, "Ban count proxies: \n{}".format(proxy_str), self.api_key)
        #send_msg(self.target, "Captcha response count: {}".format(self.captcha_count), self.api_key)
    except Exception:
        pass

    self.drop_asins_already_crawled()

    send_msg(self.target, "Finished scraper {} with {} new products {} new images {} pages and reason: {}".format(
        self.name, len(self.df_products), len(self.df_mba_images), self.page_count, reason), self.api_key)

    # change types to fit with BigQuery datatypes
    self.df_products['timestamp'] = self.df_products['timestamp'].astype('datetime64[ns]')
    self.df_mba_images['timestamp'] = self.df_mba_images['timestamp'].astype('datetime64[ns]')
    self.df_mba_relevance['timestamp'] = self.df_mba_relevance['timestamp'].astype('datetime64[ns]')
    self.df_mba_relevance['number'] = self.df_mba_relevance['number'].astype('int')

    # drop duplicates by asin
    self.df_products = self.df_products.drop_duplicates(["asin"])
    self.df_mba_images = self.df_mba_images.drop_duplicates(["asin"])

    # upload each dataframe to BigQuery; retry once after a short delay, otherwise store locally
    try:
        self.df_products.to_gbq("mba_" + self.marketplace + ".products", project_id="mba-pipeline", if_exists="append")
    except Exception:
        time.sleep(10)
        try:
            self.df_products.to_gbq("mba_" + self.marketplace + ".products", project_id="mba-pipeline", if_exists="append")
        except Exception:
            self.store_df()

    try:
        self.df_mba_images.to_gbq("mba_" + self.marketplace + ".products_mba_images", project_id="mba-pipeline", if_exists="append")
    except Exception:
        time.sleep(10)
        try:
            self.df_mba_images.to_gbq("mba_" + self.marketplace + ".products_mba_images", project_id="mba-pipeline", if_exists="append")
        except Exception:
            self.store_df()

    try:
        self.df_mba_relevance.to_gbq("mba_" + self.marketplace + ".products_mba_relevance", project_id="mba-pipeline", if_exists="append")
    except Exception:
        time.sleep(10)
        try:
            self.df_mba_relevance.to_gbq("mba_" + self.marketplace + ".products_mba_relevance", project_id="mba-pipeline", if_exists="append")
        except Exception:
            self.store_df()
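# Note: the "try to_gbq, sleep, retry once, otherwise store_df()" pattern above repeats for every
# dataframe. Below is a minimal helper sketch that factors it out; it assumes the same pandas-gbq
# API and the existing self.store_df() fallback. The helper name upload_with_retry and the
# retry_delay_seconds parameter are illustrative, not part of the original code.
def upload_with_retry(self, df, table_suffix, retry_delay_seconds=10):
    """Append df to BigQuery, retrying once after a short delay and falling back to store_df()."""
    destination_table = "mba_" + self.marketplace + "." + table_suffix
    try:
        df.to_gbq(destination_table, project_id="mba-pipeline", if_exists="append")
    except Exception:
        time.sleep(retry_delay_seconds)
        try:
            df.to_gbq(destination_table, project_id="mba-pipeline", if_exists="append")
        except Exception:
            # keep the data locally so the batch is not lost
            self.store_df()

# Example usage (hypothetical): self.upload_with_retry(self.df_products, "products")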
def start_requests(self):
    urls_mba = []
    headers = get_random_headers(self.marketplace)
    # case: use a csv with search terms
    if not self.df_search_terms.empty:
        for i, df_row in self.df_search_terms.iterrows():
            search_term = df_row["search_term"]
            url_mba = url_creator.main([search_term, self.marketplace, self.pod_product, self.sort])
            url_mba_page = url_mba + "&page=1"  # +"&ref=sr_pg_"+str(page_number)
            urls_mba.append(url_mba_page)
    else:
        url_mba = url_creator.main([self.keyword, self.marketplace, self.pod_product, self.sort])
        send_msg(self.target, "Start scraper {} marketplace {} with {} pages and start page {} and sort {}".format(
            self.name, self.marketplace, self.pages, self.start_page, self.sort), self.api_key)
        # if start_page is other than one, the crawler should start from a different page
        until_page = 401
        if self.pages != 0:
            until_page = self.start_page + self.pages
        for page_number in np.arange(self.start_page, until_page, 1):
            if page_number <= 400:
                url_mba_page = url_mba + "&page=" + str(page_number)  # +"&ref=sr_pg_"+str(page_number)
                urls_mba.append(url_mba_page)

    for i, url_mba in enumerate(urls_mba):
        page = i + self.start_page
        # if self.marketplace == "com":
        #     url_change_zip_code = "https://www.amazon.com/gp/delivery/ajax/address-change.html"
        #     yield scrapy.http.JsonRequest(url=url_change_zip_code, callback=self.change_zip_code, headers=headers, priority=i, data=self.change_zip_code_post_data,
        #                                   errback=self.errback_httpbin, meta={"max_proxies_to_try": 30, 'page': page, "url": url_mba, "headers": headers})
        # else:
        yield scrapy.Request(url=url_mba, callback=self.parse, headers=headers, priority=i,
                             errback=self.errback_httpbin,
                             meta={"max_proxies_to_try": 30, 'page': page, "url": url_mba, "headers": headers})
def closed(self, reason):
    try:
        self.reset_ban.cancel()
    except Exception as e:
        send_msg(self.target, "Could not cancel ban reset function", self.api_key)
        print("Could not cancel ban reset function", str(e))
    try:
        ip_dict = {i: self.ip_addresses.count(i) for i in self.ip_addresses}
        ip_addr_str = ""
        for ip, count in ip_dict.items():
            ip_addr_str = "{}{}: {}\n".format(ip_addr_str, ip, count)
        proxy_str = ""
        for proxy, data in self.was_banned.items():
            proxy_str = "{}{}: {}\n".format(proxy_str, proxy, data[0])
        #send_msg(self.target, "Used ip addresses: \n{}".format(ip_addr_str), self.api_key)
        #send_msg(self.target, "Ban count proxies: \n{}".format(proxy_str), self.api_key)
        #send_msg(self.target, "Captcha response count: {}".format(self.captcha_count), self.api_key)
        print("Used ip addresses: \n{}".format(ip_addr_str))
        print("Ban count proxies: \n{}".format(proxy_str))
    except Exception:
        pass

    self.drop_asins_already_crawled()

    self.df_mba_images['timestamp'] = self.df_mba_images['timestamp'].astype('datetime64[ns]')
    self.df_mba_images = self.df_mba_images.drop_duplicates(["asin"])

    try:
        self.df_mba_images.to_gbq("mba_" + self.marketplace + ".products_mba_images", project_id="mba-pipeline", if_exists="append")
    except Exception:
        time.sleep(10)
        try:
            self.df_mba_images.to_gbq("mba_" + self.marketplace + ".products_mba_images", project_id="mba-pipeline", if_exists="append")
        except Exception:
            self.store_df()
def parse(self, response):
    self.ip_addresses.append(response.ip_address.compressed)
    asin = response.meta["asin"]
    if self.is_captcha_required(response):
        send_msg(self.target, "Captcha required" + " | asin: " + asin, self.api_key)
        self.captcha_count = self.captcha_count + 1
        # add download delay if a captcha happens
        self.settings.attributes["DOWNLOAD_DELAY"].value = self.settings.attributes["DOWNLOAD_DELAY"].value + 3
        if self.captcha_count > self.settings.attributes["MAX_CAPTCHA_NUMBER"].value:
            raise CloseSpider(reason='Too many captchas received')
        raise Exception("Captcha required")

    try:
        price_str, price = self.get_price(response)
    except Exception as e:
        self.save_content(response, asin)
        send_msg(self.target, str(e) + " | asin: " + asin, self.api_key)
        raise e
    try:
        mba_bsr_str, mba_bsr, array_mba_bsr, array_mba_bsr_categorie = self.get_bsr(response)
    except Exception as e:
        self.save_content(response, asin)
        send_msg(self.target, str(e) + " | asin: " + asin, self.api_key)
        raise e
    try:
        customer_review_score_mean, customer_review_score, customer_review_count = self.get_customer_review(response)
    except Exception as e:
        self.save_content(response, asin)
        send_msg(self.target, str(e) + " | asin: " + asin, self.api_key)
        raise e

    crawlingdate = datetime.datetime.now()
    df = pd.DataFrame(
        data={
            "asin": [asin],
            "price": [price],
            "price_str": [price_str],
            "bsr": [mba_bsr],
            "bsr_str": [mba_bsr_str],
            "array_bsr": [array_mba_bsr],
            "array_bsr_categorie": [array_mba_bsr_categorie],
            "customer_review_score_mean": [customer_review_score_mean],
            "customer_review_score": [customer_review_score],
            "customer_review_count": [customer_review_count],
            "timestamp": [crawlingdate]
        })
    self.df_products_details = self.df_products_details.append(df)

    if self.captcha_count > self.settings.attributes["MAX_CAPTCHA_NUMBER"].value:
        raise CloseSpider(reason='Too many captchas received')
def start_requests(self):
    urls = pd.read_csv("mba_crawler/url_data/urls_mba_daily_de.csv")["url"].tolist()
    asins = pd.read_csv("mba_crawler/url_data/urls_mba_daily_de.csv")["asin"].tolist()
    send_msg(self.target, "Start scraper {} with {} products".format(self.name, len(urls)), self.api_key)
    for i, url in enumerate(urls):
        #proxies = proxy_handler.get_random_proxy_url_dict()
        headers = get_random_headers(self.marketplace)
        asin = asins[i]
        yield scrapy.Request(url=url, callback=self.parse, headers=headers,
                             errback=self.errback_httpbin,
                             meta={"asin": asin})  # "proxy": proxies["http"],
def closed(self, reason):
    try:
        ip_dict = {i: self.ip_addresses.count(i) for i in self.ip_addresses}
        ip_addr_str = ""
        for ip, count in ip_dict.items():
            ip_addr_str = "{}{}: {}\n".format(ip_addr_str, ip, count)
        #ip_addresses_str = "\n".join(list(set(self.ip_addresses)))
        send_msg(self.target, "Used ip addresses: \n{}".format(ip_addr_str), self.api_key)
    except Exception:
        pass

    send_msg(self.target, "Finished scraper {} with {} products and reason: {}".format(
        self.name, len(self.df_products_details), reason), self.api_key)

    # change types to fit with BigQuery datatypes
    self.df_products_details['timestamp'] = self.df_products_details['timestamp'].astype('datetime64[ns]')
    self.df_products_details['bsr'] = self.df_products_details['bsr'].astype('int')
    self.df_products_details['customer_review_count'] = self.df_products_details['customer_review_count'].astype('int')

    # update data in BigQuery if batch is finished
    try:
        self.df_products_details.to_gbq("mba_" + self.marketplace + ".products_details_daily", project_id="mba-pipeline", if_exists="append")
    except Exception:
        time.sleep(10)
        try:
            self.df_products_details.to_gbq("mba_" + self.marketplace + ".products_details_daily", project_id="mba-pipeline", if_exists="append")
        except Exception:
            self.store_df()
def start_requests(self):
    self.reset_was_banned_every_hour()
    urls = pd.read_csv(self.url_data_path)["url"].tolist()
    asins = pd.read_csv(self.url_data_path)["asin"].tolist()
    send_msg(self.target, "Start scraper {} daily {} with {} products".format(
        self.name, self.daily, len(urls)), self.api_key)
    for i, url in enumerate(urls):
        #proxies = proxy_handler.get_random_proxy_url_dict()
        headers = get_random_headers(self.marketplace)
        asin = asins[i]
        yield scrapy.Request(url=url, callback=self.parse, headers=headers, priority=1,
                             errback=self.errback_httpbin,
                             meta={"asin": asin, "max_proxies_to_try": 20})  # "proxy": proxies["http"],
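# The body of reset_was_banned_every_hour() is not part of this excerpt. Since closed() calls
# self.reset_ban.cancel(), one plausible sketch is a threading.Timer that clears the per-proxy ban
# counters and reschedules itself every hour. This is an illustrative assumption, not the original
# implementation; in particular, clearing self.was_banned entirely is only one way to "reset" it.
import threading

def reset_was_banned_every_hour(self):
    # assumption: reset the per-proxy ban bookkeeping and schedule the next reset in one hour
    self.was_banned = {}
    self.reset_ban = threading.Timer(60 * 60, self.reset_was_banned_every_hour)
    self.reset_ban.daemon = True  # do not block process shutdown
    self.reset_ban.start()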
def errback_httpbin(self, failure):
    # log all errback failures;
    # in case you want to do something special for some errors,
    # you may need the failure's type
    self.logger.error(repr(failure))

    #if isinstance(failure.value, HttpError):
    if failure.check(HttpError):
        # you can get the response
        response = failure.value.response
        try:
            # if 404, record a placeholder row for BigQuery
            if response.status == 404:
                crawlingdate = datetime.datetime.now()
                df = pd.DataFrame(
                    data={
                        "asin": [response.meta["asin"]],
                        "price": [404.0],
                        "price_str": ["404"],
                        "bsr": [404],
                        "bsr_str": ["404"],
                        "array_bsr": [["404"]],
                        "array_bsr_categorie": [["404"]],
                        "customer_review_score_mean": [404.0],
                        "customer_review_score": ["404"],
                        "customer_review_count": [404],
                        "timestamp": [crawlingdate]
                    })
                self.df_products_details = self.df_products_details.append(df)
                print("HttpError on asin: {} | status_code: {} | ip address: {}".format(
                    response.meta["asin"], response.status, response.ip_address.compressed))
            else:
                send_msg(self.target,
                         "HttpError on asin: {} | status_code: {} | ip address: {}".format(
                             response.meta["asin"], response.status, response.ip_address.compressed),
                         self.api_key)
        except Exception:
            pass
        self.logger.error('HttpError on %s', response.url)

    #elif isinstance(failure.value, DNSLookupError):
    elif failure.check(DNSLookupError):
        # this is the original request
        request = failure.request
        send_msg(self.target, "DNSLookupError on url: {}".format(request.url), self.api_key)
        self.logger.error('DNSLookupError on %s', request.url)

    #elif isinstance(failure.value, TimeoutError):
    elif failure.check(TimeoutError):
        request = failure.request
        send_msg(self.target, "TimeoutError on url: {}".format(request.url), self.api_key)
        self.logger.error('TimeoutError on %s', request.url)
def closed(self, reason):
    try:
        self.reset_ban.cancel()
    except Exception as e:
        send_msg(self.target, "Could not cancel ban reset function", self.api_key)
        print("Could not cancel ban reset function", str(e))
    try:
        ip_dict = {i: self.ip_addresses.count(i) for i in self.ip_addresses}
        ip_addr_str = ""
        for ip, count in ip_dict.items():
            ip_addr_str = "{}{}: {}\n".format(ip_addr_str, ip, count)
        proxy_str = ""
        for proxy, data in self.was_banned.items():
            proxy_str = "{}{}: {}\n".format(proxy_str, proxy, data[0])
        #send_msg(self.target, "Used ip addresses: \n{}".format(ip_addr_str), self.api_key)
        #send_msg(self.target, "Ban count proxies: \n{}".format(proxy_str), self.api_key)
        #send_msg(self.target, "Captcha response count: {}".format(self.captcha_count), self.api_key)
        print("Used ip addresses: \n{}".format(ip_addr_str))
        print("Ban count proxies: \n{}".format(proxy_str))
    except Exception:
        pass

    client = bigquery.Client()
    # delete all asins in asin_list_remove_from_blacklist from the blacklist
    SQL_IS_IN = "({})".format(",".join(["'%s'" % v for v in self.asin_list_remove_from_blacklist]))
    query_job = client.query(
        """DELETE FROM `mba-pipeline.mba_de.products_no_mba_shirt` WHERE asin in {}""".format(SQL_IS_IN))
    results = query_job.result()
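# The DELETE above builds its IN list via string formatting, which produces invalid SQL when
# asin_list_remove_from_blacklist is empty and is generally fragile. A hedged alternative sketch
# using BigQuery query parameters (same google-cloud-bigquery client as above) could look like the
# following; the method name and the parameter name "asins" are illustrative, not original code.
def remove_asins_from_blacklist(self, client, asin_list):
    if not asin_list:
        return  # nothing to delete; an empty IN () would be invalid SQL
    job_config = bigquery.QueryJobConfig(
        query_parameters=[bigquery.ArrayQueryParameter("asins", "STRING", asin_list)]
    )
    query_job = client.query(
        "DELETE FROM `mba-pipeline.mba_de.products_no_mba_shirt` WHERE asin IN UNNEST(@asins)",
        job_config=job_config,
    )
    query_job.result()  # wait for the DML statement to finish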
def parse(self, response):
    proxy = self.get_proxy(response)
    url = response.url
    page = response.meta["page"]
    image_urls = []
    asins = []
    url_mba_lowqs = []
    #self.get_zip_code_location(response)
    #self.get_count_results(response)
    if self.is_captcha_required(response):
        #self.response_is_ban(request, response, is_ban=True)
        print("Captcha required for proxy: " + proxy)
        self.captcha_count = self.captcha_count + 1
        self.update_ban_count(proxy)
        headers = get_random_headers(self.marketplace)
        # send new request with high priority
        request = scrapy.Request(url=url, callback=self.parse, headers=headers, priority=0,
                                 dont_filter=True, errback=self.errback_httpbin,
                                 meta={"max_proxies_to_try": 30, "page": page})
        yield request
    else:
        if self.should_zip_code_be_changed(response):
            print("Proxy does not get all .com results: " + proxy)
            self.update_ban_count(proxy)
            headers = get_random_headers(self.marketplace)
            # send new request with high priority
            request = scrapy.Request(url=url, callback=self.parse, headers=headers, priority=0,
                                     dont_filter=True, errback=self.errback_httpbin,
                                     meta={"max_proxies_to_try": 30, "page": page})
            yield request
            # change zip code
            # meta_dict = {"max_proxies_to_try": 30, 'page': page, "url": url, "headers": response.meta["headers"]}
            # url_change_zip_code = "https://www.amazon.com/gp/delivery/ajax/address-change.html"
            # if self.is_perfect_privacy_proxy(response):
            #     proxy = "http://*****:*****@" + response.meta["download_slot"] + ":3128"
            #     meta_dict.update({"proxy": proxy, "_rotating_proxy": False})
            # yield scrapy.http.JsonRequest(url=url_change_zip_code, callback=self.change_zip_code, headers=response.meta["headers"], priority=0, data=self.change_zip_code_post_data,
            #                               errback=self.errback_httpbin, meta=meta_dict, dont_filter=True)
        else:
            self.ip_addresses.append(response.ip_address.compressed)
            shirts = response.css('div.sg-col-inner')
            shirt_number_page = 0
            for i, shirt in enumerate(shirts):
                if not self.is_shirt(shirt):
                    continue
                shirt_number_page = shirt_number_page + 1
                try:
                    price = self.get_price(shirt)
                except Exception as e:
                    self.save_content(response, url)
                    send_msg(self.target, str(e) + " | url: " + url, self.api_key)
                    raise e
                try:
                    title = self.get_title(shirt)
                except Exception as e:
                    self.save_content(response, url)
                    send_msg(self.target, str(e) + " | url: " + url, self.api_key)
                    raise e
                try:
                    brand = self.get_brand(shirt)
                except Exception as e:
                    # it is possible that amazon does not show the brand on the overview page; therefore a raise is not necessary
                    print("Could not get brand of shirt: ", title)
                    brand = None
                    #self.save_content(response, url)
                    #send_msg(self.target, str(e) + " | url: " + url, self.api_key)
                    #raise e
                try:
                    url_product = self.get_url_product(shirt, url)
                except Exception as e:
                    self.save_content(response, url)
                    send_msg(self.target, str(e) + " | url: " + url, self.api_key)
                    raise e
                try:
                    url_image_lowq, url_image_q2, url_image_q3, url_image_q4, url_image_hq = self.get_img_urls(shirt)
                except Exception as e:
                    self.save_content(response, url)
                    send_msg(self.target, str(e) + " | url: " + url, self.api_key)
                    raise e
                try:
                    asin = self.get_asin(shirt)
                except Exception as e:
                    self.save_content(response, url)
                    send_msg(self.target, str(e) + " | url: " + url, self.api_key)
                    raise e
                try:
                    uuid = self.get_uuid(shirt)
                except Exception as e:
                    self.save_content(response, url)
                    send_msg(self.target, str(e) + " | url: " + url, self.api_key)
                    raise e

                crawlingdate = datetime.datetime.now()
                # append to general crawler
                df_products = pd.DataFrame(data={"title": [title], "brand": [brand], "url_product": [url_product], "url_image_lowq": [url_image_lowq], "url_image_hq": [url_image_hq], "price": [price], "asin": [asin], "uuid": [uuid], "timestamp": [crawlingdate]})
                df_mba_images = pd.DataFrame(data={"asin": [asin], "url_image_lowq": [url_image_lowq], "url_image_q2": [url_image_q2], "url_image_q3": [url_image_q3], "url_image_q4": [url_image_q4], "url_image_hq": [url_image_hq], "timestamp": [crawlingdate]})
                shirt_number = int(shirt_number_page + ((int(page) - 1) * self.shirts_per_page))
                df_mba_relevance = pd.DataFrame(data={"asin": [asin], "sort": [self.sort], "number": [shirt_number], "timestamp": [crawlingdate]})
                self.df_products = self.df_products.append(df_products)
                self.df_mba_images = self.df_mba_images.append(df_mba_images)
                self.df_mba_relevance = self.df_mba_relevance.append(df_mba_relevance)

                # crawl image only if not already crawled
                if asin not in self.products_images_already_downloaded:
                    image_urls.append(url_image_hq)
                    asins.append(asin)
                    url_mba_lowqs.append(url_image_lowq)

            # crawl images
            image_item = MbaCrawlerItem()
            image_item["image_urls"] = image_urls
            image_item["asins"] = asins
            image_item["url_mba_lowqs"] = url_mba_lowqs
            image_item["marketplace"] = self.marketplace
            if self.marketplace in ["com", "de"]:
                yield image_item

            self.page_count = self.page_count + 1
            self.status_update()
            #url_next = "/".join(url.split("/")[0:3]) + response.css("ul.a-pagination li.a-last a::attr(href)").get()
def closed(self, reason):
    try:
        self.reset_ban.cancel()
    except Exception as e:
        send_msg(self.target, "Could not cancel ban reset function", self.api_key)
        print("Could not cancel ban reset function", str(e))
    try:
        ip_dict = {i: self.ip_addresses.count(i) for i in self.ip_addresses}
        ip_addr_str = ""
        for ip, count in ip_dict.items():
            ip_addr_str = "{}{}: {}\n".format(ip_addr_str, ip, count)
        proxy_str = ""
        for proxy, data in self.was_banned.items():
            proxy_str = "{}{}: {}\n".format(proxy_str, proxy, data[0])
        #send_msg(self.target, "Used ip addresses: \n{}".format(ip_addr_str), self.api_key)
        #send_msg(self.target, "Ban count proxies: \n{}".format(proxy_str), self.api_key)
        #send_msg(self.target, "Captcha response count: {}".format(self.captcha_count), self.api_key)
        print("Used ip addresses: \n{}".format(ip_addr_str))
        print("Ban count proxies: \n{}".format(proxy_str))
    except Exception:
        pass

    send_msg(self.target, "Finished scraper {} daily {} with {} products and reason: {}".format(
        self.name, self.daily, len(self.df_products_details_daily), reason), self.api_key)
    print("Finished scraper {} daily {} with {} products and reason: {}".format(
        self.name, self.daily, len(self.df_products_details_daily), reason))

    if not self.daily:
        # change types to fit with BigQuery datatypes
        self.df_products_details['color_count'] = self.df_products_details['color_count'].astype('int')
        self.df_products_details['timestamp'] = self.df_products_details['timestamp'].astype('datetime64[ns]')
        self.df_products_details['upload_date'] = self.df_products_details['upload_date'].astype('datetime64[ns]')
        self.df_products_details['customer_review_count'] = self.df_products_details['customer_review_count'].astype('int')

    # change types of daily dataframe
    self.df_products_details_daily['timestamp'] = self.df_products_details_daily['timestamp'].astype('datetime64[ns]')
    self.df_products_details_daily['bsr'] = self.df_products_details_daily['bsr'].astype('int')
    self.df_products_details_daily['customer_review_count'] = self.df_products_details_daily['customer_review_count'].astype('int')

    # update data in BigQuery if batch is finished
    if not self.daily:
        try:
            self.df_products_details.to_gbq("mba_" + self.marketplace + ".products_details", project_id="mba-pipeline", if_exists="append")
        except Exception:
            time.sleep(10)
            try:
                self.df_products_details.to_gbq("mba_" + self.marketplace + ".products_details", project_id="mba-pipeline", if_exists="append")
            except Exception:
                self.store_df()

    try:
        self.df_products_details_daily.to_gbq("mba_" + self.marketplace + ".products_details_daily", project_id="mba-pipeline", if_exists="append")
    except Exception:
        time.sleep(10)
        try:
            self.df_products_details_daily.to_gbq("mba_" + self.marketplace + ".products_details_daily", project_id="mba-pipeline", if_exists="append")
        except Exception:
            self.store_df()

    if not self.df_products_no_mba_shirt.empty:
        self.df_products_no_mba_shirt.to_gbq("mba_" + self.marketplace + ".products_no_mba_shirt", project_id="mba-pipeline", if_exists="append")

    print(self.df_products_no_bsr)
    if not self.df_products_no_bsr.empty:
        self.df_products_no_bsr.to_gbq("mba_" + self.marketplace + ".products_no_bsr", project_id="mba-pipeline", if_exists="append")
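# store_df() is used above as the fallback when both BigQuery upload attempts fail; its body is not
# part of this excerpt. A minimal sketch follows, under the assumption that it simply dumps the
# collected dataframes to timestamped local CSV files so the batch can be re-uploaded later. The
# directory "data/failed_uploads" and the exact set of dataframes are illustrative and would differ
# per spider; this is not the original implementation.
import os

def store_df(self):
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    os.makedirs("data/failed_uploads", exist_ok=True)
    for name, df in [("products_details", self.df_products_details),
                     ("products_details_daily", self.df_products_details_daily)]:
        # one CSV per dataframe, named after its target table and the failure time
        df.to_csv("data/failed_uploads/{}_{}.csv".format(name, timestamp), index=False)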
def parse(self, response):
    asin = response.meta["asin"]
    proxy = self.get_proxy(response)
    url = response.url
    #send_msg(self.target, "Response caught: {} with proxy {}".format(url, proxy), self.api_key)
    if self.is_captcha_required(response):
        #self.response_is_ban(request, response, is_ban=True)
        print("Captcha required for proxy: " + proxy)
        self.captcha_count = self.captcha_count + 1
        self.update_ban_count(proxy)
        #send_msg(self.target, "Captcha: " + url, self.api_key)
        headers = get_random_headers(self.marketplace)
        # send new request with high priority
        request = scrapy.Request(url=url, callback=self.parse, headers=headers, priority=0,
                                 dont_filter=True, errback=self.errback_httpbin, meta={"asin": asin})
        yield request
        '''
        raise Exception("Captcha required")
        send_msg(self.target, "Captcha required" + " | asin: " + asin, self.api_key)
        self.captcha_count = self.captcha_count + 1
        # add download delay if a captcha happens
        self.settings.attributes["DOWNLOAD_DELAY"].value = self.settings.attributes["DOWNLOAD_DELAY"].value + 3
        if self.captcha_count > self.settings.attributes["MAX_CAPTCHA_NUMBER"].value:
            raise CloseSpider(reason='Too many captchas received')
        raise Exception("Captcha required")
        '''
    # do not proceed if it is not an MBA shirt
    elif not self.is_mba_shirt(response):
        self.df_products_no_mba_shirt = self.df_products_no_mba_shirt.append(
            pd.DataFrame(data={"asin": [asin], "url": [url], "timestamp": [datetime.datetime.now()]}))
    else:
        self.ip_addresses.append(response.ip_address.compressed)
        try:
            price_str, price = self.get_price(response)
        except Exception as e:
            #self.save_content(response, asin)
            #send_msg(self.target, str(e) + " | asin: " + asin, self.api_key)
            price_str, price = "", 0.0
        try:
            mba_bsr_str, mba_bsr, array_mba_bsr, array_mba_bsr_categorie = self.get_bsr(response)
        except Exception as e:
            self.save_content(response, asin)
            #send_msg(self.target, str(e) + " | asin: " + asin, self.api_key)
            if "no bsr" in str(e):
                self.df_products_no_bsr = self.df_products_no_bsr.append(
                    pd.DataFrame(data={"asin": [asin], "url": [url], "timestamp": [datetime.datetime.now()]}))
            if self.daily:
                raise e
            else:
                # Cases exist like https://www.amazon.com/dp/B0855BCBZ6 which should have a BSR but do not contain it in the html.
                # Therefore, we want to crawl it just once (if not a daily crawl).
                mba_bsr_str, mba_bsr, array_mba_bsr, array_mba_bsr_categorie = "", 0, [], []
        try:
            customer_review_score_mean, customer_review_score, customer_review_count = self.get_customer_review(response)
        except Exception as e:
            self.save_content(response, asin)
            send_msg(self.target, str(e) + " | asin: " + asin, self.api_key)
            raise e

        # if not the daily crawler, more data of the website needs to be crawled
        if not self.daily:
            try:
                title = self.get_title(response)
            except Exception as e:
                self.save_content(response, asin)
                send_msg(self.target, str(e) + " | asin: " + asin, self.api_key)
                raise e
            try:
                brand, url_brand = self.get_brand_infos(response)
            except Exception as e:
                self.save_content(response, asin)
                send_msg(self.target, str(e) + " | asin: " + asin, self.api_key)
                raise e
            try:
                fit_types = self.get_fit_types(response)
            except Exception as e:
                self.save_content(response, asin)
                send_msg(self.target, str(e) + " | asin: " + asin, self.api_key)
                raise e
            try:
                array_color_names, color_count = self.get_color_infos(response)
            except Exception as e:
                self.save_content(response, asin)
                send_msg(self.target, str(e) + " | asin: " + asin, self.api_key)
                raise e
            try:
                array_product_feature = self.get_product_features(response)
            except Exception as e:
                self.save_content(response, asin)
                send_msg(self.target, str(e) + " | asin: " + asin, self.api_key)
                raise e
            try:
                description = self.get_description(response)
            except Exception as e:
                #self.save_content(response, asin)
                #send_msg(self.target, str(e) + "| asin: " + asin, self.api_key)
                #raise e
                description = ""
            try:
                weight = self.get_weight(response)
            except Exception as e:
                weight = "not found"
                self.save_content(response, asin)
                send_msg(self.target, str(e) + " | asin: " + asin, self.api_key)
            try:
                upload_date_str, upload_date = self.get_upload_date(response)
            except Exception as e:
                self.save_content(response, asin)
                send_msg(self.target, str(e) + " | asin: " + asin, self.api_key)
                raise e

        crawlingdate = datetime.datetime.now()
        if not self.daily:
            # append to general crawler
            df = pd.DataFrame(
                data={
                    "asin": [asin],
                    "title": [title],
                    "brand": [brand],
                    "url_brand": [url_brand],
                    "price": [price_str],
                    "fit_types": [fit_types],
                    "color_names": [array_color_names],
                    "color_count": [color_count],
                    "product_features": [array_product_feature],
                    "description": [description],
                    "weight": [weight],
                    "upload_date_str": [upload_date_str],
                    "upload_date": [upload_date],
                    "customer_review_score": [customer_review_score],
                    "customer_review_count": [customer_review_count],
                    "mba_bsr_str": [mba_bsr_str],
                    "mba_bsr": [array_mba_bsr],
                    "mba_bsr_categorie": [array_mba_bsr_categorie],
                    "timestamp": [crawlingdate]
                })
            self.df_products_details = self.df_products_details.append(df)

        # append to daily crawler
        df = pd.DataFrame(
            data={
                "asin": [asin],
                "price": [price],
                "price_str": [price_str],
                "bsr": [mba_bsr],
                "bsr_str": [mba_bsr_str],
                "array_bsr": [array_mba_bsr],
                "array_bsr_categorie": [array_mba_bsr_categorie],
                "customer_review_score_mean": [customer_review_score_mean],
                "customer_review_score": [customer_review_score],
                "customer_review_count": [customer_review_count],
                "timestamp": [crawlingdate]
            })
        self.df_products_details_daily = self.df_products_details_daily.append(df)

        self.status_update()