def db_insert_reviews(reviews):
    """Insert each scraped review document into the ZomatoReviews collection.

    :param reviews: iterable of review dicts; each is expected to carry
        "review_id" and "eatery_id" keys (used for logging).
    :returns: None.  A failure on one review is logged via print_messege
        and never aborts the remaining inserts.
    """
    for review in reviews:
        try:
            ZomatoReviews.insert(review)
            messege = "Review with review_id: %s has been updated successfully" % (
                review["review_id"])
            print_messege("success", messege, inspect.stack()[0][3], None,
                          review["eatery_id"], None, review["review_id"], FILE)
        except Exception:
            # Capture and log the full traceback, then continue with the
            # next review (best-effort bulk insert).
            exc_type, exc_value, exc_traceback = sys.exc_info()
            error = repr(
                traceback.format_exception(exc_type, exc_value, exc_traceback))
            messege = "Review with review_id: %s failed" % (
                review["review_id"])
            print_messege("error", messege, inspect.stack()[0][3], error,
                          review["eatery_id"], None, review["review_id"], FILE)
    # Removed: dead `pass` and a trailing `eatery_id = review.get("eatery_id")`
    # assignment that read the leaked loop variable and was never used.
    return
def deco(self, review):
    """Wrapper used by a decorator: call ``func(self, review)`` and, on any
    exception, log the traceback through print_messege and return None
    instead of propagating.

    ``func`` is a closure variable supplied by the enclosing decorator.

    :param review: review payload forwarded untouched to ``func``.
    :returns: whatever ``func`` returns, or None on failure.
    """
    try:
        return func(self, review)
    except Exception:
        # The original code had a separate `except ValueError` branch that
        # was byte-identical to this one; ValueError is a subclass of
        # Exception, so a single handler preserves behavior exactly.
        exc_type, exc_value, exc_traceback = sys.exc_info()
        error = repr(
            traceback.format_exception(exc_type, exc_value, exc_traceback))
        print_messege("error", "error occurred", func.__name__, error,
                      self.eatery_id, self.eatery_url, None, module_name=FILE)
        return None
def db_insert_eateries(eatery): # db = client.modified_canworks I think needed for bulk try: ZomatoEateries.update({"eatery_id": eatery.get("eatery_id")}, {"$set": eatery}, upsert=True) messege = "Eatery with eatery_id: %s and eatery_name: %s has been updated successfully" % ( eatery["eatery_id"], eatery["eatery_name"]) try: print_messege("success", messege, inspect.stack()[0][3], None, eatery["eatery_id"], eatery["eatery_url"], None, FILE) except Exception as e: print e except Exception as e: exc_type, exc_value, exc_traceback = sys.exc_info() error = repr( traceback.format_exception(exc_type, exc_value, exc_traceback)) messege = "Eatery with eatery_id: %s and eatery_name: %s failed" % ( eatery["eatery_id"], eatery["eatery_name"]) try: print_messege("error", messege, inspect.stack()[0][3], error, eatery["eatery_id"], eatery["eatery_url"], None, FILE) except Exception as e: print e return
def db_insert_users(reviews):
    """Upsert the reviewer profile carried on each review into ZomatoUsers.

    Documents are keyed on ("user_id", "user_name"); the user's url,
    follower count and review count are refreshed and "updated_on" is
    stamped with the current epoch time.

    :param reviews: iterable of review dicts carrying user_* keys plus
        "eatery_id" / "review_id" (used for logging).
    :returns: None.  Per-review failures are logged and skipped.
    """
    for review in reviews:
        try:
            # The update result was previously bound to an unused local
            # (`result`); dropped.
            ZomatoUsers.update(
                {"user_id": review.get("user_id"),
                 "user_name": review.get("user_name")},
                {"$set": {"user_url": review.get("user_url"),
                          "user_followers": review.get("user_followers"),
                          "user_reviews": review.get("user_reviews"),
                          "updated_on": int(time.time())}},
                upsert=True)
            messege = "User with user_id: %s and user_name: %s has been updated successfully" % (
                review["user_id"], review["user_name"])
            print_messege("success", messege, inspect.stack()[0][3], None,
                          review["eatery_id"], None, review["review_id"], FILE)
        except Exception:
            exc_type, exc_value, exc_traceback = sys.exc_info()
            error = repr(
                traceback.format_exception(exc_type, exc_value, exc_traceback))
            messege = "User with user_id: %s failed" % (
                review["user_id"])
            print_messege("error", messege, inspect.stack()[0][3], error,
                          review["eatery_id"], None, review["review_id"], FILE)
    return
def run(self, eatery_dict, flush_eatery=False):
    """Scrape one eatery end to end: collect eatery + review data, persist
    it, sanity-check the stored review count against the advertised total,
    then kick off photo lookups.

    :param eatery_dict: dict with at least "eatery_url"; replaced by the
        enriched dict returned from EateryData.run().
    :param flush_eatery: when True, drop previously stored data for this
        url before scraping.
    """
    self.eatery_dict = eatery_dict
    self.start = time.time()
    if flush_eatery:
        DBInsert.flush_eatery(self.eatery_dict["eatery_url"])
    logger.info("{fg} {bg}Starting eatery_url --<{url}>-- of task --<{task_name}>-- with time taken --<{time}>-- seconds {reset}".format(
        fg=fg('white'), bg=bg('green'),
        url=eatery_dict["eatery_url"], task_name=self.__class__.__name__,
        time=time.time() - self.start, reset=attr('reset')))
    # EateryData does the actual page scraping; it returns the enriched
    # eatery dict plus the list of review dicts.
    __instance = EateryData(eatery_dict)
    eatery_dict, reviewslist = __instance.run()
    DBInsert.db_insert_eateries(eatery_dict)
    DBInsert.db_insert_reviews(reviewslist)
    DBInsert.db_insert_users(reviewslist)
    reviews_in_db = DBInsert.db_get_reviews_eatery(eatery_dict["eatery_id"])
    if reviews_in_db != int(eatery_dict['eatery_total_reviews']):
        messege = "Umatched reviews: present in DB %s and should be %s" % (
            reviews_in_db, int(eatery_dict['eatery_total_reviews']))
        print_messege("error", messege, "ScrapeEachEatery.run", None,
                      eatery_dict["eatery_id"], eatery_dict["eatery_url"], None,
                      module_name=FILE)
        # NOTE(review): `a - 10 >= x >= a + 10` can never be true, so this
        # `not (...)` is always True and the redis branch always fires on a
        # mismatch.  The intent was presumably a +/-10 tolerance window,
        # i.e. `not (reviews_in_db - 10 <= total <= reviews_in_db + 10)`
        # -- confirm before changing.
        if not reviews_in_db - 10 >= int(eatery_dict['eatery_total_reviews']) >= reviews_in_db + 10:
            # Record the discrepancy in redis, keyed by the eatery url.
            r.hset(eatery_dict["eatery_url"], "unmatched_reviews", messege)
            r.hset(eatery_dict["eatery_url"], "total_reviews",
                   int(eatery_dict['eatery_total_reviews']))
            r.hset(eatery_dict["eatery_url"], "reviews_in_db", reviews_in_db)
            r.hset(eatery_dict["eatery_url"], "error_cause", "zomato incompetency")
            r.hset(eatery_dict["eatery_url"], "frequency",
                   reviews_in_db - int(eatery_dict['eatery_total_reviews']))
    print eatery_dict
    eatery_id, __eatery_id, eatery_name, eatery_photo_link, location = (
        eatery_dict["eatery_id"], eatery_dict["__eatery_id"],
        eatery_dict["eatery_name"], eatery_dict["eatery_photo_link"],
        eatery_dict["location"])
    print terminal.blue("Trying pics for eatery_id=<<%s>>, __eatery_id=<<%s>>, eatery_photo_link=<<%s>>" % (eatery_id, __eatery_id, eatery_photo_link))
    try:
        # Photo pipeline; failures are recorded in redis rather than raised.
        instance = GoogleNPics(eatery_id, __eatery_id, eatery_photo_link)
        instance.run()
    except Exception as e:
        print terminal.red("error occurred saving in redis %s for eatery_id %s" % (str(e), eatery_dict["eatery_id"]))
        r_pics.hset(eatery_id, "error", str(e))
    google = find_google_places(eatery_id, __eatery_id, eatery_name, location)
    print google
    return
def get_reviews(self):
    """Drive a Selenium browser over the eatery's review page, repeatedly
    clicking "load more" / "read more" until (approximately) all reviews
    are expanded, and return the final page as a BeautifulSoup tree.

    Requires self.eatery ("eatery_url", "eatery_id",
    "eatery_total_reviews"), self.reviews_inDB and
    self.previous_total_reviews to be set (see run()).

    :returns: BeautifulSoup of the fully expanded page source, or None if
        the advertised total-review count is unusable (TypeError path).
    :raises StandardError: when the url 404s.
    """
    # Pick the driver according to module-level config / DRIVER_NAME,
    # optionally routed through a proxy.
    if config.getboolean("proxy", "use_proxy"):
        if DRIVER_NAME == "PhantomJS":
            service_args = [config.get("proxy", "service_args")]
            driver = webdriver.PhantomJS(service_args=service_args)
            driver.get(self.eatery["eatery_url"])
        else:
            chrome_options = webdriver.ChromeOptions()
            chrome_options.add_argument('--proxy-server=%s' % PROXY_ADDRESS)
            driver = webdriver.Chrome(driver_exec_path, chrome_options=chrome_options)
            driver.get(self.eatery["eatery_url"])
    else:
        if DRIVER_NAME == "PhantomJS":
            driver = webdriver.PhantomJS()
            driver.get(self.eatery["eatery_url"])
        else:
            driver = webdriver.Chrome(driver_exec_path)
            driver.get(self.eatery["eatery_url"])
    if driver.title.startswith("404"):
        raise StandardError("This url doesnt exists, returns 404 error")
    # Long fixed waits throughout: crude rate-limiting / page-settle delays.
    time.sleep(50)
    try:
        # Click the "everyone" tab several times; repeated clicks appear to
        # be a workaround for the tab not always registering the first one.
        driver.find_element_by_css_selector("a.everyone.empty").click()
        time.sleep(20)
        driver.find_element_by_css_selector("a.everyone.empty").click()
        driver.find_element_by_css_selector("a.everyone.empty").click()
        driver.find_element_by_css_selector("a.everyone.empty").click()
        driver.find_element_by_css_selector("a.everyone.empty").click()
        driver.find_element_by_css_selector("a.everyone.empty").click()
        print "{start_color} Found love in clinking all_review button :) {end_color}".format(\
            start_color=bcolors.OKGREEN, end_color=bcolors.RESET)
    except NoSuchElementException:
        print "{start_color} ERROR: Couldnt not clicked on all review button {end_color}".format(\
            start_color=bcolors.FAIL, end_color=bcolors.RESET)
        pass
    time.sleep(10)
    try:
        # How many reviews we still need = advertised total minus what is
        # already stored in the DB.
        reviews_to_be_scraped = int(
            self.eatery["eatery_total_reviews"]) - int(self.reviews_inDB)
        #reviews_to_be_scraped = int(self.eatery["eatery_total_reviews"]) - int(self.previous_total_reviews)
        print "{start_color} No. of reviews to be scraped {number}{end_color}".format(\
            start_color=bcolors.OKGREEN, number=reviews_to_be_scraped, end_color=bcolors.RESET)
        print "{start_color} No. of reviews present in the DB {number}{end_color}".format(\
            start_color=bcolors.OKGREEN, number=int(self.reviews_inDB), end_color=bcolors.RESET)
        print "{start_color} No. of reviews that were eariler on the page {number}{end_color}".format(\
            start_color=bcolors.OKGREEN, number=int(self.previous_total_reviews), end_color=bcolors.RESET)
    except TypeError as e:
        # eatery_total_reviews was None/non-numeric: give up on this eatery.
        print_messege("error", "total reviews key error", "EateryData.get_reviews", e, self.eatery["eatery_id"],\
            self.eatery["eatery_url"], None)
        return

    def retry_if_standard_error(exception):
        """Return True if we should retry (in this case when it's an
        StandardError), False otherwise"""
        return isinstance(exception, StandardError)

    # NOTE(review): retry_on_result passes run_load_more's RETURN VALUE
    # (always None) to the predicate, so this never retries; the intent was
    # presumably retry_on_exception=retry_if_standard_error -- confirm
    # against the retrying library docs before changing.
    @retry(retry_on_result=retry_if_standard_error, wait_fixed=10000, stop_max_attempt_number=5)
    def run_load_more():
        # Closure over `driver` and over the loop variable `i` defined in
        # the calling loops below (read at call time, not definition time).
        try:
            print "Click on loadmore <<{value}>> time".format(
                start_color=bcolors.OKBLUE, value=i, end_color=bcolors.RESET)
            ##time.sleep(random.choice([2, 3]))
            driver.find_element_by_class_name("load-more").click()
            time.sleep(1)
        except NoSuchElementException as e:
            # No more "load-more" button: treated as normal termination.
            print "{color} ERROR: Catching Exception -<{error}>- with messege -<No More Loadmore tag present>- {reset}".format(
                color=bcolors.OKGREEN, error=e, reset=bcolors.RESET)
            pass
        except urllib2.URLError:
            driver.quit()
            #raise StandardError("Could not make the request")
            pass
        except WebDriverException:
            driver.quit()
            #raise StandardError("Could not make the request")
            pass
        except Exception as e:
            print e
            pass

    # Each click loads ~5 reviews; the +500/+60 padding over-clicks to
    # compensate for lost clicks (Python 2 integer division).
    if ZomatoReviewsCollection.find({
            "eatery_id": self.eatery["eatery_id"]
    }).count() == 0:
        for i in range(0, reviews_to_be_scraped / 5 + 500):
            run_load_more()
    else:
        for i in range(0, reviews_to_be_scraped / 5 + 60):
            run_load_more()
    # Expand every truncated review body.
    read_more_links = driver.find_elements_by_xpath(
        "//div[@class='rev-text-expand']")
    read_more_count = range(0, len(read_more_links))[::-1]
    time.sleep(10)
    for link, __count in zip(read_more_links, read_more_count):
        print "Click on read_more <<{value}>> <<{count}>>time".format(
            start_color=bcolors.OKBLUE, value=link, count=__count, end_color=bcolors.RESET)
        time.sleep(random.choice([1, 2]))
        link.click()
    html = driver.page_source
    # Strip non-ascii before parsing (lossy by design).
    content = html.encode('ascii', 'ignore').decode('ascii')
    driver.quit()
    return BeautifulSoup.BeautifulSoup(content)
def run(self):
    """Scrape all metadata fields for self.eatery, then its reviews.

    Populates self.eatery in place (one process_result(...) call per
    field), computes a stable "__eatery_id" hash, and finally parses the
    review page.

    :returns: tuple (enriched eatery dict, list of review dicts).
    """
    print "Found eatery_id==<<{eatery_id}>> and eatery_name==<<{eatery_name}>> time".format(
        start_color=bcolors.OKGREEN, eatery_id=self.eatery["eatery_id"],
        eatery_name=self.eatery["eatery_name"].encode("ascii", "ignore"),
        end_color=bcolors.RESET)
    # Deterministic secondary id: sha256 over eatery_id + eatery_url.
    __hash = hashlib.sha256(self.eatery["eatery_id"] + self.eatery["eatery_url"]).hexdigest()
    self.eatery.update({"__eatery_id": __hash})
    self.reviews_inDB = review_collection.find({
        "eatery_id": self.eatery["eatery_id"]
    }).count()
    try:
        self.previous_total_reviews = eatery_collection.find_one({
            "eatery_id": self.eatery["eatery_id"]
        }).get("eatery_total_reviews")
    except Exception as e:
        # find_one returned None: eatery not seen before.
        print "Eatery easnt present earlier so setting previous total reviews to 0"
        print e
        self.previous_total_reviews = 0
    messege = "Number of reviews present in the database %s" % self.reviews_inDB
    print_messege("info", messege, "EateryData.run", None, self.eatery["eatery_id"],\
        self.eatery["eatery_url"], None, FILE)
    # Each call wraps the named scraper method with error handling/logging
    # and stores its result under the given key in self.eatery.
    process_result(self.eatery, "eatery_cost", FILE)(self.retry_eatery_cost)()
    process_result(self.eatery, "eatery_trending", FILE)(self.retry_eatery_trending)()
    process_result(self.eatery, "eatery_rating", FILE)(self.retry_eatery_rating)()
    process_result(self.eatery, "eatery_cuisine", FILE)(self.retry_eatery_cuisine)()
    process_result(self.eatery, "eatery_highlights", FILE)(self.eatery_highlights)()
    process_result(self.eatery, "eatery_popular_reviews", FILE)(self.eatery_popular_reviews)()
    process_result(self.eatery, "location", FILE)(self.eatery_longitude_latitude)()
    process_result(self.eatery, "eatery_total_reviews", FILE)(self.eatery_total_reviews)()
    process_result(self.eatery, "eatery_buffet_price", FILE)(self.eatery_buffet_price)()
    process_result(self.eatery, "eatery_buffet_details", FILE)(self.eatery_buffet_details)()
    process_result(self.eatery, "eatery_recommended_order", FILE)(self.eatery_recommended_order)()
    process_result(self.eatery, "eatery_known_for", FILE)(self.eatery_known_for)()
    process_result(self.eatery, "eatery_area_or_city", FILE)(self.eatery_area_or_city)()
    process_result(self.eatery, "eatery_opening_hours", FILE)(self.eatery_opening_hours)()
    process_result(self.eatery, "eatery_photo_link", FILE)(self.eatery_photo_link)()
    process_result(self.eatery, "eatery_update_on", FILE)(self.eatery_update_on)()
    # NOTE(review): `assert` is stripped under `python -O`; an explicit
    # raise would be safer for this required-field check.
    assert (self.eatery["location"] != None)
    review_soup = self.get_reviews()
    #self.last_no_of_reviews_to_be_scrapped = int(self.no_of_reviews_to_be_scrapped) - int(no_of_blogs)
    ins = ZomatoReviews(review_soup, self.eatery["eatery_area_or_city"],
                        self.eatery["eatery_id"], self.eatery["eatery_url"])
    return (self.eatery, ins.reviews_data)