Beispiel #1
0
    def db_insert_reviews(reviews):
        """Insert each review into ``ZomatoReviews`` and log the outcome.

        Reviews are inserted one at a time. A failure on one review is
        reported through ``print_messege`` and does not stop the remaining
        inserts.

        Args:
            reviews: iterable of review mappings. Each mapping is passed
                unchanged to ``ZomatoReviews.insert``; its ``review_id`` and
                ``eatery_id`` entries are only used for the log record.

        Returns:
            None. An empty ``reviews`` is a no-op.
        """
        # inspect.stack() is costly and the answer is the same for every
        # review, so resolve the name of this function once.
        function_name = inspect.stack()[0][3]

        for review in reviews:
            # .get() keeps the logging paths from raising KeyError (which
            # would hide the real insert error) when an id is missing.
            review_id = review.get("review_id")
            eatery_id = review.get("eatery_id")
            try:
                ZomatoReviews.insert(review)
                messege = "Review  with review_id: %s  has been updated successfully" % (
                    review_id)
                print_messege("success", messege, function_name, None,
                              eatery_id, None, review_id, FILE)
            except Exception:
                # Best effort: record the full traceback and move on to the
                # next review.
                error = repr(traceback.format_exception(*sys.exc_info()))
                messege = "Review  with review_id: %s  failed" % (review_id)
                print_messege("error", messege, function_name, error,
                              eatery_id, None, review_id, FILE)
        def deco(self, review):
            """Call the wrapped ``func`` and turn any exception into a logged ``None``.

            ``func`` is a free variable supplied by the enclosing scope, which
            is not part of this excerpt (presumably a decorator - confirm).

            Args:
                self: instance forwarded to ``func``; its ``eatery_id`` and
                    ``eatery_url`` attributes are read for the error record.
                review: forwarded unchanged to ``func``.

            Returns:
                Whatever ``func(self, review)`` returns, or ``None`` when it
                raises an ``Exception``.
            """
            try:
                return func(self, review)
            except Exception:
                # ValueError used to have its own, byte-for-byte identical
                # handler; it is an Exception subclass, so one branch covers
                # both cases with the same behaviour.
                error = repr(traceback.format_exception(*sys.exc_info()))
                print_messege("error",
                              "error occurred",
                              func.__name__,
                              error,
                              self.eatery_id,
                              self.eatery_url,
                              None,
                              module_name=FILE)
                return None
Beispiel #3
0
    def db_insert_eateries(eatery):
        """Upsert one eatery document into ``ZomatoEateries`` and log the outcome.

        The document is matched on ``eatery_id`` and written with ``$set`` and
        ``upsert=True``. Both the database call and the logging call are best
        effort: a database error is logged, and a logging error is printed.

        Args:
            eatery: mapping describing the eatery. ``eatery_id``,
                ``eatery_name`` and ``eatery_url`` are read for the update
                filter and the log record; missing entries are logged as
                ``None`` instead of raising.

        Returns:
            None.
        """
        # Read the identifiers once with .get() so that a missing key can no
        # longer raise KeyError from inside the error handler and mask the
        # original database failure.
        eatery_id = eatery.get("eatery_id")
        eatery_name = eatery.get("eatery_name")
        eatery_url = eatery.get("eatery_url")
        function_name = inspect.stack()[0][3]

        try:
            ZomatoEateries.update({"eatery_id": eatery_id},
                                  {"$set": eatery},
                                  upsert=True)
            status = "success"
            error = None
            messege = "Eatery with eatery_id: %s  and eatery_name: %s has been updated successfully" % (
                eatery_id, eatery_name)
        except Exception:
            status = "error"
            error = repr(traceback.format_exception(*sys.exc_info()))
            messege = "Eatery with eatery_id: %s  and eatery_name: %s failed" % (
                eatery_id, eatery_name)

        # A single logging call serves both outcomes; a logging failure must
        # never propagate to the caller.
        try:
            print_messege(status, messege, function_name, error, eatery_id,
                          eatery_url, None, FILE)
        except Exception as e:
            print(e)
Beispiel #4
0
    def db_insert_users(reviews):
        """Upsert the author of every review into ``ZomatoUsers`` and log the outcome.

        Users are matched on ``user_id`` plus ``user_name``; the profile
        fields and an ``updated_on`` Unix timestamp are written with ``$set``
        and ``upsert=True``. A failure on one review is logged through
        ``print_messege`` and does not stop the remaining upserts.

        Args:
            reviews: iterable of review mappings. ``user_id``, ``user_name``,
                ``user_url``, ``user_followers``, ``user_reviews``,
                ``eatery_id`` and ``review_id`` are read with ``.get()``.

        Returns:
            None. An empty ``reviews`` is a no-op.
        """
        # Resolve the name of this function once instead of walking the stack
        # for every review.
        function_name = inspect.stack()[0][3]

        for review in reviews:
            # .get() keeps the logging paths from raising KeyError (which
            # would hide the real database error) when a key is missing.
            user_id = review.get("user_id")
            user_name = review.get("user_name")
            eatery_id = review.get("eatery_id")
            review_id = review.get("review_id")
            try:
                ZomatoUsers.update(
                    {"user_id": user_id, "user_name": user_name},
                    {"$set": {
                        "user_url": review.get("user_url"),
                        "user_followers": review.get("user_followers"),
                        "user_reviews": review.get("user_reviews"),
                        "updated_on": int(time.time()),
                    }},
                    upsert=True)
                messege = "User with user_id: %s  and user_name: %s has been updated successfully" % (
                    user_id, user_name)
                print_messege("success", messege, function_name, None,
                              eatery_id, None, review_id, FILE)
            except Exception:
                # Best effort: record the full traceback and continue with
                # the next review.
                error = repr(traceback.format_exception(*sys.exc_info()))
                messege = "User  with user_id: %s  failed" % (user_id)
                print_messege("error", messege, function_name, error,
                              eatery_id, None, review_id, FILE)
Beispiel #5
0
        def run(self, eatery_dict, flush_eatery=False):
            """Scrape one eatery, store the results and flag review-count mismatches.

            Steps, in order:
              1. optionally flush the eatery via ``DBInsert.flush_eatery``;
              2. run ``EateryData(eatery_dict).run()`` to obtain the updated
                 eatery mapping and its list of reviews;
              3. insert eatery, reviews and users through ``DBInsert``;
              4. compare the number of reviews reported by
                 ``DBInsert.db_get_reviews_eatery`` with
                 ``eatery_total_reviews``; any difference is logged, and a
                 difference of more than 10 is also recorded in the ``r``
                 hash keyed by the eatery url;
              5. run ``GoogleNPics`` (errors are stored in ``r_pics``) and
                 ``find_google_places`` for the eatery.

            Args:
                eatery_dict: mapping describing the eatery; must contain
                    ``eatery_url``.
                flush_eatery: when true, ``DBInsert.flush_eatery`` is called
                    with the eatery url before scraping.

            Returns:
                None.
            """
            self.eatery_dict = eatery_dict
            self.start = time.time()

            if flush_eatery:
                DBInsert.flush_eatery(self.eatery_dict["eatery_url"])

            # The run of spaces in the middle of the message reproduces what
            # the original backslash-continued literal emitted, so the log
            # line stays byte-identical.
            logger.info(
                ("{fg} {bg}Starting eatery_url --<{url}>-- of task --<{task_name}>-- with time taken"
                 "                        --<{time}>-- seconds  {reset}").format(
                     fg=fg('white'), bg=bg('green'),
                     url=eatery_dict["eatery_url"],
                     task_name=self.__class__.__name__,
                     time=time.time() - self.start,
                     reset=attr('reset')))

            scraper = EateryData(eatery_dict)
            eatery_dict, reviewslist = scraper.run()

            DBInsert.db_insert_eateries(eatery_dict)
            DBInsert.db_insert_reviews(reviewslist)
            DBInsert.db_insert_users(reviewslist)

            reviews_in_db = DBInsert.db_get_reviews_eatery(eatery_dict["eatery_id"])
            expected_reviews = int(eatery_dict['eatery_total_reviews'])
            if reviews_in_db != expected_reviews:
                messege = "Umatched reviews: present in DB %s and should be %s" % (
                    reviews_in_db, expected_reviews)
                print_messege("error", messege, "ScrapeEachEatery.run", None,
                              eatery_dict["eatery_id"],
                              eatery_dict["eatery_url"], None,
                              module_name=FILE)
                # The previous test, `not a - 10 >= t >= a + 10`, could never
                # be false, so every mismatch was recorded. Only record those
                # outside the +/-10 tolerance the expression was aiming for.
                if abs(reviews_in_db - expected_reviews) > 10:
                    redis_key = eatery_dict["eatery_url"]
                    r.hset(redis_key, "unmatched_reviews", messege)
                    r.hset(redis_key, "total_reviews", expected_reviews)
                    r.hset(redis_key, "reviews_in_db", reviews_in_db)
                    r.hset(redis_key, "error_cause", "zomato incompetency")
                    r.hset(redis_key, "frequency",
                           reviews_in_db - expected_reviews)

            print(eatery_dict)

            # Read every required key up front so a missing one fails before
            # any picture or places work starts. Locals deliberately avoid a
            # leading double underscore, which is name-mangled inside a class.
            eatery_id = eatery_dict["eatery_id"]
            internal_eatery_id = eatery_dict["__eatery_id"]
            eatery_name = eatery_dict["eatery_name"]
            eatery_photo_link = eatery_dict["eatery_photo_link"]
            location = eatery_dict["location"]

            print(terminal.blue(
                "Trying pics for eatery_id=<<%s>>, __eatery_id=<<%s>>, eatery_photo_link=<<%s>>" % (
                    eatery_id, internal_eatery_id, eatery_photo_link)))
            try:
                pics = GoogleNPics(eatery_id, internal_eatery_id,
                                   eatery_photo_link)
                pics.run()
            except Exception as e:
                # Picture fetching is best effort; keep the failure reason in
                # redis and carry on.
                print(terminal.red(
                    "error occurred saving in redis %s for eatery_id %s" % (
                        str(e), eatery_dict["eatery_id"])))
                r_pics.hset(eatery_id, "error", str(e))

            google = find_google_places(eatery_id, internal_eatery_id,
                                        eatery_name, location)
            print(google)
            return
    def get_reviews(self):
        """Open the eatery page in a browser, expand every review and parse the HTML.

        A PhantomJS or Chrome driver is started (through the configured proxy
        when ``[proxy] use_proxy`` is true), the "all reviews" link and the
        "load more" button are clicked repeatedly, every "read more" link is
        expanded, and the resulting page source is parsed.

        Reads ``self.eatery`` (``eatery_url``, ``eatery_id``,
        ``eatery_total_reviews``), ``self.reviews_inDB`` and
        ``self.previous_total_reviews``.

        Returns:
            ``BeautifulSoup.BeautifulSoup`` built from the ASCII-only page
            source, or ``None`` when the review counters raise ``TypeError``
            on conversion to ``int``.

        Raises:
            StandardError: when the page title starts with ``"404"``.
        """
        if config.getboolean("proxy", "use_proxy"):
            if DRIVER_NAME == "PhantomJS":
                service_args = [config.get("proxy", "service_args")]
                driver = webdriver.PhantomJS(service_args=service_args)
            else:
                chrome_options = webdriver.ChromeOptions()
                chrome_options.add_argument('--proxy-server=%s' %
                                            PROXY_ADDRESS)
                driver = webdriver.Chrome(driver_exec_path,
                                          chrome_options=chrome_options)
        elif DRIVER_NAME == "PhantomJS":
            driver = webdriver.PhantomJS()
        else:
            driver = webdriver.Chrome(driver_exec_path)
        driver.get(self.eatery["eatery_url"])

        if driver.title.startswith("404"):
            # Release the browser before bailing out; it used to be leaked
            # on this path.
            driver.quit()
            raise StandardError("This url doesnt exists, returns 404 error")
        time.sleep(50)

        all_reviews_selector = "a.everyone.empty"
        try:
            driver.find_element_by_css_selector(all_reviews_selector).click()
            time.sleep(20)
            # Five further clicks, exactly as the original sequence did.
            for _ in range(5):
                driver.find_element_by_css_selector(
                    all_reviews_selector).click()
            print("{start_color} Found love in clinking all_review button :)  {end_color}".format(
                start_color=bcolors.OKGREEN, end_color=bcolors.RESET))
        except NoSuchElementException:
            print("{start_color} ERROR: Couldnt not clicked on all review button {end_color}".format(
                start_color=bcolors.FAIL, end_color=bcolors.RESET))

        time.sleep(10)

        try:
            reviews_to_be_scraped = int(
                self.eatery["eatery_total_reviews"]) - int(self.reviews_inDB)
            print("{start_color} No. of reviews to be scraped {number}{end_color}".format(
                start_color=bcolors.OKGREEN, number=reviews_to_be_scraped,
                end_color=bcolors.RESET))

            print("{start_color} No. of reviews present in the DB  {number}{end_color}".format(
                start_color=bcolors.OKGREEN, number=int(self.reviews_inDB),
                end_color=bcolors.RESET))

            print("{start_color} No. of reviews that were eariler on the page  {number}{end_color}".format(
                start_color=bcolors.OKGREEN,
                number=int(self.previous_total_reviews),
                end_color=bcolors.RESET))
        except TypeError as e:
            # Release the browser first; it used to be leaked on this early
            # return.
            driver.quit()
            print_messege("error", "total reviews key error",
                          "EateryData.get_reviews", e,
                          self.eatery["eatery_id"],
                          self.eatery["eatery_url"], None)
            return

        def retry_if_standard_error(exception):
            """Return True if we should retry (in this case when it's an StandardError), False otherwise"""
            return isinstance(exception, StandardError)

        # NOTE(review): if `retry` comes from the `retrying` package,
        # `retry_on_result` is given the function's return value (always None
        # here), so this predicate would never trigger a retry - confirm
        # whether `retry_on_exception` was intended.
        @retry(retry_on_result=retry_if_standard_error,
               wait_fixed=10000,
               stop_max_attempt_number=5)
        def run_load_more():
            # `i` is the loop counter of the enclosing for-loop below.
            try:
                print("Click on loadmore <<{value}>> time".format(value=i))
                driver.find_element_by_class_name("load-more").click()
                time.sleep(1)
            except NoSuchElementException as e:
                print("{color} ERROR: Catching Exception -<{error}>- with messege -<No More Loadmore tag present>- {reset}".format(
                    color=bcolors.OKGREEN, error=e, reset=bcolors.RESET))
            except (urllib2.URLError, WebDriverException):
                driver.quit()
            except Exception as e:
                print(e)

        stored_reviews = ZomatoReviewsCollection.find({
            "eatery_id": self.eatery["eatery_id"]
        }).count()
        # A first scrape gets far more spare clicks than an incremental one.
        spare_clicks = 500 if stored_reviews == 0 else 60
        # `//` keeps the Python 2 integer-division result the original `/`
        # produced for these integer operands.
        for i in range(0, reviews_to_be_scraped // 5 + spare_clicks):
            run_load_more()

        read_more_links = driver.find_elements_by_xpath(
            "//div[@class='rev-text-expand']")
        # Countdown printed next to each link: len - 1 down to 0.
        countdown = range(len(read_more_links) - 1, -1, -1)

        time.sleep(10)
        for link, remaining in zip(read_more_links, countdown):
            print("Click on read_more  <<{value}>>  <<{count}>>time".format(
                value=link, count=remaining))
            time.sleep(random.choice([1, 2]))
            link.click()

        html = driver.page_source
        # Drop every non-ASCII character before parsing.
        content = html.encode('ascii', 'ignore').decode('ascii')
        driver.quit()
        return BeautifulSoup.BeautifulSoup(content)
    def run(self):
        """Collect all eatery fields, scrape the reviews and return both.

        The method
          * stores a SHA-256 hex digest of ``eatery_id + eatery_url`` under
            ``self.eatery["__eatery_id"]``;
          * sets ``self.reviews_inDB`` from ``review_collection`` and
            ``self.previous_total_reviews`` from ``eatery_collection``
            (0 when the lookup fails);
          * runs every field extractor through ``process_result`` in a fixed
            order (presumably storing each result in ``self.eatery`` under
            the given key - confirm against ``process_result``);
          * calls ``self.get_reviews()`` and hands the soup to
            ``ZomatoReviews``.

        Returns:
            Tuple ``(self.eatery, reviews_data)`` where ``reviews_data`` is
            the ``reviews_data`` attribute of the ``ZomatoReviews`` instance.

        Raises:
            AssertionError: when ``self.eatery["location"]`` is ``None``
                after the extractors ran.
        """
        print("Found eatery_id==<<{eatery_id}>> and eatery_name==<<{eatery_name}>> time".format(
            eatery_id=self.eatery["eatery_id"],
            eatery_name=self.eatery["eatery_name"].encode("ascii", "ignore")))

        # Plain local name: a leading double underscore would be
        # name-mangled inside the class body.
        eatery_hash = hashlib.sha256(self.eatery["eatery_id"] +
                                     self.eatery["eatery_url"]).hexdigest()
        self.eatery.update({"__eatery_id": eatery_hash})

        self.reviews_inDB = review_collection.find({
            "eatery_id": self.eatery["eatery_id"]
        }).count()
        try:
            self.previous_total_reviews = eatery_collection.find_one({
                "eatery_id": self.eatery["eatery_id"]
            }).get("eatery_total_reviews")
        except Exception as e:
            # Any lookup failure falls back to "no previous reviews".
            print("Eatery easnt present earlier so setting previous total reviews to 0")
            print(e)
            self.previous_total_reviews = 0

        messege = "Number of reviews present in the database %s" % self.reviews_inDB
        print_messege("info", messege, "EateryData.run", None,
                      self.eatery["eatery_id"], self.eatery["eatery_url"],
                      None, FILE)

        # (key, method name) pairs, in the exact order they must run.
        field_extractors = (
            ("eatery_cost", "retry_eatery_cost"),
            ("eatery_trending", "retry_eatery_trending"),
            ("eatery_rating", "retry_eatery_rating"),
            ("eatery_cuisine", "retry_eatery_cuisine"),
            ("eatery_highlights", "eatery_highlights"),
            ("eatery_popular_reviews", "eatery_popular_reviews"),
            ("location", "eatery_longitude_latitude"),
            ("eatery_total_reviews", "eatery_total_reviews"),
            ("eatery_buffet_price", "eatery_buffet_price"),
            ("eatery_buffet_details", "eatery_buffet_details"),
            ("eatery_recommended_order", "eatery_recommended_order"),
            ("eatery_known_for", "eatery_known_for"),
            ("eatery_area_or_city", "eatery_area_or_city"),
            ("eatery_opening_hours", "eatery_opening_hours"),
            ("eatery_photo_link", "eatery_photo_link"),
            ("eatery_update_on", "eatery_update_on"),
        )
        for field, method_name in field_extractors:
            # The method is looked up only after process_result() has been
            # called, matching the original evaluation order.
            process_result(self.eatery, field,
                           FILE)(getattr(self, method_name))()

        assert self.eatery["location"] is not None

        # NOTE(review): get_reviews() returns None on its TypeError path;
        # that value is passed on to ZomatoReviews unchanged.
        review_soup = self.get_reviews()

        ins = ZomatoReviews(review_soup, self.eatery["eatery_area_or_city"],
                            self.eatery["eatery_id"],
                            self.eatery["eatery_url"])
        return (self.eatery, ins.reviews_data)