Code example #1
import json
import os
import threading

import requests
from bs4 import BeautifulSoup

# Restaurant, Dish, DynamoDBBatchWrite and WriteS3Parquet are project-local
# classes; their import paths are not shown in the original source.


class Swiggy:
    def __init__(self):
        self.CONFIG_FILE = "jwt-config.json"
        with open(self.CONFIG_FILE, 'r') as config_file:
            self.data_set = json.load(config_file)

        self.starter_config = self.data_set['SWIGGY']['CONFIG']['STARTER']

        self.config = self.data_set['SWIGGY']['CONFIG']
        self.dishes_data = []
        self.restaurants_data = []
        self.restaurants_obj = Restaurant(self.config)
        self.city = os.getenv('CITY', "jaipur")
        self.country = os.getenv('COUNTRY', "india")
        self.city_code = self.city + '__' + self.country
        self.dish_obj = Dish(self.config, self.city, self.country,
                             self.city_code)

    def get_data(self, url=None):
        # Fetch the landing page for the configured city and scrape its
        # subzone links.
        URL = self.starter_config["URLS"][self.city]
        r = requests.get(URL)

        soup = BeautifulSoup(r.content, 'html5lib')

        # Locate all subzone anchors using the selector defined in the config.
        TAG = self.starter_config['SELECTORS']['WAIT']['TAG']
        FIND_BY = self.starter_config['SELECTORS']['WAIT']['FIND_BY']
        VALUE = self.starter_config['SELECTORS']['WAIT']['VALUE']
        data = soup.find_all(TAG, attrs={FIND_BY: VALUE})

        # Build a list of {subzone, link} dicts from the scraped anchors.
        _list = []
        for row in data:
            link = row['href']
            subzone = row.text
            _list.append({
                'subzone': subzone,
                'link': "https://www.swiggy.com" + link
            })

        # Split the subzones into chunks and scrape each chunk in its own
        # thread; leftover items (when the list does not divide evenly) are
        # handled by one extra thread below.
        no_of_threads = 3
        last_chunk = -1
        subzone_batch_threads = []
        length_of_subzones = len(_list)
        chunk_size = length_of_subzones // no_of_threads
        print('chunk size:', chunk_size)

        if length_of_subzones >= no_of_threads:
            for i in range(no_of_threads):
                batch = _list[i * chunk_size:(i + 1) * chunk_size]
                subzone_batch_threads.append(
                    threading.Thread(target=self.get_restaurants_thread,
                                     args=(batch, )))
                subzone_batch_threads[-1].start()
                last_chunk = i

            # Remainder batch for items that did not divide evenly.
            last_chunk += 1
            if no_of_threads * chunk_size < length_of_subzones:
                batch = _list[last_chunk * chunk_size:length_of_subzones]
                subzone_batch_threads.append(
                    threading.Thread(target=self.get_restaurants_thread,
                                     args=(batch, )))
                subzone_batch_threads[-1].start()

            for thread in subzone_batch_threads:
                thread.join()
        else:
            # Too few subzones to bother with threads: scrape sequentially.
            for subzone in _list:
                self.restaurants_obj.get_restaurants(subzone['link'],
                                                     subzone['subzone'],
                                                     self.restaurants_data)

        self.get_dishes()
        print('dishes scraped:', len(self.dishes_data))

        # Load the scraped dishes into DynamoDB.
        self.dynamodb_batch_write_obj = DynamoDBBatchWrite()
        self.dynamodb_batch_write_obj.batch_write_to_ddb(self.dishes_data)

        # Ratings are converted to float before the Parquet write.
        for dish_data in self.dishes_data:
            dish_data['stars'] = float(dish_data['stars'])

        # Write the same data to S3 as Parquet.
        self.write_to_s3_parquet_obj = WriteS3Parquet(self.city)
        self.write_to_s3_parquet_obj.write_to_parquet(self.dishes_data)

    def get_restaurants_thread(self, subzones):
        # Worker run by each thread: scrape every subzone in its batch.
        for subzone in subzones:
            self.restaurants_obj.get_restaurants(subzone['link'],
                                                 subzone['subzone'],
                                                 self.restaurants_data)

    def get_dishes(self):
        # Same chunk-and-thread pattern as for subzones, this time over the
        # scraped restaurants.
        no_of_threads = 3
        last_chunk = -1
        restaurant_batch_threads = []
        length_of_restaurants_data = len(self.restaurants_data)
        chunk_size = length_of_restaurants_data // no_of_threads
        for i in range(no_of_threads):
            batch = self.restaurants_data[i * chunk_size:(i + 1) * chunk_size]
            restaurant_batch_threads.append(
                threading.Thread(target=self.get_dishes_thread,
                                 args=(batch, )))
            restaurant_batch_threads[-1].start()
            last_chunk = i

        # Remainder batch for items that did not divide evenly.
        last_chunk += 1
        if no_of_threads * chunk_size < length_of_restaurants_data:
            batch = self.restaurants_data[
                last_chunk * chunk_size:length_of_restaurants_data]
            restaurant_batch_threads.append(
                threading.Thread(target=self.get_dishes_thread,
                                 args=(batch, )))
            restaurant_batch_threads[-1].start()

        for thread in restaurant_batch_threads:
            thread.join()

    def get_dishes_thread(self, restaurants):
        # Worker run by each thread: fetch dish details for its batch.
        for restaurant in restaurants:
            self.dish_obj.get_details(restaurant, self.dishes_data)
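
A minimal way to drive the class above is the sketch below; it is not part of the original code. It assumes jwt-config.json and the project-local Restaurant/Dish/DynamoDBBatchWrite/WriteS3Parquet classes are importable, and that AWS credentials are configured for the DynamoDB and S3 writes; CITY and COUNTRY are optional environment variables defaulting to "jaipur" and "india".

# Hypothetical entry point for the Swiggy scraper defined above.
import os

if __name__ == '__main__':
    # Optional overrides; the class falls back to jaipur/india otherwise.
    os.environ.setdefault('CITY', 'jaipur')
    os.environ.setdefault('COUNTRY', 'india')

    scraper = Swiggy()   # loads jwt-config.json and reads CITY/COUNTRY
    scraper.get_data()   # subzones -> restaurants -> dishes -> DynamoDB + S3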
Code example #2
import json
import os
import threading

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.firefox.options import Options

# Restaurant, Dish, DynamoDBBatchWrite and WriteS3Parquet are project-local
# classes; their import paths are not shown in the original source.


class Zomato:
    def __init__(self):
        self.CONFIG_FILE = "jwt-config.json"
        with open(self.CONFIG_FILE, 'r') as config_file:
            self.data_set = json.load(config_file)

        self.starter_config = self.data_set['ZOMATO']['CONFIG']['STARTER']

        self.config = self.data_set['ZOMATO']['CONFIG']
        self.dishes_data = []
        self.restaurants_data = []
        self.restaurants_obj = Restaurant(self.config)
        self.city = os.getenv('CITY', "jaipur")
        self.country = os.getenv('COUNTRY', "india")
        self.city_code = self.city + '__' + self.country
        self.dish_obj = Dish(self.config, self.city, self.country,
                             self.city_code)

    def get_data(self, url=None):
        # Fetch the delivery landing page for the configured city.
        URL = self.starter_config["URLS"][self.city]
        print('city url:', self.city, URL)
        headers = {
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:67.0) Gecko/20100101 Firefox/67.0'
        }
        r = requests.get(URL, headers=headers)

        soup = BeautifulSoup(r.content, 'html5lib')

        TAG = self.starter_config['SELECTORS']['WAIT']['TAG']
        FIND_BY = self.starter_config['SELECTORS']['WAIT']['FIND_BY']
        VALUE = self.starter_config['SELECTORS']['WAIT']['VALUE']
        data = soup.find_all(TAG, attrs={FIND_BY: VALUE})

        # Build {subzone, link} pairs. The anchor text is split on spaces and
        # only the tokens before the first empty token (a double space in the
        # source text) are kept as the subzone name.
        _list = []
        for row in data:
            tokens = row.text.strip().split(' ')
            if '' in tokens:
                tokens = tokens[:tokens.index('')]
            subzone = " ".join(tokens)
            _list.append({
                'subzone': subzone,
                'link': "https://www.zomato.com/" + self.city +
                        "/delivery-in-" + subzone.lower().replace(' ', '-') +
                        "?ref_page=subzone"
            })

        print('number of subzones:', len(_list))

        # Split the subzones into chunks and scrape each chunk in its own
        # thread, as in the Swiggy example.
        no_of_threads = 3
        last_chunk = -1
        subzone_batch_threads = []
        length_of_subzones = len(_list)
        chunk_size = length_of_subzones // no_of_threads
        print('chunk size:', chunk_size)

        if length_of_subzones >= no_of_threads:
            for i in range(no_of_threads):
                batch = _list[i * chunk_size:(i + 1) * chunk_size]
                subzone_batch_threads.append(
                    threading.Thread(target=self.get_restaurants_thread,
                                     args=(batch, )))
                subzone_batch_threads[-1].start()
                last_chunk = i

            # Remainder batch for items that did not divide evenly.
            last_chunk += 1
            if no_of_threads * chunk_size < length_of_subzones:
                batch = _list[last_chunk * chunk_size:length_of_subzones]
                subzone_batch_threads.append(
                    threading.Thread(target=self.get_restaurants_thread,
                                     args=(batch, )))
                subzone_batch_threads[-1].start()

            for thread in subzone_batch_threads:
                thread.join()
        else:
            # Too few subzones to bother with threads: scrape sequentially.
            for subzone in _list:
                self.restaurants_obj.get_restaurants(subzone['link'],
                                                     subzone['subzone'],
                                                     self.restaurants_data)

        self.get_dishes()
        print('dishes scraped:', len(self.dishes_data))

        # Load the scraped dishes into DynamoDB.
        self.dynamodb_batch_write_obj = DynamoDBBatchWrite()
        self.dynamodb_batch_write_obj.batch_write_to_ddb(self.dishes_data)

        # Ratings are converted to float before the Parquet write.
        for dish_data in self.dishes_data:
            dish_data['stars'] = float(dish_data['stars'])

        # Write the same data to S3 as Parquet.
        self.write_to_s3_parquet_obj = WriteS3Parquet(self.city)
        self.write_to_s3_parquet_obj.write_to_parquet(self.dishes_data)

    def get_restaurants_thread(self, subzones):
        # Worker run by each thread: scrape every subzone in its batch.
        for subzone in subzones:
            self.restaurants_obj.get_restaurants(subzone['link'],
                                                 subzone['subzone'],
                                                 self.restaurants_data)

    def get_dishes(self):
        # Same chunk-and-thread pattern, this time over the scraped
        # restaurants.
        no_of_threads = 2
        last_chunk = -1
        restaurant_batch_threads = []
        length_of_restaurants_data = len(self.restaurants_data)
        chunk_size = length_of_restaurants_data // no_of_threads
        for i in range(no_of_threads):
            batch = self.restaurants_data[i * chunk_size:(i + 1) * chunk_size]
            restaurant_batch_threads.append(
                threading.Thread(target=self.get_dishes_thread,
                                 args=(batch, )))
            restaurant_batch_threads[-1].start()
            last_chunk = i

        # Remainder batch for items that did not divide evenly.
        last_chunk += 1
        if no_of_threads * chunk_size < length_of_restaurants_data:
            batch = self.restaurants_data[
                last_chunk * chunk_size:length_of_restaurants_data]
            restaurant_batch_threads.append(
                threading.Thread(target=self.get_dishes_thread,
                                 args=(batch, )))
            restaurant_batch_threads[-1].start()

        for thread in restaurant_batch_threads:
            thread.join()

    def get_dishes_thread(self, restaurants):
        # Worker run by each thread: open a headless Firefox per restaurant,
        # scrape its dish details, and always shut the browser down.
        for restaurant in restaurants:
            driver = None
            try:
                options = Options()
                options.headless = True
                driver = webdriver.Firefox(options=options)
                self.dish_obj.get_details(driver, restaurant, self.dishes_data)
            except Exception as e:
                print('error while getting dish:', e)
            finally:
                if driver is not None:
                    driver.quit()
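
Both examples repeat the same chunk-and-thread pattern: split a list into no_of_threads slices, start one thread per slice, start one more thread for any remainder, then join them all. As a rough sketch, that pattern could be factored into a small helper like the one below; run_in_batches and its parameters are hypothetical names, not part of the original code.

import threading


def run_in_batches(items, worker, no_of_threads=3):
    # Hypothetical helper: split `items` into `no_of_threads` chunks plus a
    # remainder chunk, run `worker(batch)` in a thread per chunk, then wait.
    chunk_size = len(items) // no_of_threads
    if chunk_size == 0:
        # Fewer items than threads: just run everything in the caller's thread.
        worker(items)
        return

    threads = []
    for i in range(no_of_threads):
        batch = items[i * chunk_size:(i + 1) * chunk_size]
        threads.append(threading.Thread(target=worker, args=(batch, )))
        threads[-1].start()

    # Remainder batch, mirroring the extra thread in the original code.
    if no_of_threads * chunk_size < len(items):
        batch = items[no_of_threads * chunk_size:]
        threads.append(threading.Thread(target=worker, args=(batch, )))
        threads[-1].start()

    for thread in threads:
        thread.join()

With such a helper, Swiggy.get_data could call run_in_batches(_list, self.get_restaurants_thread) and Zomato.get_dishes could call run_in_batches(self.restaurants_data, self.get_dishes_thread, no_of_threads=2) instead of duplicating the batching logic.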