Example #1
import requests
import pandas as pd
from datetime import datetime
from bs4 import BeautifulSoup
# requests raises its own ConnectionError, which does not inherit from the
# builtin one, so import it explicitly for the except clauses below.
from requests.exceptions import ConnectionError

# Project-level helpers such as random_proxy(), get_restaurant_info(),
# connect_to_my_mongo_db() and the `prop` settings module are assumed to be
# defined elsewhere in the project.


def Zomato_Scrape_info(URL, proxy):
    try:

        headers = {
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
        }
        ip_text = requests.get('http://icanhazip.com',
                               proxies=proxy,
                               headers=headers)
        print('Browsing from - ', ip_text.content.decode('utf-8'))
        rest_info = get_restaurant_info(URL, proxy)
        # print('Inside Function: ',rest_info)
        return rest_info

    except ConnectionError:  # If error, find another proxy

        proxy = random_proxy()
        return Zomato_Scrape_info(URL, proxy)


def get_geocode(url, proxy):
    try:

        headers = {
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
        }
        r = requests.get(url, proxies=proxy, headers=headers)
        soup = BeautifulSoup(r.content, 'html.parser')
        map_container = soup.find('div', {'class': 'ui segment map_container'})
        if map_container is None:
            return 'Not Available'
        geo_link = map_container.find('a')
        geocode = geo_link.get_attribute_list('href')[0].split('/')[-1]
        # print(geocode)
        return geocode

    except ConnectionError:
        print('ConnectionError inside get_geocode() method, changing IP')
        return get_geocode(url, random_proxy())


def get_img_urls_from(url, proxy):
    try:
        img_url_list = []
        headers = \
            {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
        response = requests.get(url, headers=headers, proxies=proxy)
        soup = BeautifulSoup(response.content, 'html.parser')
        images_divs_container = soup.find(
            'div', {'class': 'photos_container_load_more inlineblock w100'})
        if images_divs_container is None:  # page has no photos section
            return 'Not Available'
        image_divs = images_divs_container.findChildren('div', recursive=False)
        if not image_divs:
            return 'Not Available'
        for div in image_divs[:10]:
            image_url_div = div.find('img')
            # print(image_url_div)
            image_url = image_url_div.get_attribute_list('data-original')[0]
            image_url = image_url.split('?')[0]
            # print(image_url)
            img_url_list.append(image_url)
        return ','.join(img_url_list)

    except ConnectionError:
        print('ConnectionError inside get_img_urls_from() method, changing IP')
        return get_img_urls_from(url, random_proxy())


URLS_df = pd.read_csv(prop.Restaurant_master_file_path)
# os.chdir('E:\\DSA Internship\\Web Scrapping\\Zomato Restaurants Web Scrapping\\Cities\\Hyderabad\\Error Urls\\')
# Error_URLS=[]
# with open('ErrorURLs.txt','r') as f:
#     for URL in f.read().splitlines():
#         Error_URLS.append(URL)

# rest_urls_list = ['https://www.zomato.com/hyderabad/kebab-e-bahar-taj-banjara-banjara-hills',
#                    'https://www.zomato.com/hyderabad/36-downtown-brew-pub-jubilee-hills',
#                    'https://www.zomato.com/hyderabad/huber-holly-banjara-hills',
#                   'https://www.zomato.com/hyderabad/behrouz-biryani-banjara-hills',
#                   'https://www.zomato.com/hyderabad/seasonal-tastes-the-westin-hitech-city',
#                   'https://www.zomato.com/rolling-stove-food-truck']  # list of Restaurant URLs

proxy = random_proxy()

start = datetime.now()
#for url in tqdm(URLS_df[30:50].values):
database = connect_to_my_mongo_db('Restaurants')
restaurant_mongo_master_collection = database.Restaurants_Info_Master

start_index = int(input('Input the Start Index of the URLs: '))
stop_index = int(input('Input the Stop Index of the URLs: '))

n = start_index
for url in URLS_df[start_index:stop_index].values:
    Not_properly_scrapped = True
    print('Scraping Restaurant Data of index ', n, ': ', url[0])

    while Not_properly_scrapped:
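        # The remainder of this loop is not shown in the original listing; the
        # following is a hedged sketch of the likely body, mirroring the retry
        # pattern used in Example #5 and assuming Zomato_Scrape_info() returns
        # a dict of restaurant fields on success.
        rest_info = Zomato_Scrape_info(url[0], proxy)
        if rest_info:
            restaurant_mongo_master_collection.insert_one(rest_info)
            Not_properly_scrapped = False
        else:
            proxy = random_proxy()  # retry the same URL through a new proxy
    n += 1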
Example #5
# Relies on the same imports and project-level helpers as Example #1, plus
# Create_Chrome_browser(), scrape_reviews_from_url() and
# get_mongo_rest_ref_by_url() for the Selenium-based review scraping.
def scrape_reviews(start_index, stop_index):

    URLS_df = pd.read_csv(prop.Restaurant_master_file_path)
    # URLS=URLS_df['URLS'].tolist()
    ErrorURLs = pd.read_csv(prop.ErrorURLs_file_path)
    error_url_list = ErrorURLs['Error_URLS'].tolist()
    reviews_folder_path = prop.Reviews_folder_path

    start = datetime.now()
    proxy = random_proxy()
    total_reviews_count = 0

    #Mongo_DB Connection...
    db = connect_to_my_mongo_db('Restaurants')
    mongo_reviews = db['Reviews_Master']

    for row in URLS_df[start_index:stop_index].values:
        begin = datetime.now()
        url = row[0]  # row[0] for url and row[1] for Rest_ID
        if url in error_url_list:
            print(
                'The current URL had errors while scraping its restaurant info, hence skipping this url: ',
                url)
            continue

        Mongo_ref = get_mongo_rest_ref_by_url(db, 'Restaurants_Info_Master',
                                              url)
        rest_review_url = url + '/reviews'
        not_scrapped = True
        while not_scrapped:  # repeat until a working proxy IP is found
            chrome_browser = Create_Chrome_browser(use_proxy=True,
                                                   proxy=proxy['http'])
            # print(url)
            reviews_df, success = scrape_reviews_from_url(
                chrome_browser, rest_review_url)

            if success:

                if len(reviews_df) == 0:
                    print('No reviews found; CSV file will not be generated...')
                    print('-' * 100, '\n')
                    chrome_browser.quit()
                    not_scrapped = False
                    continue
                Rest_ID = str(row[1])
                file_name = url.split('/')[-1]
                #print(file_name)
                #https://stackoverflow.com/questions/29815129/pandas-dataframe-to-list-of-dictionaries

                reviews_dict_list = reviews_df.to_dict('records')
                for review in reviews_dict_list:
                    review['Restaurant_Ref'] = Mongo_ref
                    mongo_reviews.insert_one(review)

                csv_file_name = reviews_folder_path + Rest_ID + '-' + file_name + '.csv'
                #print(row[1])
                print('Restaurant ID:', Rest_ID)
                reviews_df[
                    'Restaurant_ID'] = Rest_ID  # Copying the string values of IDs
                reviews_df['Review_ID'] = reviews_df[
                    'Restaurant_ID'] + '000' + reviews_df['ID']
                reviews_df = reviews_df[[
                    'Review_ID', 'Restaurant_ID', 'ID', 'review_title',
                    'user_importance', 'user_name', 'user_rating',
                    'user_review'
                ]]

                reviews_df.to_csv(csv_file_name, index=False)
                reviews_df.to_csv(
                    prop.Reviews_master_file_path,
                    mode='a',
                    header=False,
                    index=False
                )  # https://stackoverflow.com/questions/17530542/how-to-add-pandas-data-to-an-existing-csv-file
                chrome_browser.quit()
                not_scrapped = False
                print('-' * 100)
                print('{} has been created'.format(file_name + '.csv'))
                print('Time taken to scrape and create the above file: ',
                      datetime.now() - begin)
                print('\n\n')
                total_reviews_count += len(reviews_df)
            else:
                chrome_browser.quit()  # close the failed browser before retrying
                proxy = random_proxy()

        # with open('review_master.csv',mode='a',headers=False) as master:
    print('Total reviews scraped: ', total_reviews_count)
    print('Time taken to scrape all the reviews: ', datetime.now() - start)
    chrome_browser.quit()
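
Both examples depend on a random_proxy() helper that is not shown on this page. A minimal sketch, assuming the proxy pool is a simple list of hypothetical host:port strings and that callers expect the mapping format used by requests (and read as proxy['http'] when Create_Chrome_browser is called), might look like this:

import random

# Hypothetical proxy pool; in the real project these addresses would come from
# a proxy-list file or a free-proxy scraping step.
PROXY_POOL = [
    '203.0.113.10:8080',
    '203.0.113.11:3128',
]


def random_proxy():
    # Pick one proxy at random and return it in the dict format that
    # requests.get(..., proxies=proxy) accepts.
    address = random.choice(PROXY_POOL)
    return {'http': address, 'https': address}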