def process(id):
    """Return the sentiment of a business's Yelp reviews as a string.

    param: id = Yelp business id to look up (name kept for backward
           compatibility even though it shadows the builtin ``id``)
    return: str(sentiment.getSentiment(...)) computed over the review texts
    """
    # SECURITY NOTE(review): this API key is hard-coded and committed with
    # the source. It should be revoked and loaded from configuration or an
    # environment variable instead.
    yp = YelpAPI(
        'zmd9y3Q30Zj7Ekoh8sokT1bmzw4hWXNfzpjbnjSV5GXhX6v6gKslsx7T645Dm4rBMCv-x5ZKAM_0l7-FlFJS76ev43IWXnDcwyoOwIRVZh2SGyLne_jzL3-LHAbGXHYx'
    )
    response = yp.reviews_query(id)
    # Comprehension replaces the manual append loop.
    texts = [item['text'] for item in response['reviews']]
    return str(sentiment.getSentiment(texts))
def yelpJSON(label):
    """Build a CORS-enabled JSON response of a business's details and reviews.

    param: label = Yelp business id/alias to look up
    return: Flask response wrapping the business_query() result with the
            reviews_query() payload attached under 'review_list'
    """
    # BUG FIX: the original called label.encode('utf-8') here — a Python 2
    # leftover. Under Python 3 that yields a bytes object while the Yelp
    # client expects str, so the encode step is dropped.
    yelp_api = YelpAPI(application.config['YELP_API_KEY'])
    result = yelp_api.business_query(label)
    reviews = yelp_api.reviews_query(label)
    result['review_list'] = reviews
    response = jsonify(result)
    # Allow the configured front-end origin to consume this endpoint.
    response.headers.add('Access-Control-Allow-Origin',
                         application.config['ALLOWED_DOMAIN'])
    return response
def getYelpData(phoneNums):
    """
    This function takes in a list of phone numbers and looks them up in Yelp
    Fusion using yelpapi. It then returns 2 dictionaries of the business'
    information and up to three pertinent user reviews of the business.

    param: phoneNums = List of phone numbers
    return: phoneDict = Dictionary mapping a phone number to a list.
            list[0] = the business' rating
            list[1] = the business' review count
            list[2] = the business' url
    return: reviewDict = Dictionary mapping a phone number to a list of
            reviews (up to 3 reviews available, chosen by Yelp algorithm).
            Each entry is [rating, text, time_created (IN PACIFIC TIME)].
    """
    yelp_api = YelpAPI(APIKEY)
    phoneDict = dict()
    reviewDict = dict()
    for phoneNum in phoneNums:
        response = yelp_api.phone_search_query(phone=str(phoneNum))
        # If the phone number is listed in Yelp, store rating, review_count,
        # url in a list mapped to the phone number. Then use the business id
        # from the response to collect the reviews.
        if response['total'] != 0:
            business = response['businesses'][0]
            phoneDict[phoneNum] = [
                business['rating'], business['review_count'], business['url']
            ]
            # Get reviews using company id and store in reviewDict
            companyID = str(business['id'])
            reviewResponse = yelp_api.reviews_query(id=companyID)
            # BUG FIX: the original loop overwrote reviewDict[phoneNum] on
            # every iteration, keeping only the LAST review even though the
            # contract promises up to 3. Accumulate all of them instead.
            reviewDict[phoneNum] = [
                [review['rating'], review['text'], review['time_created']]
                for review in reviewResponse['reviews']
            ]
    return phoneDict, reviewDict
def getYelpData(phoneNums):
    """Look up phone numbers in Yelp Fusion.

    param: phoneNums = Numpy array of phone numbers
    return: phoneDict = phone number -> [rating, review_count, url]
    return: reviewDict = phone number -> list of [rating, text, time_created]
            entries (up to 3 reviews, chosen by the Yelp algorithm)
    """
    yelp_api = YelpAPI(APIKEY)
    phoneDict = dict()
    reviewDict = dict()
    for phoneNum in phoneNums:
        response = yelp_api.phone_search_query(phone=str(phoneNum))
        # If the phone number is listed in Yelp, store rating, review_count,
        # url in a list mapped to the phone number, then fetch its reviews
        # via the business id.
        if response['total'] != 0:
            business = response['businesses'][0]
            phoneDict[phoneNum] = [
                business['rating'], business['review_count'], business['url']
            ]
            # Get reviews using company id and store in reviewDict
            companyID = str(business['id'])
            reviewResponse = yelp_api.reviews_query(id=companyID)
            # BUG FIX: the original loop overwrote reviewDict[phoneNum] per
            # review, silently keeping only the last one. Keep them all.
            reviewDict[phoneNum] = [
                [review['rating'], review['text'], review['time_created']]
                for review in reviewResponse['reviews']
            ]
    return phoneDict, reviewDict
def get_restaurants_and_reviews_from_yelp(type, location):
    """Fetch up to 5 businesses matching `type` near `location`, plus reviews.

    param: type = search term (0=restaurants, 1=reviews per original note;
           name kept even though it shadows the builtin ``type`` because it
           is part of the public signature)
    param: location = location string passed to the Yelp search API
    return: (restaurants, reviews) — raw business dicts and a flat list of
            every review returned for those businesses
    """
    yelp_api = YelpAPI(MY_API_KEY)
    search_response = yelp_api.search_query(term=type, location=location, limit=5)
    restaurants = search_response['businesses']
    # Gather the ids, then flatten each business's reviews into one list.
    restaurant_ids = [business['id'] for business in restaurants]
    reviews = []
    for business_id in restaurant_ids:
        review_response = yelp_api.reviews_query(id=business_id)
        reviews.extend(review_response['reviews'])
    return restaurants, reviews
""" print("***** business information for Amy's on 6th St. *****\n{}\n".format( "yelp_api.business_query(id='amys-ice-creams-austin-3')")) response = yelp_api.business_query(id='amys-ice-creams-austin-3') pprint(response) print( '\n-------------------------------------------------------------------------\n' ) """ Example reviews query. Reviews API: https://www.yelp.com/developers/documentation/v3/business_reviews """ print("***** selected reviews for Amy's on 6th St. *****\n{}\n".format( "yelp_api.reviews_query(id='amys-ice-creams-austin-3')")) response = yelp_api.reviews_query(id='amys-ice-creams-austin-3') pprint(response) print( '\n-------------------------------------------------------------------------\n' ) """ Example autocomplete query. Autocomplete API: https://www.yelp.com/developers/documentation/v3/autocomplete centroid: https://www.flickr.com/places/info/2427422 """ print("***** autocomplete results for 'Hambur' in Iowa City *****\n{}\n".format( "yelp_api.autocomplete_query(text='Hambur', longitude=-91.5327, latitude=41.6560)" )) response = yelp_api.autocomplete_query(text='Hambur', longitude=-91.5327,
from yelpapi import YelpAPI
from pprint import pprint

# An API key is required (Developer Beta).
# ref: https://www.yelp.com/developers/v3/manage_app
API_KEY = 'app_secret'  # placeholder — substitute a real Fusion API key
yelp_api = YelpAPI(API_KEY)

"""
Example reviews query.

Reviews API: https://www.yelp.com/developers/documentation/v3/business_reviews
"""

# Business id queried below; hoisted so the call site reads cleanly.
BUSINESS_ID = 'universal-property-and-casualty-insurance-company-fort-lauderdale'

print("***** selected reviews for Universal Property and Casualty Insurance Company in Fort Lauderdale. *****\n{}\n".format("yelp_api.reviews_query(id='universal-property-and-casualty-insurance-company-fort-lauderdale')"))
try:
    reviews_response = yelp_api.reviews_query(id=BUSINESS_ID)
    pprint(reviews_response)
except YelpAPI.YelpAPIError as err:
    # Surface API-side failures (bad key, unknown business) without crashing.
    print(err)
print('\n-------------------------------------------------------------------------\n')
""" print("***** business information for Amy's on 6th St. *****\n{}\n".format("yelp_api.business_query(id='amys-ice-" "creams-austin-3')")) response = yelp_api.business_query(id='amys-ice-creams-austin-3') pprint(response) print('\n-------------------------------------------------------------------------\n') """ Example reviews query. Reviews API: https://www.yelp.com/developers/documentation/v3/business_reviews """ print("***** selected reviews for Amy's on 6th St. *****\n{}\n".format("yelp_api.reviews_query(id='amys-ice-" "creams-austin-3')")) response = yelp_api.reviews_query(id='amys-ice-creams-austin-3') pprint(response) print('\n-------------------------------------------------------------------------\n') """ Example autocomplete query. Autocomplete API: https://www.yelp.com/developers/documentation/v3/autocomplete centroid: https://www.flickr.com/places/info/2427422 """ print("***** autocomplete results for 'Hambur' in Iowa City *****\n{}\n".format("yelp_api.autocomplete_query(" "text='Hambur', longitude=-91.5327, " "latitude=41.6560)")) response = yelp_api.autocomplete_query(text='Hambur', longitude=-91.5327, latitude=41.6560) pprint(response)
from pprint import pprint # In[6]: yelp_api = YelpAPI( "6JFAZOLb4tCd1IbWWsL6fGph_KpZQW4z5QRmrIXR0H9X23d1jDxnORB0uYrAgGSHVhtCeqjj1W-VHRGEr0zqjPKwtbcglOyiOZQ3yCgQhI7N6tYDLcOal4DqI1snW3Yx" ) # In[66]: search_results = yelp_api.search_query(term="Filipino", location=["Renton, Wa", "seattle, wa"]) # In[23]: response = yelp_api.reviews_query(id='OqrtfhUcN_El1ClubBAVPQ') # In[46]: pprint(search_results.get('businesses')[0]['alias']) # In[83]: storelist = [] locationlist = [ "Renton, Wa", "Seattle, WA", "Tacoma, WA", "Everett, WA", "Federal Way, WA" ] for location in locationlist: search_results = yelp_api.search_query(term="Filipino", location=location) for i in range(len(search_results.get('businesses'))): storelist.append(
class YelpData(object):
    """
    Handles the Yelp Fusion calls used by the app and archives every raw
    response in MongoDB before returning it.

    Business API        - business_query()
    Business Match API  - business_match_query()
    Reviews API         - reviews_query()
    """

    def __init__(self):
        # Mongo collections used to archive raw Yelp responses per user.
        self.business_match = mongodb.db.business_match
        self.business_details = mongodb.db.business_details
        self.business_reviews = mongodb.db.business_reviews
        self.yelp_req = mongodb.db.yelp_request
        self.yelp_api = YelpAPI(app.config['YELP_API_KEY'])
        self.response = None

    @staticmethod
    def _remove_keys(json_data):
        """Strip internal bookkeeping keys before returning data to callers."""
        del json_data['user_id']
        del json_data['_id']
        return json_data

    def get_business_match_data(self, user_id=None, name=None, address1='',
                                address2=None, address3=None, city=None,
                                state=None, country=None, latitude=None,
                                longitude=None, phone=None, zip_code=None,
                                yelp_business_id=None, limit=1,
                                match_threshold='default'):
        """
        Link: https://www.yelp.com/developers/documentation/v3/business_match
        required parameters:
            * name - business name
            * city
            * state
            * country
        """
        self.response = self.yelp_api.business_match_query(
            name=name,
            address1=address1,
            address2=address2,
            address3=address3,
            city=city,
            state=state,
            country=country,
            latitude=latitude,
            longitude=longitude,
            phone=phone,
            zip_code=zip_code,
            yelp_business_id=yelp_business_id,
            limit=limit,
            match_threshold=match_threshold)
        self.response['user_id'] = user_id
        # Archive first; insert_one mutates the dict (adds '_id'), which
        # _remove_keys then strips along with user_id.
        self.business_match.insert_one(self.response)
        self.response = self._remove_keys(self.response)
        return self.response

    def get_business_details(self, business_id, user_id):
        """Fetch, archive and return the details of one business."""
        self.response = self.yelp_api.business_query(id=business_id)
        self.response['user_id'] = user_id
        self.business_details.insert_one(self.response)
        self.response = self._remove_keys(self.response)
        return self.response

    def get_business_reviews(self, business_id, user_id):
        """Fetch, archive and return the reviews of one business."""
        # BUG FIX: the original queried id=self.business_id — an attribute
        # that is never assigned anywhere in this class — so every call
        # raised AttributeError. Use the business_id parameter instead.
        self.response = self.yelp_api.reviews_query(id=business_id)
        self.response['user_id'] = user_id
        self.business_reviews.insert_one(self.response)
        self.response = self._remove_keys(self.response)
        return self.response

    def yelp_request(self, yelp_request, user_id):
        """Timestamp and persist a raw request record; return ack status."""
        yelp_request['req_datetime'] = datetime.datetime.now()
        yelp_request['user_id'] = user_id
        return self.yelp_req.insert_one(yelp_request).acknowledged
# Drop columns we have no use for downstream.
# FIX: pandas 2.0 removed the positional `axis` argument of drop();
# use the explicit columns= keyword instead of drop(cols, 1).
unecessary_cols = ['phone', 'display_phone', 'transactions', 'is_closed', 'image_url']
business_df2 = business_df.drop(columns=unecessary_cols)

# Loop through businesses, pulling the (at most 3) reviews the Yelp API
# exposes per business and annotating them for the master frame.
reviews = dict()
reviews_df = pd.DataFrame()
for iBiz, biz_id in enumerate(business_df2.loc[:, 'id']):
    business_name = business_df2['name'][iBiz]
    # Only 3 reviews are available through the Yelp API; the business url
    # would allow scraping more ("not legally").
    reviews[business_name] = yelp_api.reviews_query(biz_id)
    # Temporary frame, appended to a master one later by surrounding code.
    temp_df = pd.DataFrame.from_dict(reviews[business_name]['reviews'])
    temp_df = temp_df.drop(columns='user')
    # Annotate each review row with provider name, business id and location.
    temp_df.insert(0, 'ISP_name', business_name)
    temp_df.insert(1, 'business_id', biz_id)
    temp_df.insert(
        6, 'location',
        str(business_df2[business_df2['id'] == biz_id]['location'].item()['display_address']))
# BUG FIX: random.shuffle() is called below but `random` was never imported
# in this file, raising NameError at runtime.
import random

from yelpapi import YelpAPI
from misty.utils import print_and_say

# SECURITY NOTE(review): this key is committed to source control — revoke it
# and load it from configuration instead.
YELP_API_KEY = 'h81ylaT0alwtJCUUyI7RazCCHNHleVGnhD9ZONPT1s4kL9v5qhCXPZrcI20H4LYisDEjJZu_j4ibEsSTpM2ISDpWBeraK3t42rwV_PhxtYvmatDn2xquIUKdueYtYHYx'  # plz no steal my api keyz!

client = YelpAPI(YELP_API_KEY)

biz_ids = [
    'pierce-j-r-plumbing-co-inc-of-sacramento-rocklin',
    'ncm-roseville',
]
# Randomize the presentation order of the businesses.
random.shuffle(biz_ids)

for biz_id in biz_ids:
    result = client.business_query(id=biz_id)
    print_and_say(
        f"{result['name']}. Phone number: {result['display_phone']}. Address: {''.join(result['location']['display_address'])}",
        next_voice=True)
    reviews = client.reviews_query(id=result['id'])
    print_and_say(
        f"Retrieved {len(reviews['reviews'])} of {reviews['total']} reviews.",
        next_voice=True)
    for review in reviews['reviews']:
        print_and_say(
            f"On {review['time_created']} {review['user']['name']} gave a rating of {review['rating']} stars, stating: {review['text']}.",
            next_voice=True)
class scrappers: data_path = "././data/raw" def __init__(self): __dir_path = os.path.dirname(os.path.realpath(__file__)) credentials = get_credidentials() self.twitter_premium_api = load_credentials( filename="{}/{}".format(__dir_path, "twitter_keys.yaml"), yaml_key="search_tweets_api_30day") self.twitter_api = Twitter(auth=OAuth( consumer_key=credentials['twitter']['consumer_key'], consumer_secret=credentials['twitter']['consumer_secret'], token=credentials['twitter']['access_token_key'], token_secret=credentials['twitter']['access_token_secret'])) self.yelp_api = YelpAPI(credentials['yelp']['api_key']) self.__data_path = "../data/raw" logger.info("initiation started.") def tw_verify_credentials(self): obj = self.twitter_api.VerifyCredentials() print(json.dumps(obj._json, indent=4, sort_keys=True)) def tw_get_statuses(self, user_list): for username in user_list: with open(f'datasets/tw_{username}_statuses.json', 'w') as f: try: f.write('{"statuses": [') max_id = 0 while (True): # status scheme available at: https://developer.twitter.com/en/docs/tweets/timelines/api-reference/get-statuses-user_timeline.html statuses = self.twitter_api.GetUserTimeline( screen_name=username, count=100, max_id=max_id) if len(statuses) == 1 and statuses[0].id == max_id: break else: for status in statuses: if status.id != max_id: f.write("%s," % json.dumps(status._json)) max_id = statuses[-1].id finally: max_id != 0 and f.seek(f.tell() - 1, os.SEEK_SET) f.write("]}") def tw_get_search(self, user_list): for user_name, keyword_list in user_list.items(): with open(f'datasets/tw_{user_name}_searches.json', 'w') as f: try: f.write('{"statuses": [') max_id = 0 user = self.twitter_api.GetUser(screen_name=user_name) keyword_list.append(f'{user.name}') keyword_list.append(f'{user_name}') keyword_list.append(f'#{user_name}') keyword_list.append(f'@{user_name}') term = '{}'.format(' OR '.join(keyword_list)) while (True): # status scheme available at: 
https://developer.twitter.com/en/docs/tweets/search/api-reference/get-search-tweets.html statuses = self.twitter_api.GetSearch( term=term.encode('utf-8'), geocode=None, count=100, max_id=max_id) if (len(statuses) == 1 and statuses[0].id == max_id) or statuses == []: break else: for status in statuses: if status.id != max_id: """status_text = json.dumps(status._json) status_json = json.loads(status_text) status_json['keyword'] = keyword""" f.write("%s," % json.dumps(status._json)) max_id = statuses[-1].id finally: max_id != 0 and f.seek(f.tell() - 1, os.SEEK_SET) f.write("]}") def tw_get_premium_search(self, keyword: str): with open(f'datasets/tw_{keyword.lower()}_searches_premium.json', 'w') as f: try: f.write('{"statuses": [') rule = gen_rule_payload( pt_rule="near:\"New York, NY\" within:50mi".format(), results_per_call=100, from_date="2018-07-01", to_date="2018-10-01") rule = gen_rule_payload( pt_rule="place:\"New York, NY\"".format(), results_per_call=100, from_date=(datetime.date.today() - datetime.timedelta(31)).isoformat(), to_date=datetime.date.today().isoformat()) next_token = None while True: results = ResultStream(rule_payload=rule, **self.twitter_premium_api) results.next_token = next_token tweets = [] try: tweets = list(results.stream()) except Exception as ex: print(str(ex)) for tweet in tweets: f.write("%s," % json.dumps(tweet)) if results.next_token is None: break else: next_token = results.next_token next_token is not None and f.seek(f.tell() - 1, os.SEEK_SET) f.write("]}") except Exception as ex: print("Error:\n" + str(ex)) def yp_get_businesses(self, business_list): """ Get reviews for each business in the business_list and creates separate data files. 
File Type: JSON """ for business in business_list: with open(f'{self.data_path}/yp_{business}_competitors.json', 'w') as f: try: f.write('{"businesses": [') branch = self.yelp_api.business_query(business) offset = 0 while (True): try: # status scheme available at: # https://www.yelp.com/developers/documentation/v3/business_search competitors = self.yelp_api.search_query( longitude=branch['coordinates']['longitude'], latitude=branch['coordinates']['latitude'], radius=40000, # categories='bars,french' sort_by='distance', limit=50, offset=offset) f.write("%s," % json.dumps(competitors['businesses'])) offset = offset + 50 except self.yelp_api.YelpAPIError: break finally: offset != 0 and f.seek(f.tell() - 1, os.SEEK_SET) f.write("]}") def yp_get_competitors(self, business_list): """ Gets business list in consideration to the existing business list file. Adds any additional business, if it is not recorded yet. """ file_path = fp.yp_raw_competitors(self.data_path) index_list = [] existing_list = [] """ if os.path.exists(file_path): with open(file_path, 'r') as f: current_file = f.readlines() if len(current_file) > 0: existing_list = json.loads(current_file[0]) index_list = [_business["alias"] for _business in existing_list] logger.info(f"existing file found: {len(index_list)} total entries") """ with open(file_path, 'w') as f: # find businesses for business in business_list: new_list = [] try: logger.info(f"import started for : {business}") branch = self.yelp_api.business_query(business) offset = 0 while (True): try: # status scheme available at: # https://www.yelp.com/developers/documentation/v3/business_search competitors = self.yelp_api.search_query( longitude=branch['coordinates']['longitude'], latitude=branch['coordinates']['latitude'], radius=40000, # categories='bars,french' sort_by='distance', limit=50, offset=offset) # add alias name for distance measurement as dist_to_alias businesses = competitors["businesses"] [ i.update({"dist_to_alias": business}) for i in 
businesses ] for i in businesses: if i['alias'] not in index_list: new_list.append(i) index_list.append(i['alias']) offset = offset + 50 except self.yelp_api.YelpAPIError: break finally: existing_list.extend(new_list) logger.info( f"import completed. existing: {len(existing_list)} new: {len(new_list)}" ) # saving into file json.dump(existing_list, f) def yp_get_business_reviews(self, business_list): """ Gets three reviews from the yelp api. """ for business in business_list: with open(f'{self.data_path}/yp_{business}_rws.json', 'w') as f: try: f.write('{"reviews": [') offset = 0 while (True): reviews_set = self.yelp_api.reviews_query( business, limit=5, offset=offset) reviews = reviews_set['reviews'] if len(reviews) > 0: for review in reviews: f.write("%s,\n" % review) offset = offset + 5 else: break finally: offset != 0 and f.seek(f.tell() - 1, os.SEEK_SET) f.write("]}") def yp_get_competitor_reviews(self, business_list=None, start_index=0, end_index=5): """ Gets reviews by scraping through the site. Reviews are saved by business name and reviews. Uses Competitors reviews file as default file. Given index controls regions of Competitors. 
business_list: None or List start_index: int, interested region's starting index end_index: int, interested region's ending index File Type: CSV """ file_path = fp.yp_raw_competitors_reviews(self.data_path) headers = { 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36' } columns = [ 'alias', 'ratingValue', 'dataPublished', 'description', 'author' ] df: pd.DataFrame # getting competitors list businesses_file_path = fp.yp_raw_competitors(self.data_path) businesses_index_list = [] if os.path.exists(businesses_file_path): with open(businesses_file_path, 'r') as f: current_file = f.readlines() if len(current_file) > 0: businesses_index_list = [ _business["alias"] for _business in json.loads(current_file[0]) ] # needed every time if os.path.exists(file_path): with open(file_path, 'r') as f: df = pd.read_csv(file_path) logger.info( f"existing file found. total reviews count: {len(df)}") # need only once, if file doesn't exists if os.path.exists(file_path) is False: with open(file_path, 'w') as f: writer = csv.writer(f) writer.writerow(columns) logger.info("file created at: {}".format(file_path)) # ops with open(file_path, 'a', newline='') as f: if business_list is None: business_list = businesses_index_list current_index = start_index - 1 for business in business_list[start_index:end_index]: cnt_imported = 0 current_index = current_index + 1 logger.info(f"index: {current_index} of {end_index - 1}") try: writer = csv.writer(f) logger.info(f"import started for : {business}") start = 0 cnt_requests = 0 while (True): url = '{}/{}?sort_by=date_desc&start={}'.format( 'https://www.yelp.com/biz', business, start) response = requests.get(url, headers) soup = BeautifulSoup(response.text, 'html.parser') html_script = soup.findAll( 'script', {'type': 'application/ld+json'})[-1] obj = json.loads(html_script.string) reviews = obj['review'] if len(reviews) > 0: for review in reviews: data = [ business, 
review['reviewRating']['ratingValue'], review['datePublished'], review['description'], review['author'] ] check = np.array(data, dtype='O') if not (df.values == check).all(1).any(): writer.writerow(data) cnt_imported = cnt_imported + 1 start = start + 20 cnt_requests = cnt_requests + 1 else: logger.info( f"import completed. total reviews cnt: {cnt_imported} total request cnt: {cnt_requests}" ) break except Exception as ex: logger.warning( f"error: alias: {business} index: {current_index} total reviews cnt: {cnt_imported}" ) logger.warning(f"error message: {ex}") logger.warning("Let me sleep for some time..") second = int(round(random.expovariate(1) * 100)) time.sleep(second) logger.warning( f"{second} seconds slept, now back on scrapping..") continue def yp_get_business_reviews2(self, business_list): """ Gets reviews by scraping through the site. """ for business in business_list: headers = { 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36' } with open(f'{self.data_path}/yp_{business}_rws.json', 'w') as f: try: f.write('{"reviews": [') start = 0 while (True): url = '{}/{}?sort_by=date_desc&start={}'.format( 'https://www.yelp.com/biz', business, start) response = requests.get(url, headers) soup = BeautifulSoup(response.text, 'html.parser') html_script = soup.find( 'script', {'type': 'application/ld+json'}) obj = json.loads(html_script.string) reviews = obj['review'] if len(reviews) > 0: for review in reviews: data = { 'ratingValue': review['reviewRating']['ratingValue'], 'datePublished': review['datePublished'], 'description': review['description'], 'author': review['author'] } f.write("%s," % json.dumps(data)) start = start + 20 else: break finally: start != 0 and f.seek(f.tell() - 1, os.SEEK_SET) f.write("]}") with open(f'datasets/yp_businesses.json', 'a') as f: obj['review'] = []
from yelpapi import YelpAPI

# Client authenticated with the account's Fusion API key.
API_KEY = '9pdRr_PCd8m5NcXB6bb6cdRmKgeXzz-TUrQfdn8lCZrdEVFqMa3YSygop0Mnp-xKUjTzBZv5VsGh3mW8bE_dM5d24Y7u0cDhKMNG40AqRUTYjc1PSlXaWA1ce9UsXXYx'
yelp_api = YelpAPI(API_KEY)

# Issue the search (its result is bound but otherwise unused here), then
# print the reviews for one specific business.
search_response = yelp_api.search_query(term='olive garden',
                                        location='san jose',
                                        limit='5')
reviews = yelp_api.reviews_query(id="kyCAxIcS_axZB12EtHV1FA")
print(reviews)
# Accumulators for the pipeline below.
# BUG FIX: business_id and review_result were appended to without ever being
# initialised in this script (only business_results_final was), raising
# NameError on first use.
business_results_final = []
business_id = []
review_result = []

argparser = argparse.ArgumentParser(description='Example Yelp queries using yelpapi. Visit https://www.yelp.com/developers/v3/manage_app to get the necessary API keys.')
argparser.add_argument('--client_id', type=str, help='Yelp Fusion API client ID')
argparser.add_argument('--client_secret', type=str, help='Yelp Fusion API client secret')
args = argparser.parse_args()

# NOTE(review): current yelpapi releases take a single API key —
# YelpAPI(api_key); the two-argument id/secret form is the legacy auth.
# Confirm which yelpapi version this project pins.
yelp_api = YelpAPI(args.client_id, args.client_secret)

response = yelp_api.search_query(term='restaurant', location='houston, tx',
                                 sort_by='rating', limit=50)
with open('yelp_{}_{}.json'.format('Rest1', 'HOU'), 'w') as f:
    json.dump(response, f, indent=5)
print('\n-------------------------------------------------------------------------\n')

# Collect every business id from the search results.
for business in response['businesses']:
    business_id.append(business['id'])

# Fetch the review payload for each business.
for biz_id in business_id:
    business_results = yelp_api.reviews_query(id=biz_id)
    business_results_final.append(business_results)

# Round-trip through JSON into a DataFrame, then flatten the review texts.
dump1 = json.dumps(business_results_final, indent=5)
df = pd.read_json(dump1)
for item in df.values:
    for review in item[0]:
        review_result.append(repr(review['text'].replace('\n', '').encode('utf-8')))

with open("YelpReviews.txt", 'w') as out_file:
    for line in review_result:
        out_file.write(line + "\n")

print("DONE")
city_name_country = city_name_split[1].rsplit(",", 1) city_name = city_name_country[0] city_country = city_name_country[1] city_key = city_name_split[0] cityStr = city_name.split(",")[0] city_code = city_key print(city_code + " " + cityStr + " " + city_country + " ///" + city_name) response = yelp.search_query(term='food', location=city_name, price='1,2,3,4', sort_by='best_match', limit=10) data = json.dumps(response) restaurants_json = json.loads(data) city_dict = {} city_dict['businesses'] = [] for p in restaurants_json['businesses']: if ('price' in p): review = yelp.reviews_query(id=p['id'], sort_by='rating', limit=3) details = yelp.business_query(id=p['id']) data = json.dumps(review) detaildata = json.dumps(details) reviews_json = json.loads(data) details_json = json.loads(detaildata) reviews_dict = {} reviews_dict['reviews'] = [] for q in reviews_json['reviews']: reviews_dict['reviews'].append({ 'user_name' : q['user']['name'], 'text' : q['text'], 'stars' : q['rating'] }) doc_dict = {} doc_dict['name'] = p['name'],
def Yelp_ScrapeISP(api_key, city_names, business_data='businesses.csv', business_reviews='businesses_reviews.csv'):
    """
    ====================================================================
    Version: 1.0.1
    Date: Tue 24 Nov 2020

    Purposes: Search and save yelp data about internet service providers
    within a region.

    Input:
        Required:
            api_key = Api key assigned by yelp fusion
            city_names = List of locations for internet service providers
        Optional:
            business_data = .csv file containing data from previous searches
            business_reviews = .csv file containing reviews of businesses
                               from previous searches

    Output:
        'businesses.csv' containing information about internet service
        providers
        'businesses_reviews.csv' containing reviews of internet service
        providers in the businesses.csv file
        'cities_list.txt' list of previous cities that have been searched.
        Data will only be extracted for cities that have not been previously
        searched.

    Example:
        ca_cities_df = pd.read_csv('cal_cities_lat_long.csv')
        ca_cities = ca_cities_df['Name'] + ', CA'
        api_key = XIXIXIXLXJO
        Yelp_ScrapeISP(api_key, ca_cities)

    Author: Jordan Garrett
    [email protected]
    ====================================================================
    """
    # NOTE(review): this function relies on pandas 1.x behavior —
    # DataFrame.append() and the positional `axis` argument of drop() were
    # removed in pandas 2.0. Confirm the project pins pandas < 2.
    data_dir = os.path.join(os.getcwd(), 'Yelp_Data\\')

    # check to see if any cities in the list have previously been searched
    if os.path.exists(data_dir + 'cities_list.txt'):
        try:
            prev_cities = pickle.load(open(data_dir + 'cities_list.txt', 'rb'))
        except EOFError:
            # Empty file: treat as no previous searches.
            prev_cities = []
        city_names = [city for city in city_names if city not in prev_cities]
        if city_names:
            print(f'Searching Cities: {city_names}')
        else:
            print('All cities have already been searched')
            return

    # NOTE(review): if cities_list.txt does NOT exist, prev_cities is never
    # bound, so the pickle.dump(prev_cities + city_names, ...) inside the
    # loop below raises NameError on the first ever run — confirm intended.
    yelp_api = YelpAPI(api_key)

    # add in pauses to prevent stop errors from too much scraping
    time.sleep(3)

    all_business_df = pd.DataFrame()
    all_reviews_df = pd.DataFrame()

    # NOTE(review): only EOFError is caught here; a missing
    # failed_searches.txt raises FileNotFoundError — verify the file is
    # guaranteed to exist beforehand.
    try:
        failed_searches = pickle.load(
            open(data_dir + 'failed_searches.txt', 'rb'))
    except EOFError:
        failed_searches = []

    n_failed_searches = len(failed_searches)

    for iCity in city_names:
        try:
            # we can play around with the limit and offset parameters
            # to control the number of results and what item to start the pull on
            search_results = yelp_api.search_query(
                term='Internet Service Providers', location=iCity, limit=50)

            time.sleep(3)

            business_df = pd.DataFrame.from_dict(search_results['businesses'])

            # some regions may return empty results
            if business_df.empty:
                print(f'No data from: {iCity}')
                continue

            # drop the phone, display_phone, transactions, is_closed, and image_url columns
            # we shouldn't need them
            unecessary_cols = [
                'phone', 'display_phone', 'transactions', 'is_closed',
                'image_url'
            ]
            business_df2 = business_df.drop(unecessary_cols, 1)

            # loop through businesses
            reviews = dict()
            reviews_df = pd.DataFrame()
            for iBiz, biz_id in enumerate(business_df2.loc[:, 'id']):
                business_name = business_df2['name'][iBiz]

                # can only get 3 reviews through yelp api
                # BUT...we have the url...which means it should be easy to "not legally" scrape
                reviews[business_name] = yelp_api.reviews_query(biz_id)

                # temporary data frame we can use that will be appended to a master one later
                temp_df = pd.DataFrame.from_dict(
                    reviews[business_name]['reviews'])
                temp_df = temp_df.drop('user', 1)

                # add column for ISP provider
                temp_df.insert(0, 'ISP_name', business_name)

                # add column for business id
                temp_df.insert(1, 'business_id', biz_id)

                # add column for business location
                temp_df.insert(
                    6, 'location',
                    str(business_df2[business_df2['id'] == biz_id]
                        ['location'].item()['display_address']))

                temp_df = temp_df.rename(columns={"id": "rev_id"})

                reviews_df = reviews_df.append(temp_df, ignore_index=True)

            all_business_df = all_business_df.append(business_df2,
                                                     ignore_index=True)

            all_reviews_df = all_reviews_df.append(reviews_df,
                                                   ignore_index=True)

            # Save data
            # if no previous files, just save the data. if previous files, append
            # NOTE(review): `== None` should be `is None` (PEP 8).
            if business_data == None and business_reviews == None:
                all_business_df.to_csv(data_dir + 'businesses.csv',
                                       index=False)
                all_reviews_df.to_csv(data_dir + 'businesses_reviews.csv',
                                      index=False)
            else:
                # append data to previous loaded files
                prev_business_df = pd.read_csv(data_dir + business_data)
                prev_reviews_df = pd.read_csv(data_dir + business_reviews)

                new_business_df = prev_business_df.append(all_business_df,
                                                          ignore_index=True)
                new_reviews_df = prev_reviews_df.append(all_reviews_df,
                                                        ignore_index=True)

                new_business_df.to_csv(data_dir + 'businesses.csv',
                                       index=False)
                new_reviews_df.to_csv(data_dir + 'businesses_reviews.csv',
                                      index=False)

            # Save previous cities to ensure that we aren't looking at cities previously searched
            pickle.dump(prev_cities + city_names,
                        open(data_dir + "cities_list.txt", "wb"))

        except YelpAPI.YelpAPIError as yelp_error:
            print(str(yelp_error) + '\n')
            # Stop entirely once the daily API quota is exhausted.
            if 'ACCESS_LIMIT_REACHED' in str(yelp_error):
                break
            else:
                yelp_error = sys.exc_info()[0]
                print(iCity, yelp_error)
                failed_searches.append(iCity + '\n')
                continue
        # NOTE(review): bare except hides real bugs (even KeyboardInterrupt);
        # prefer `except Exception`.
        except:
            e = sys.exc_info()[0]
            print(iCity, e)
            failed_searches.append(iCity + '\n')
            continue

    # saved new failed searches
    if n_failed_searches < len(failed_searches):
        pickle.dump(failed_searches,
                    open(data_dir + "failed_searches.txt", "wb"))
search_results = yelp_api.search_query(term='Pizza',
                                       location='Halifax',
                                       sort_by='rating',
                                       limit=50)

# Collect the id and rating of every business returned by the search.
businessIds = [result["id"] for result in search_results["businesses"]]
businessRating = [result["rating"] for result in search_results["businesses"]]

for businessId in businessIds:
    businessPhotos = yelp_api.business_query(id=businessId)["photos"]
    businessReviews = yelp_api.reviews_query(id=businessId)["reviews"]

    # FIX: os.mkdir crashed with FileExistsError on re-runs; makedirs with
    # exist_ok tolerates an already-present folder.
    os.makedirs(businessId, exist_ok=True)

    # Download each photo as <business_id>/<n>.jpg.
    for i, photo in enumerate(businessPhotos):
        savefile = "{}/{}.jpg".format(businessId, i)
        urllib.request.urlretrieve(photo, savefile)

    # Save each review's text as <business_id>/<n>.txt.
    # BUG FIX: the original opened the file and never closed it, leaking the
    # handle (and potentially losing buffered writes); use a with-block.
    for i, review in enumerate(businessReviews):
        savefile = "{}/{}.txt".format(businessId, i)
        with open(savefile, "w") as fsavefile:
            fsavefile.write(review["text"])