Example #1
0
def process(id):
    """Fetch the Yelp reviews for a business and return its aggregate sentiment.

    param: id = Yelp business id passed to the Reviews API.
    return: str() of whatever sentiment.getSentiment yields for the review texts.
    """
    # SECURITY(review): API key hardcoded in source — revoke this key and load
    # it from an environment variable or config file instead.
    yp = YelpAPI(
        'zmd9y3Q30Zj7Ekoh8sokT1bmzw4hWXNfzpjbnjSV5GXhX6v6gKslsx7T645Dm4rBMCv-x5ZKAM_0l7-FlFJS76ev43IWXnDcwyoOwIRVZh2SGyLne_jzL3-LHAbGXHYx'
    )
    response = yp.reviews_query(id)
    # Collect just the review text bodies for sentiment scoring
    # (comprehension replaces the manual append loop).
    textArray = [item['text'] for item in response['reviews']]
    return str(sentiment.getSentiment(textArray))
Example #2
0
def yelpJSON(label):
    """Look up a Yelp business by id and return its details plus its reviews
    as a JSON response, with a CORS header for the configured front-end domain.
    """
    encoded = label.encode('utf-8')
    api = YelpAPI(application.config['YELP_API_KEY'])
    payload = api.business_query(encoded)
    payload['review_list'] = api.reviews_query(encoded)
    response = jsonify(payload)
    response.headers.add('Access-Control-Allow-Origin',
                         application.config['ALLOWED_DOMAIN'])
    return response
def getYelpData(phoneNums):
    """
  This function takes in a list of phone numbers and looks them up in Yelp
  Fusion using yelpapi. It then returns 2 dictionaries of the business'
  information and three pertinent user reviews of the business.

  param: phoneNums = List of phone numbers
  return: phoneDict = Dictionary mapping a phone number to a list.
                      list[0] = the business' rating
                      list[1] = the business' review count
                      list[2] = the business' url
  return: reviewDict = Dictionary mapping a phone number to a list of reviews
  (up to 3 reviews available, chosen by Yelp algorithm); each review is
                      [rating, text, time of creation (IN PACIFIC TIME)]
  """

    yelp_api = YelpAPI(APIKEY)

    phoneDict = dict()
    reviewDict = dict()

    for phoneNum in phoneNums:
        response = yelp_api.phone_search_query(phone=str(phoneNum))

        # If the phone number is listed in Yelp, store rating, review_count, url in
        # array mapped to phone number. Then, use the business-id field from
        # response to store up to 3 reviews in a separate list.
        if response['total'] != 0:
            business = response['businesses'][0]
            phoneDict[phoneNum] = [
                business['rating'], business['review_count'], business['url']
            ]

            # Get reviews using company id and store in reviewDict
            companyID = str(business['id'])
            reviewResponse = yelp_api.reviews_query(id=companyID)

            # BUG FIX: the old loop reassigned reviewDict[phoneNum] on every
            # iteration, so only the LAST review survived; keep all of them
            # as a list of [rating, text, time_created] triples.
            reviewDict[phoneNum] = [
                [review['rating'], review['text'], review['time_created']]
                for review in reviewResponse['reviews']
            ]

    return phoneDict, reviewDict
Example #4
0
def getYelpData(phoneNums):
    """
  param: phoneNums = Numpy array of phone numbers
  return: phoneDict = phone number -> [rating, review_count, url]
  return: reviewDict = phone number -> list of up to 3 reviews, each review
                       being [rating, text, time_created]
  """

    yelp_api = YelpAPI(APIKEY)

    phoneDict = dict()
    reviewDict = dict()

    for phoneNum in phoneNums:
        response = yelp_api.phone_search_query(phone=str(phoneNum))

        # If the phone number is listed in Yelp, store rating, review_count, url in
        # array mapped to phone number. Then, use the business-id field from
        # response to store up to 3 reviews in a separate list.
        if response['total'] != 0:
            business = response['businesses'][0]
            phoneDict[phoneNum] = [
                business['rating'], business['review_count'], business['url']
            ]

            # Get reviews using company id and store in reviewDict
            companyID = str(business['id'])
            reviewResponse = yelp_api.reviews_query(id=companyID)

            # BUG FIX: the old loop reassigned reviewDict[phoneNum] on every
            # iteration, keeping only the LAST review; keep all of them.
            reviewDict[phoneNum] = [
                [review['rating'], review['text'], review['time_created']]
                for review in reviewResponse['reviews']
            ]

    return phoneDict, reviewDict
Example #5
0
def get_restaurants_and_reviews_from_yelp(
        type,
        location):  #0=restaurants, 1=reviews ##change to just get_restaurants?
    """Search Yelp for up to 5 businesses and gather their reviews.

    param: type = search term (note: parameter name shadows the builtin, kept
                  for interface compatibility)
    param: location = location string for the Yelp Business Search API
    return: (restaurants, reviews) — the raw business dicts and a flat list of
            every review Yelp returns for them (up to 3 per business).
    """
    yelp_api = YelpAPI(MY_API_KEY)
    rests = yelp_api.search_query(term=type, location=location, limit=5)
    restaurants = rests['businesses']

    # Comprehension replaces the manual append loop; the local is named
    # business_id to avoid shadowing the builtin id().
    restaurant_ids = [business['id'] for business in restaurants]

    reviews = []
    for business_id in restaurant_ids:
        # reviews_query returns {'reviews': [...], 'total': N, ...}
        reviews.extend(yelp_api.reviews_query(id=business_id)['reviews'])

    return restaurants, reviews
Example #6
0
"""
# Business details lookup for a known Yelp business id.
print("***** business information for Amy's on 6th St. *****\n{}\n".format(
    "yelp_api.business_query(id='amys-ice-creams-austin-3')"))
response = yelp_api.business_query(id='amys-ice-creams-austin-3')
pprint(response)
print(
    '\n-------------------------------------------------------------------------\n'
)
"""
    Example reviews query.
    
    Reviews API: https://www.yelp.com/developers/documentation/v3/business_reviews
"""
# Up to 3 Yelp-selected reviews for the same business.
print("***** selected reviews for Amy's on 6th St. *****\n{}\n".format(
    "yelp_api.reviews_query(id='amys-ice-creams-austin-3')"))
response = yelp_api.reviews_query(id='amys-ice-creams-austin-3')
pprint(response)
print(
    '\n-------------------------------------------------------------------------\n'
)
"""
    Example autocomplete query.
    
    Autocomplete API: https://www.yelp.com/developers/documentation/v3/autocomplete
    centroid: https://www.flickr.com/places/info/2427422
"""
print("***** autocomplete results for 'Hambur' in Iowa City *****\n{}\n".format(
    "yelp_api.autocomplete_query(text='Hambur', longitude=-91.5327, latitude=41.6560)"
))
response = yelp_api.autocomplete_query(text='Hambur',
                                       longitude=-91.5327,
Example #7
0
from yelpapi import YelpAPI
from pprint import pprint

# yelpapi requires api key to join Developer Beta.
## ref: https://www.yelp.com/developers/v3/manage_app

# NOTE(review): placeholder value — substitute a real Yelp Fusion API key.
app_secret = 'app_secret' #api key
yelp_api = YelpAPI(app_secret)

"""
    Example reviews query.
    
    Reviews API: https://www.yelp.com/developers/documentation/v3/business_reviews
"""
print("***** selected reviews for Universal Property and Casualty Insurance Company in Fort Lauderdale. *****\n{}\n".format("yelp_api.reviews_query(id='universal-property-and-casualty-insurance-company-fort-lauderdale')"))
# YelpAPI.YelpAPIError is raised for any error response from the API
# (bad business id, auth failure, quota exceeded) — print it instead of crashing.
try:
    response = yelp_api.reviews_query(id='universal-property-and-casualty-insurance-company-fort-lauderdale')
    pprint(response)
except YelpAPI.YelpAPIError as e:
    print(e)
print('\n-------------------------------------------------------------------------\n')
Example #8
0
"""
# Business details lookup for a known Yelp business id.
print("***** business information for Amy's on 6th St. *****\n{}\n".format("yelp_api.business_query(id='amys-ice-"
                                                                           "creams-austin-3')"))
response = yelp_api.business_query(id='amys-ice-creams-austin-3')
pprint(response)
print('\n-------------------------------------------------------------------------\n')


"""
    Example reviews query.
    
    Reviews API: https://www.yelp.com/developers/documentation/v3/business_reviews
"""
# Up to 3 Yelp-selected reviews for the same business.
print("***** selected reviews for Amy's on 6th St. *****\n{}\n".format("yelp_api.reviews_query(id='amys-ice-"
                                                                       "creams-austin-3')"))
response = yelp_api.reviews_query(id='amys-ice-creams-austin-3')
pprint(response)
print('\n-------------------------------------------------------------------------\n')


"""
    Example autocomplete query.
    
    Autocomplete API: https://www.yelp.com/developers/documentation/v3/autocomplete
    centroid: https://www.flickr.com/places/info/2427422
"""
print("***** autocomplete results for 'Hambur' in Iowa City *****\n{}\n".format("yelp_api.autocomplete_query("
                                                                                "text='Hambur', longitude=-91.5327, "
                                                                                "latitude=41.6560)"))
response = yelp_api.autocomplete_query(text='Hambur', longitude=-91.5327, latitude=41.6560)
pprint(response)
Example #9
0
from pprint import pprint

# In[6]:

# SECURITY(review): API key hardcoded in source — revoke it and load from an
# environment variable or config file instead.
yelp_api = YelpAPI(
    "6JFAZOLb4tCd1IbWWsL6fGph_KpZQW4z5QRmrIXR0H9X23d1jDxnORB0uYrAgGSHVhtCeqjj1W-VHRGEr0zqjPKwtbcglOyiOZQ3yCgQhI7N6tYDLcOal4DqI1snW3Yx"
)

# In[66]:

# NOTE(review): the Business Search API documents `location` as a string;
# passing a list here looks unintended — confirm against the yelpapi docs.
search_results = yelp_api.search_query(term="Filipino",
                                       location=["Renton, Wa", "seattle, wa"])

# In[23]:

# Up to 3 Yelp-selected reviews for a specific business id.
response = yelp_api.reviews_query(id='OqrtfhUcN_El1ClubBAVPQ')

# In[46]:

# Alias (URL slug) of the first business in the search results.
pprint(search_results.get('businesses')[0]['alias'])

# In[83]:

storelist = []
locationlist = [
    "Renton, Wa", "Seattle, WA", "Tacoma, WA", "Everett, WA", "Federal Way, WA"
]
for location in locationlist:
    search_results = yelp_api.search_query(term="Filipino", location=location)
    for i in range(len(search_results.get('businesses'))):
        storelist.append(
Example #10
0
class YelpData(object):
    """
    Handles the calls for Yelp data and archives every raw response.
    Business API - business_query()
    Business Match API - business_match_query()
    Reviews API - reviews_query()

    Each response is stamped with the requesting user's id, inserted into the
    matching MongoDB collection, and returned with the Mongo bookkeeping keys
    stripped out.
    """
    def __init__(self):
        # MongoDB collections used to archive the raw API responses.
        self.business_match = mongodb.db.business_match
        self.business_details = mongodb.db.business_details
        self.business_reviews = mongodb.db.business_reviews
        self.yelp_req = mongodb.db.yelp_request
        self.yelp_api = YelpAPI(app.config['YELP_API_KEY'])
        self.response = None

    @staticmethod
    def _remove_keys(json_data):
        """Strip the bookkeeping keys (user_id, Mongo _id) before returning a
        stored response to the caller. Mutates and returns json_data."""
        del json_data['user_id']
        del json_data['_id']
        return json_data

    def get_business_match_data(self,
                                user_id=None,
                                name=None,
                                address1='',
                                address2=None,
                                address3=None,
                                city=None,
                                state=None,
                                country=None,
                                latitude=None,
                                longitude=None,
                                phone=None,
                                zip_code=None,
                                yelp_business_id=None,
                                limit=1,
                                match_threshold='default'):
        """
                    Link: https://www.yelp.com/developers/documentation/v3/business_match
                    required parameters:
                        * name - business name
                        * city
                        * state
                        * country
        """
        self.response = self.yelp_api.business_match_query(
            name=name,
            address1=address1,
            address2=address2,
            address3=address3,
            city=city,
            state=state,
            country=country,
            latitude=latitude,
            longitude=longitude,
            phone=phone,
            zip_code=zip_code,
            yelp_business_id=yelp_business_id,
            limit=limit,
            match_threshold=match_threshold)
        self.response['user_id'] = user_id
        self.business_match.insert_one(self.response)
        self.response = self._remove_keys(self.response)
        return self.response

    def get_business_details(self, business_id, user_id):
        """Fetch business details for business_id, archive them, return them."""
        self.response = self.yelp_api.business_query(id=business_id)
        self.response['user_id'] = user_id
        self.business_details.insert_one(self.response)
        self.response = self._remove_keys(self.response)
        return self.response

    def get_business_reviews(self, business_id, user_id):
        """Fetch reviews for business_id, archive them, and return them."""
        # BUG FIX: was self.business_id, which is never assigned anywhere in
        # this class and raised AttributeError; use the parameter instead.
        self.response = self.yelp_api.reviews_query(id=business_id)
        self.response['user_id'] = user_id
        self.business_reviews.insert_one(self.response)
        self.response = self._remove_keys(self.response)
        return self.response

    def yelp_request(self, yelp_request, user_id):
        """Log a raw Yelp request document (timestamped, tagged with user_id)
        and return whether the insert was acknowledged."""
        yelp_request['req_datetime'] = datetime.datetime.now()
        yelp_request['user_id'] = user_id
        return self.yelp_req.insert_one(yelp_request).acknowledged
Example #11
0
        # we shouldn't need them
        unecessary_cols = ['phone', 'display_phone', 'transactions', 'is_closed','image_url']


        # NOTE(review): positional axis (`.drop(cols, 1)`) was deprecated in
        # pandas 1.0 and removed in 2.0 — prefer drop(columns=unecessary_cols).
        business_df2 = business_df.drop(unecessary_cols,1)

        #loop through businesses
        reviews = dict()

        reviews_df = pd.DataFrame()
        for iBiz, biz_id in enumerate(business_df2.loc[:,'id']):
            business_name = business_df2['name'][iBiz]

            #can only get 3 reviews through yelp api
            #BUT...we have the url...which means it should be easy to "not legally" scrape
            reviews[business_name] = yelp_api.reviews_query(biz_id)

            # temporary data frame we can use that will be appended to a master one later
            temp_df = pd.DataFrame.from_dict(reviews[business_name]['reviews'])

            # drop the nested reviewer column; only the review fields are kept
            temp_df = temp_df.drop('user',1)

            # add column for ISP provider
            temp_df.insert(0,'ISP_name',business_name)

            # add column for business id
            temp_df.insert(1,'business_id',biz_id)

            # add column for business location
            temp_df.insert(6,'location',
                           str(business_df2[business_df2['id'] == biz_id]['location'].item()['display_address']))
Example #12
0
import random

from yelpapi import YelpAPI

from misty.utils import print_and_say

# BUG FIX: `random` was used below (random.shuffle) but never imported in
# this snippet, raising NameError.
# SECURITY(review): API key committed to source — revoke and load from config.
YELP_API_KEY = 'h81ylaT0alwtJCUUyI7RazCCHNHleVGnhD9ZONPT1s4kL9v5qhCXPZrcI20H4LYisDEjJZu_j4ibEsSTpM2ISDpWBeraK3t42rwV_PhxtYvmatDn2xquIUKdueYtYHYx'  # plz no steal my api keyz!
client = YelpAPI(YELP_API_KEY)

biz_ids = [
    'pierce-j-r-plumbing-co-inc-of-sacramento-rocklin',
    'ncm-roseville',
]

# Announce the businesses in a random order each run.
random.shuffle(biz_ids)

for biz_id in biz_ids:
    result = client.business_query(id=biz_id)

    print_and_say(
        f"{result['name']}. Phone number: {result['display_phone']}. Address: {''.join(result['location']['display_address'])}",
        next_voice=True)

    # Yelp's Reviews API exposes at most 3 reviews per business.
    reviews = client.reviews_query(id=result['id'])

    print_and_say(
        f"Retrieved {len(reviews['reviews'])} of {reviews['total']} reviews.",
        next_voice=True)
    for review in reviews['reviews']:
        print_and_say(
            f"On {review['time_created']} {review['user']['name']} gave a rating of {review['rating']} stars, stating: {review['text']}.",
            next_voice=True)
class scrappers:
    # Default location for raw scraped data files.
    data_path = "././data/raw"

    def __init__(self):
        """Build the Twitter (standard + premium) and Yelp API clients from
        the stored credential files."""
        __dir_path = os.path.dirname(os.path.realpath(__file__))
        credentials = get_credidentials()
        # Premium search credentials come from a YAML file next to this module.
        self.twitter_premium_api = load_credentials(
            filename="{}/{}".format(__dir_path, "twitter_keys.yaml"),
            yaml_key="search_tweets_api_30day")
        self.twitter_api = Twitter(auth=OAuth(
            consumer_key=credentials['twitter']['consumer_key'],
            consumer_secret=credentials['twitter']['consumer_secret'],
            token=credentials['twitter']['access_token_key'],
            token_secret=credentials['twitter']['access_token_secret']))
        self.yelp_api = YelpAPI(credentials['yelp']['api_key'])
        # NOTE(review): differs from the class-level data_path ("././data/raw")
        # and is never read elsewhere in this class — confirm which is intended.
        self.__data_path = "../data/raw"
        logger.info("initiation started.")

    def tw_verify_credentials(self):
        """Print the authenticated Twitter account's profile as pretty JSON."""
        profile = self.twitter_api.VerifyCredentials()
        print(json.dumps(profile._json, indent=4, sort_keys=True))

    def tw_get_statuses(self, user_list):
        """Dump each user's timeline to datasets/tw_<user>_statuses.json,
        paging backwards via max_id until only the anchor tweet remains."""
        for username in user_list:
            with open(f'datasets/tw_{username}_statuses.json', 'w') as f:
                try:
                    f.write('{"statuses": [')
                    max_id = 0
                    while (True):
                        # status scheme available at: https://developer.twitter.com/en/docs/tweets/timelines/api-reference/get-statuses-user_timeline.html
                        statuses = self.twitter_api.GetUserTimeline(
                            screen_name=username, count=100, max_id=max_id)

                        # Only the anchor tweet left: timeline exhausted.
                        if len(statuses) == 1 and statuses[0].id == max_id:
                            break
                        else:
                            for status in statuses:
                                if status.id != max_id:
                                    f.write("%s," % json.dumps(status._json))

                            max_id = statuses[-1].id
                finally:
                    # Rewind over the trailing comma (only if anything was
                    # written) before closing the JSON array.
                    max_id != 0 and f.seek(f.tell() - 1, os.SEEK_SET)
                    f.write("]}")

    def tw_get_search(self, user_list):
        """Search tweets mentioning each user (by display name, handle,
        #hashtag, @mention, plus caller-supplied keywords) and dump them to
        datasets/tw_<user>_searches.json, paging backwards via max_id."""
        for user_name, keyword_list in user_list.items():
            with open(f'datasets/tw_{user_name}_searches.json', 'w') as f:
                try:
                    f.write('{"statuses": [')
                    max_id = 0
                    user = self.twitter_api.GetUser(screen_name=user_name)
                    # Build the OR-joined search term from all name variants.
                    keyword_list.append(f'{user.name}')
                    keyword_list.append(f'{user_name}')
                    keyword_list.append(f'#{user_name}')
                    keyword_list.append(f'@{user_name}')
                    term = '{}'.format(' OR '.join(keyword_list))
                    while (True):
                        # status scheme available at: https://developer.twitter.com/en/docs/tweets/search/api-reference/get-search-tweets.html
                        statuses = self.twitter_api.GetSearch(
                            term=term.encode('utf-8'),
                            geocode=None,
                            count=100,
                            max_id=max_id)

                        # Stop when only the anchor tweet (or nothing) comes back.
                        if (len(statuses) == 1 and statuses[0].id
                                == max_id) or statuses == []:
                            break
                        else:
                            for status in statuses:
                                if status.id != max_id:
                                    """status_text = json.dumps(status._json)
                                    status_json = json.loads(status_text)
                                    status_json['keyword'] = keyword"""
                                    f.write("%s," % json.dumps(status._json))
                            max_id = statuses[-1].id
                finally:
                    # Rewind over the trailing comma before closing the array.
                    max_id != 0 and f.seek(f.tell() - 1, os.SEEK_SET)
                    f.write("]}")

    def tw_get_premium_search(self, keyword: str):
        """Stream tweets for a New York place rule from the premium 30-day
        search API and dump them to a per-keyword JSON file."""
        with open(f'datasets/tw_{keyword.lower()}_searches_premium.json',
                  'w') as f:
            try:
                f.write('{"statuses": [')

                # NOTE(review): this first rule is immediately overwritten by
                # the assignment below and never used — dead code; also
                # `keyword` itself is not part of either rule — confirm intent.
                rule = gen_rule_payload(
                    pt_rule="near:\"New York, NY\" within:50mi".format(),
                    results_per_call=100,
                    from_date="2018-07-01",
                    to_date="2018-10-01")

                # Effective rule: tweets placed in New York over the last 31 days.
                rule = gen_rule_payload(
                    pt_rule="place:\"New York, NY\"".format(),
                    results_per_call=100,
                    from_date=(datetime.date.today() -
                               datetime.timedelta(31)).isoformat(),
                    to_date=datetime.date.today().isoformat())

                next_token = None
                while True:
                    # Resume from the previous page's token each iteration.
                    results = ResultStream(rule_payload=rule,
                                           **self.twitter_premium_api)
                    results.next_token = next_token

                    tweets = []

                    try:
                        tweets = list(results.stream())
                    except Exception as ex:
                        print(str(ex))

                    for tweet in tweets:
                        f.write("%s," % json.dumps(tweet))

                    if results.next_token is None:
                        break
                    else:
                        next_token = results.next_token

                # Rewind over the trailing comma before closing the array.
                next_token is not None and f.seek(f.tell() - 1, os.SEEK_SET)
                f.write("]}")

            except Exception as ex:
                print("Error:\n" + str(ex))

    def yp_get_businesses(self, business_list):
        """
        For each business alias, look up its coordinates and dump every nearby
        business (40 km radius, 50 per page) into a per-business JSON file.
        File Type: JSON
        """
        for business in business_list:
            with open(f'{self.data_path}/yp_{business}_competitors.json',
                      'w') as f:
                try:
                    f.write('{"businesses": [')
                    branch = self.yelp_api.business_query(business)
                    # NOTE(review): if business_query raises, `offset` is
                    # unbound and the finally clause below raises NameError
                    # masking the original error — confirm intended.
                    offset = 0
                    while (True):
                        try:
                            # status scheme available at: # https://www.yelp.com/developers/documentation/v3/business_search
                            competitors = self.yelp_api.search_query(
                                longitude=branch['coordinates']['longitude'],
                                latitude=branch['coordinates']['latitude'],
                                radius=40000,
                                # categories='bars,french'
                                sort_by='distance',
                                limit=50,
                                offset=offset)

                            f.write("%s," %
                                    json.dumps(competitors['businesses']))
                            offset = offset + 50
                        # Paging past the end makes the API error out: done.
                        except self.yelp_api.YelpAPIError:
                            break
                finally:
                    # Rewind over the trailing comma before closing the array.
                    offset != 0 and f.seek(f.tell() - 1, os.SEEK_SET)
                    f.write("]}")

    def yp_get_competitors(self, business_list):
        """
        Gets business list in consideration to the existing business list file. Adds any additional business, if it is not recorded yet.
        """
        file_path = fp.yp_raw_competitors(self.data_path)
        index_list = []
        existing_list = []
        # Disabled resume logic: would reload previously saved competitors so
        # duplicates are skipped across runs.
        """
        if os.path.exists(file_path):
            with open(file_path, 'r') as f:
                current_file = f.readlines()
                if len(current_file) > 0:
                    existing_list = json.loads(current_file[0])
                    index_list = [_business["alias"] for _business in existing_list]
                    logger.info(f"existing file found: {len(index_list)} total entries")
        """
        with open(file_path, 'w') as f:
            # find businesses
            for business in business_list:
                new_list = []

                try:
                    logger.info(f"import started for : {business}")
                    branch = self.yelp_api.business_query(business)
                    offset = 0
                    while (True):
                        try:
                            # status scheme available at: # https://www.yelp.com/developers/documentation/v3/business_search
                            competitors = self.yelp_api.search_query(
                                longitude=branch['coordinates']['longitude'],
                                latitude=branch['coordinates']['latitude'],
                                radius=40000,
                                # categories='bars,french'
                                sort_by='distance',
                                limit=50,
                                offset=offset)

                            # add alias name for distance measurement as dist_to_alias
                            businesses = competitors["businesses"]
                            [
                                i.update({"dist_to_alias": business})
                                for i in businesses
                            ]

                            # Deduplicate by alias across all pages/branches.
                            for i in businesses:
                                if i['alias'] not in index_list:
                                    new_list.append(i)
                                    index_list.append(i['alias'])

                            offset = offset + 50
                        # Paging past the end makes the API error out: done.
                        except self.yelp_api.YelpAPIError:
                            break

                finally:
                    existing_list.extend(new_list)
                    logger.info(
                        f"import completed. existing: {len(existing_list)} new: {len(new_list)}"
                    )

            # saving into file
            json.dump(existing_list, f)

    def yp_get_business_reviews(self, business_list):
        """
        Gets three reviews from the yelp api.
        """
        # NOTE(review): Yelp's Reviews API returns at most 3 reviews and its
        # docs list no `offset` parameter — confirm this loop can ever
        # advance past the first batch.
        for business in business_list:
            with open(f'{self.data_path}/yp_{business}_rws.json', 'w') as f:
                try:
                    f.write('{"reviews": [')
                    offset = 0
                    while (True):
                        reviews_set = self.yelp_api.reviews_query(
                            business, limit=5, offset=offset)
                        reviews = reviews_set['reviews']
                        if len(reviews) > 0:
                            # NOTE(review): writes the dict's repr, not JSON —
                            # the file is not valid JSON; presumably
                            # json.dumps(review) was intended.
                            for review in reviews:
                                f.write("%s,\n" % review)

                            offset = offset + 5
                        else:
                            break
                finally:
                    # NOTE(review): rewinds one char, but each entry ends with
                    # ",\n" (two chars) — a trailing comma remains; confirm.
                    offset != 0 and f.seek(f.tell() - 1, os.SEEK_SET)
                    f.write("]}")

    def yp_get_competitor_reviews(self,
                                  business_list=None,
                                  start_index=0,
                                  end_index=5):
        """
        Gets reviews by scraping through the site. Reviews are saved by business name and reviews. Uses Competitors reviews file as default file. Given index controls regions of Competitors. 
        business_list: None or List
        start_index: int, interested region's starting index
        end_index: int, interested region's ending index
        File Type: CSV
        """
        file_path = fp.yp_raw_competitors_reviews(self.data_path)
        # Browser-like User-Agent so yelp.com serves the normal HTML page.
        headers = {
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
        }
        columns = [
            'alias', 'ratingValue', 'dataPublished', 'description', 'author'
        ]
        df: pd.DataFrame
        # getting competitors list
        businesses_file_path = fp.yp_raw_competitors(self.data_path)
        businesses_index_list = []

        if os.path.exists(businesses_file_path):
            with open(businesses_file_path, 'r') as f:
                current_file = f.readlines()
                if len(current_file) > 0:
                    businesses_index_list = [
                        _business["alias"]
                        for _business in json.loads(current_file[0])
                    ]

        # Load previously scraped reviews so duplicates can be skipped.
        if os.path.exists(file_path):
            df = pd.read_csv(file_path)
            logger.info(
                f"existing file found. total reviews count: {len(df)}")
        else:
            # File doesn't exist yet: create it with the CSV header row.
            with open(file_path, 'w') as f:
                writer = csv.writer(f)
                writer.writerow(columns)
                logger.info("file created at: {}".format(file_path))
            # BUG FIX: df was never assigned on this path, so the duplicate
            # check below raised NameError on a fresh run.
            df = pd.DataFrame(columns=columns)

        # ops
        with open(file_path, 'a', newline='') as f:
            if business_list is None:
                business_list = businesses_index_list

            current_index = start_index - 1
            for business in business_list[start_index:end_index]:
                cnt_imported = 0
                current_index = current_index + 1
                logger.info(f"index: {current_index} of {end_index - 1}")
                try:
                    writer = csv.writer(f)
                    logger.info(f"import started for : {business}")
                    start = 0
                    cnt_requests = 0
                    while (True):
                        url = '{}/{}?sort_by=date_desc&start={}'.format(
                            'https://www.yelp.com/biz', business, start)
                        # BUG FIX: headers was passed positionally and thus
                        # became requests.get's `params` argument (appended to
                        # the query string); send it as real request headers.
                        response = requests.get(url, headers=headers)

                        # The page embeds its reviews in the last ld+json blob.
                        soup = BeautifulSoup(response.text, 'html.parser')
                        html_script = soup.findAll(
                            'script', {'type': 'application/ld+json'})[-1]
                        obj = json.loads(html_script.string)

                        reviews = obj['review']
                        if len(reviews) > 0:
                            for review in reviews:
                                data = [
                                    business,
                                    review['reviewRating']['ratingValue'],
                                    review['datePublished'],
                                    review['description'], review['author']
                                ]

                                # Only write rows not already present in the CSV.
                                check = np.array(data, dtype='O')
                                if not (df.values == check).all(1).any():
                                    writer.writerow(data)
                                    cnt_imported = cnt_imported + 1

                            # Yelp pages reviews 20 at a time.
                            start = start + 20
                            cnt_requests = cnt_requests + 1
                        else:
                            logger.info(
                                f"import completed. total reviews cnt: {cnt_imported} total request cnt: {cnt_requests}"
                            )
                            break
                except Exception as ex:
                    # Likely rate-limited: back off for a random interval and
                    # move on to the next business.
                    logger.warning(
                        f"error: alias: {business} index: {current_index} total reviews cnt: {cnt_imported}"
                    )
                    logger.warning(f"error message: {ex}")
                    logger.warning("Let me sleep for some time..")
                    second = int(round(random.expovariate(1) * 100))
                    time.sleep(second)
                    logger.warning(
                        f"{second} seconds slept, now back on scrapping..")
                    continue

    def yp_get_business_reviews2(self, business_list):
        """
        Gets reviews by scraping through the site.

        Writes each business's reviews to {data_path}/yp_<business>_rws.json,
        paging the public business page 20 reviews at a time.
        """
        # Browser-like User-Agent so yelp.com serves the normal HTML page.
        # Hoisted out of the loop: it does not depend on the business.
        headers = {
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
        }
        for business in business_list:
            with open(f'{self.data_path}/yp_{business}_rws.json', 'w') as f:
                try:
                    f.write('{"reviews": [')
                    start = 0
                    while (True):
                        url = '{}/{}?sort_by=date_desc&start={}'.format(
                            'https://www.yelp.com/biz', business, start)
                        # BUG FIX: headers was passed positionally and thus
                        # became requests.get's `params` argument (appended to
                        # the query string); send it as real request headers.
                        response = requests.get(url, headers=headers)

                        # The page embeds its reviews in an ld+json blob.
                        soup = BeautifulSoup(response.text, 'html.parser')
                        html_script = soup.find(
                            'script', {'type': 'application/ld+json'})
                        obj = json.loads(html_script.string)

                        reviews = obj['review']
                        if len(reviews) > 0:
                            for review in reviews:
                                data = {
                                    'ratingValue':
                                    review['reviewRating']['ratingValue'],
                                    'datePublished':
                                    review['datePublished'],
                                    'description':
                                    review['description'],
                                    'author':
                                    review['author']
                                }
                                f.write("%s," % json.dumps(data))
                            # Yelp pages reviews 20 at a time.
                            start = start + 20
                        else:
                            break
                finally:
                    # Rewind over the trailing comma before closing the array.
                    start != 0 and f.seek(f.tell() - 1, os.SEEK_SET)
                    f.write("]}")

            # NOTE(review): opens the businesses file for append but only
            # clears the in-memory `obj` (last parsed page) without writing
            # anything — looks unfinished; confirm intent.
            with open(f'datasets/yp_businesses.json', 'a') as f:
                obj['review'] = []
Example #14
0
from yelpapi import YelpAPI

# Yelp Fusion client.
# NOTE(review): the API key is committed in source — it should live in an
# environment variable or config file instead.
yelp_api = YelpAPI(
    '9pdRr_PCd8m5NcXB6bb6cdRmKgeXzz-TUrQfdn8lCZrdEVFqMa3YSygop0Mnp-xKUjTzBZv5VsGh3mW8bE_dM5d24Y7u0cDhKMNG40AqRUTYjc1PSlXaWA1ce9UsXXYx'
)

# Look up a handful of "olive garden" matches around San Jose.
search_params = {'term': 'olive garden', 'location': 'san jose', 'limit': '5'}
response = yelp_api.search_query(**search_params)

# Print the (up to 3) reviews Yelp exposes for one specific business id.
reviews = yelp_api.reviews_query(id="kyCAxIcS_axZB12EtHV1FA")
print(reviews)
Example #15
0
# Accumulators for the raw reviews_query payloads and flattened review texts.
business_results_final = []
business_id = []      # fix: was never initialised -> NameError on append below
review_result = []    # fix: was never initialised -> NameError on append below

# Yelp Fusion credentials come from the command line.
argparser = argparse.ArgumentParser(description='Example Yelp queries using yelpapi. Visit https://www.yelp.com/developers/v3/manage_app to get the necessary API keys.')
argparser.add_argument('--client_id', type=str, help='Yelp Fusion API client ID')
argparser.add_argument('--client_secret', type=str, help='Yelp Fusion API client secret')
args = argparser.parse_args()

yelp_api = YelpAPI(args.client_id, args.client_secret)

# Top 50 highest-rated Houston restaurants; save the raw response to disk.
response = yelp_api.search_query(term='restaurant', location='houston, tx', sort_by='rating', limit=50)
with open('yelp_{}_{}.json'.format('Rest1', 'HOU'), 'w') as f:
    json.dump(response, f, indent=5)
print('\n-------------------------------------------------------------------------\n')

# Collect every business id, then fetch its reviews (Yelp returns up to 3).
for i in range(0, len(response['businesses'])):
    business_id.append(response['businesses'][i]['id'])
for i in range(0, len(business_id)):
    business_results = yelp_api.reviews_query(id=business_id[i])
    business_results_final.append(business_results)

# Round-trip through JSON into a DataFrame, then flatten the review texts
# (newlines stripped, utf-8 repr) one per line into YelpReviews.txt.
dump1 = json.dumps(business_results_final, indent=5)
df = pd.read_json(dump1)
for item in df.values:
    for review in item[0]:
        review_result.append(repr(review['text'].replace('\n', '').encode('utf-8')))
with open("YelpReviews.txt", 'w') as out_file:
    for line in review_result:
        out_file.write(line + "\n")
print("DONE")

    # (Fragment — the enclosing function and the origin of `city_name_split`
    # and `yelp` are outside this chunk.)
    # Split an entry that presumably looks like "<key>:<name>,<country>" into
    # its components — TODO confirm against the caller.
    city_name_country = city_name_split[1].rsplit(",", 1)
    city_name = city_name_country[0]
    city_country = city_name_country[1]
    city_key = city_name_split[0]
    cityStr = city_name.split(",")[0]
    city_code = city_key
    print(city_code + " " + cityStr + " " + city_country + " ///" + city_name)

    # Ten best-match food businesses in the city, any price tier.
    response = yelp.search_query(term='food', location=city_name, price='1,2,3,4', sort_by='best_match', limit=10)
    data = json.dumps(response)
    restaurants_json = json.loads(data)
    city_dict = {}
    city_dict['businesses'] = []
    # Only keep businesses that expose a price tier; enrich each with its
    # top reviews and full business details.
    for p in restaurants_json['businesses']:
        if ('price' in p):
            review = yelp.reviews_query(id=p['id'], sort_by='rating', limit=3)
            details = yelp.business_query(id=p['id'])
            data = json.dumps(review)
            detaildata = json.dumps(details)
            reviews_json = json.loads(data)
            details_json = json.loads(detaildata)
            # Reduce each review to reviewer name, text, and star rating.
            reviews_dict = {}
            reviews_dict['reviews'] = []
            for q in reviews_json['reviews']:
                reviews_dict['reviews'].append({
                'user_name' : q['user']['name'],
                'text' : q['text'],
                'stars' : q['rating']
                })
            # NOTE(review): trailing comma makes this a 1-tuple, not a string
            # — likely unintended, but the rest of the statement is truncated
            # in this chunk.
            doc_dict = {}
            doc_dict['name'] = p['name'],
Example #17
0
def Yelp_ScrapeISP(api_key,
                   city_names,
                   business_data='businesses.csv',
                   business_reviews='businesses_reviews.csv'):
    """
    ====================================================================
    Version: 1.0.1
    Date: Tue 24 Nov 2020

    Purposes: Search and save yelp data about internet service providers
    within a region.

    Input:
        Required:
            api_key = Api key assigned by yelp fusion
            city_names = List of locations for internet service providers
        Optional:
            business_data = .csv file containing data from previous searches
            business_reviews = .csv file containing reviews of businesses from
                                previous searches

    Output:
        'businesses.csv' containing information about internet
            service providers

        'businesses_reviews.csv' containing reviews of internet service
            providers in the businesses.csv file

        'cities_list.txt' list of previous cities that have been searched.
            Data will only be extracted for cities that have not been
            previously searched.


    Example:
    ca_cities_df = pd.read_csv('cal_cities_lat_long.csv')
    ca_cities = ca_cities_df['Name'] + ', CA'

    api_key = XIXIXIXLXJO

    Yelp_ScrapeISP(api_key,ca_cities)


    Author: Jordan Garrett
    [email protected]
    ====================================================================
    """

    data_dir = os.path.join(os.getcwd(), 'Yelp_Data\\')

    # Cities searched on previous runs.
    # Fix: initialise unconditionally — it was only bound inside the branch
    # below, so the pickle.dump near the end raised NameError on a first run.
    prev_cities = []

    # Check to see if any cities in the list have previously been searched.
    if os.path.exists(data_dir + 'cities_list.txt'):

        try:
            with open(data_dir + 'cities_list.txt', 'rb') as fh:
                prev_cities = pickle.load(fh)
        except EOFError:
            prev_cities = []

        city_names = [city for city in city_names if city not in prev_cities]

        if city_names:
            print(f'Searching Cities: {city_names}')
        else:
            print('All cities have already been searched')
            return

    yelp_api = YelpAPI(api_key)

    # add in pauses to prevent stop errors from too much scraping
    time.sleep(3)

    all_business_df = pd.DataFrame()
    all_reviews_df = pd.DataFrame()

    # Fix: a missing file raised an uncaught FileNotFoundError — only an
    # empty file (EOFError) was handled.
    try:
        with open(data_dir + 'failed_searches.txt', 'rb') as fh:
            failed_searches = pickle.load(fh)
    except (FileNotFoundError, EOFError):
        failed_searches = []

    n_failed_searches = len(failed_searches)

    for iCity in city_names:

        try:
            # we can play around with the limit and offset parameters
            # to control the number of results and what item to start the pull on
            search_results = yelp_api.search_query(
                term='Internet Service Providers', location=iCity, limit=50)

            time.sleep(3)

            business_df = pd.DataFrame.from_dict(search_results['businesses'])

            # some regions may return empty results
            if business_df.empty:
                print(f'No data from: {iCity}')
                continue

            # drop the phone, display_phone, transactions, is_closed, and
            # image_url columns; we shouldn't need them
            unecessary_cols = [
                'phone', 'display_phone', 'transactions', 'is_closed',
                'image_url'
            ]

            business_df2 = business_df.drop(columns=unecessary_cols)

            # loop through businesses
            reviews = dict()

            reviews_df = pd.DataFrame()
            for iBiz, biz_id in enumerate(business_df2.loc[:, 'id']):
                business_name = business_df2['name'][iBiz]

                # can only get 3 reviews through yelp api
                reviews[business_name] = yelp_api.reviews_query(biz_id)

                # temporary frame appended to the per-city master below
                temp_df = pd.DataFrame.from_dict(
                    reviews[business_name]['reviews'])

                temp_df = temp_df.drop(columns='user')

                # add columns for ISP provider, business id, and location
                temp_df.insert(0, 'ISP_name', business_name)
                temp_df.insert(1, 'business_id', biz_id)
                temp_df.insert(
                    6, 'location',
                    str(business_df2[business_df2['id'] == biz_id]
                        ['location'].item()['display_address']))

                temp_df = temp_df.rename(columns={"id": "rev_id"})

                reviews_df = pd.concat([reviews_df, temp_df],
                                       ignore_index=True)

            # Fix: accumulate once per city. Previously both frames were
            # appended inside the per-business loop, duplicating every
            # business row once per business and the (cumulative) reviews_df
            # once per iteration.
            all_business_df = pd.concat([all_business_df, business_df2],
                                        ignore_index=True)
            all_reviews_df = pd.concat([all_reviews_df, reviews_df],
                                       ignore_index=True)

            # Save data
            # if no previous files, just save the data. if previous files, append
            if business_data is None and business_reviews is None:
                all_business_df.to_csv(data_dir + 'businesses.csv',
                                       index=False)
                all_reviews_df.to_csv(data_dir + 'businesses_reviews.csv',
                                      index=False)

            else:  # append data to previously loaded files

                prev_business_df = pd.read_csv(data_dir + business_data)
                prev_reviews_df = pd.read_csv(data_dir + business_reviews)

                new_business_df = pd.concat([prev_business_df,
                                             all_business_df],
                                            ignore_index=True)
                new_reviews_df = pd.concat([prev_reviews_df, all_reviews_df],
                                           ignore_index=True)

                new_business_df.to_csv(data_dir + 'businesses.csv',
                                       index=False)
                new_reviews_df.to_csv(data_dir + 'businesses_reviews.csv',
                                      index=False)

            # Save previous cities to ensure that we aren't looking at cities
            # previously searched
            with open(data_dir + "cities_list.txt", "wb") as fh:
                pickle.dump(prev_cities + city_names, fh)

        except YelpAPI.YelpAPIError as yelp_error:

            print(str(yelp_error) + '\n')

            if 'ACCESS_LIMIT_REACHED' in str(yelp_error):
                break
            else:
                yelp_error = sys.exc_info()[0]
                print(iCity, yelp_error)

                failed_searches.append(iCity + '\n')
                continue

        except Exception:
            e = sys.exc_info()[0]
            print(iCity, e)

            failed_searches.append(iCity + '\n')
            continue

    # Save new failed searches.
    # Fix: this used to sit inside the loop after the except blocks, so the
    # `continue` on a failed city skipped it and failures were never persisted.
    if n_failed_searches < len(failed_searches):
        with open(data_dir + "failed_searches.txt", "wb") as fh:
            pickle.dump(failed_searches, fh)
Example #18
0
# (Fragment — `yelp_api` is constructed earlier in the file, and this example
# may continue past the end of this chunk.)
# Top 50 best-rated pizza businesses in Halifax.
search_results = yelp_api.search_query(term='Pizza',
                                       location='Halifax',
                                       sort_by='rating',
                                       limit=50)

businessIds = list()
businessRating = list()
#businessPhotos = list()
#businessReviews = list()

# Collect each business's id and star rating.
for result in search_results["businesses"]:
    businessIds.append(result["id"])
    businessRating.append(result["rating"])

# For every business: download its photos and dump its reviews, one numbered
# file each, into a directory named after the business id.
for businessId in businessIds:
    businessPhotos = yelp_api.business_query(id=businessId)["photos"]
    businessReviews = yelp_api.reviews_query(id=businessId)["reviews"]

    # NOTE(review): os.mkdir raises FileExistsError on a re-run — confirm
    # whether the directory is cleaned up elsewhere.
    os.mkdir(businessId)
    i = 0
    for photo in businessPhotos:
        savefile = str(businessId + "/" + str(i) + ".jpg")
        urllib.request.urlretrieve(photo, savefile)
        i = i + 1

    i = 0
    for review in businessReviews:
        savefile = str(businessId + "/" + str(i) + ".txt")
        # NOTE(review): fsavefile is never closed in the visible span — if
        # the snippet doesn't close it further down, wrap in `with`.
        fsavefile = open(savefile, "w")
        fsavefile.write(review["text"])
        i = i + 1