import ast

#Third party modules
from flask import Flask, render_template, request
import numpy as np
from pandas import DataFrame
import pandas
from sklearn import linear_model
from sklearn.externals import joblib
from pygeocoder import Geocoder

import helper
import sql_database
import predict_rest

db = sql_database.DbAccess('INSIGHT', usr='******')

app = Flask(__name__)
app.debug = True


@app.route("/")
def hello():
    return render_template('home.html')


@app.route("/maps")
def maps():
    restaurant = request.args.get("restaurant", "")
    miles = request.args.get("miles", "")
    zipcode = request.args.get("zipcode", "")
    return render_template('maps.html',
                           restaurant=restaurant,
                           miles=miles,
                           zipcode=zipcode)


@app.route("/restaurant")
#Standard modules
import re

#Third party modules
from bs4 import BeautifulSoup
import mysql.connector
from pymongo import MongoClient

#get_restaurant, get_address, clean_review and get_reviews are assumed to
#come from the project's local helper module
from helper import get_restaurant, get_address, clean_review, get_reviews
import sql_database

#Assumes a MongoDB instance on the default localhost port
client = MongoClient()


def main():
    db_mongo = client.yelp_database
    posts = db_mongo.posts
    attrs = [
        'Alcohol', 'HasTV', 'NoiseLevel', 'RestaurantsAttire',
        'BusinessAcceptsCreditCards', 'Ambience', 'RestaurantsGoodForGroups',
        'Caters', 'WiFi', 'RestaurantsReservations', 'RestaurantsTakeOut',
        'GoodForKids', 'WheelchairAccessible', 'RestaurantsTableService',
        'OutdoorSeating', 'RestaurantsPriceRange2', 'RestaurantsDelivery',
        'GoodForMeal', 'BusinessParking'
    ]
    db_sql = sql_database.DbAccess('INSIGHT', usr='******')

    #One-time table creation, kept for reference:
    #db_sql.cursor.execute('DROP TABLE IF EXISTS Restaurant;')
    #Columns = 'ID INTEGER, Name CHAR(80), Street CHAR(80), City CHAR(40), State CHAR(10), Zip CHAR(10), FullName CHAR(200) NOT NULL PRIMARY KEY, '
    #Columns += 'Phone CHAR(50), Site CHAR(100), PictureUrl CHAR(150), Rating FLOAT, Favorites CHAR(200)'
    #Columns += ', RestaurantType CHAR(200), Latitude FLOAT, Longitude FLOAT, SimilarRest1 CHAR(100), SimilarRest2 CHAR(100), SimilarRest3 CHAR(100), NReviews INT, Review LONGTEXT'
    #for attr in attrs:
    #    Columns += ', ' + attr + ' CHAR(80)'
    #db_sql.cursor.execute('CREATE TABLE Restaurant (' + Columns + ');')

    count = 0
    rests_info = posts.find({"added_sql": False})
    for rest_info in rests_info:
        if count % 100 == 0:
            print count
        count += 1
        #Skip restaurants with too few reviews or no cached Yelp page
        n_reviews = rest_info['reviews']
        if n_reviews < 40:
            continue
        if 'yelp_page' not in rest_info:
            continue
        page = rest_info['yelp_page']
        soup = BeautifulSoup(page)
        restaurant = get_restaurant(soup)
        divs = soup.find_all('div')
        new_info = {}
        bizRating = [div for div in divs if div.get("id") == "bizRating"]
        #Fails if page essentially has no info
        if len(bizRating) == 0:
            continue
        new_info["Rating"] = bizRating[0].meta['content']

        #Get name and address
        h1s = soup.find_all("h1")
        name = h1s[0].contents[0].strip()
        #Just skip restaurants with chinese characters in their name because
        #I don't feel like dealing with the encoding right now
        if re.findall(ur'[\u4e00-\u9fff]+', name):
            continue
        spans = soup.find_all('span')
        telephone = get_address(spans, "telephone")
        street = get_address(spans, "streetAddress")
        city = get_address(spans, "addressLocality")
        state = get_address(spans, "addressRegion")
        zipcode = get_address(spans, "postalCode")
        full_name = name + ' ' + street + ' ' + city + ', ' + state + ' ' + zipcode
        picture_url = 'NULL'
        picture_div = [
            div for div in divs
            if div.get("class") and len(div.get("class")) > 1
            and div.get("class")[1] == "biz-photo-box"
        ]
        if len(picture_div) > 0:
            picture_url = picture_div[0].img['src']

        #Get ngrams from snippets
        review_snippets = [
            div for div in divs
            if div.get("class") and len(div.get("class")) > 1
            and div.get("class")[0] == "media-story"
            and div.get("class")[1] == "snippet"
        ]
        ngrams = []
        for tag in snippet_tags(review_snippets):
            if tag.get('ngram'):
                ngrams.append(tag.get('ngram'))
        new_info["Ngrams"] = ngrams

        #Grab all the header data on the restaurant page
        dds = soup.find_all('dd')
        for dd in dds:
            content = dd.contents[0]
            #Convert price to a scale from 1 to 4
            if dd['class'][0].find("RestaurantsPriceRange2") != -1:
                content = 4 - len(dd.span.span['data-remainder'])
            new_info[str(dd['class'][0])[5:]] = str(content)

        #Get the type of the restaurant
        bizInfo = [div for div in divs if div.get("id") == "bizInfoContent"]
        bizInfo_spans = bizInfo[0].find_all('span')
        category = [
            span for span in bizInfo_spans if span.get('id') == "cat_display"
        ][0]
        restaurant_type = [
            content.contents[0].lstrip() for content in category.contents
            if hasattr(content, 'contents')
        ]
        new_info['restaurant_type'] = restaurant_type
        #Get similar restaurants
        rec_bizs = [div for div in divs if div.get("id") == "bizSimilarBox"]
        if len(rec_bizs) == 0:
            continue
        rec_links = [
            rec_biz['href'] for rec_biz in rec_bizs[0].find_all('a')
            if str(rec_biz.get("id")).find("bizreclink") != -1
        ]
        if len(rec_links) < 3:
            continue

        #Get latitude and longitude from the static map image URL
        imgs = soup.find_all('img')
        map_urls = [
            img['src'] for img in imgs if img.get("alt") == "Map of Business"
        ]
        if len(map_urls) == 0:
            continue
        map_url = map_urls[0]
        lat = map_url[map_url.find('center') + 7:map_url.find("%2C")]
        lng = map_url[map_url.find('%2C') + 3:map_url.find("&language")]

        #Collect review text from the first page plus any cached extra pages
        review = clean_review(get_reviews(soup)).encode('ascii',
                                                        errors='ignore')
        for i in range(1, 5):
            if ('yelp_page' + str(i)) in rest_info:
                page = rest_info['yelp_page' + str(i)]
                soup = BeautifulSoup(page)
                review += clean_review(get_reviews(soup)).encode(
                    'ascii', errors='ignore')

        #Now insert all this information into SQL
        Values = 'INSERT INTO Restaurant (ID, Name, Street, City, State, Zip, FullName, Phone, Site, PictureUrl, Rating, Favorites, RestaurantType'
        Values += ', Latitude, Longitude, SimilarRest1, SimilarRest2, SimilarRest3, NReviews, Review'
        for attr in attrs:
            Values += ', ' + attr
        Values += ') VALUES (' + str(count) + ', "' + name + '", "' + street + '", "' + city + '", "' + state + '", "' + zipcode + '", "' + full_name + '", "'
        Values += telephone + '", "' + restaurant + '", "' + picture_url + '", '
        Values += bizRating[0].meta["content"] + ', "' + "---".join(
            ngrams) + '", "' + "---".join(restaurant_type) + '", '
        Values += lat.encode('utf-8') + ', ' + lng.encode('utf-8') + ', "'
        Values += rec_links[0].encode('utf-8') + '", "' + rec_links[1].encode(
            'utf-8') + '", "' + rec_links[2].encode('utf-8') + '", '
        Values += str(n_reviews) + ', "' + review
        for attr in attrs:
            if attr not in new_info:
                new_info[attr] = "NULL"
            Values += '", "' + new_info[attr]
        Values += '");'
        print restaurant

        #Add restaurant to db. If a restaurant with that name is already in
        #the db, keep the one with more reviews
        try:
            db_sql.cursor.execute(Values)
        except mysql.connector.IntegrityError:
            print "Caught exception"
            sql = ('SELECT NReviews FROM Restaurant WHERE FullName = "' +
                   full_name + '";')
            db_sql.cursor.execute(sql)
            old_restaurant = db_sql.cursor.fetchall()
            if len(old_restaurant) > 0:
                old_nreviews = old_restaurant[0][0]
                if n_reviews > old_nreviews:
                    db_sql.cursor.execute(
                        'DELETE FROM Restaurant WHERE FullName = "' +
                        full_name + '";')
                    db_sql.commit()
                    db_sql.cursor.execute(Values)
        db_mongo.posts.update({'restaurant': restaurant},
                              {"$set": {"added_sql": True}}, True)
        rest_info['added_sql'] = True
        db_sql.commit()
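#Note: the INSERT above is assembled by string concatenation, which breaks
#as soon as a scraped field contains a double quote and is open to SQL
#injection. A safer sketch, assuming db_sql.cursor is a standard
#mysql.connector cursor (as the IntegrityError handler above suggests), is
#to let the driver do the quoting via %s placeholders:
#
#    columns = ['ID', 'Name', 'Street', 'City', 'State', 'Zip'] + attrs
#    placeholders = ', '.join(['%s'] * len(columns))
#    sql = ('INSERT INTO Restaurant (' + ', '.join(columns) +
#           ') VALUES (' + placeholders + ');')
#    params = [count, name, street, city, state, zipcode] + \
#             [new_info.get(attr, 'NULL') for attr in attrs]
#    db_sql.cursor.execute(sql, params)
#
#The column list here is abbreviated for illustration; the full insert would
#pass every column named in the statement above the same way.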
#Standard modules
from sets import Set

#Third party modules
from pandas import DataFrame
import numpy as np
import pandas
from sklearn import linear_model
from sklearn.externals import joblib
from sklearn import cross_validation

import helper
import sql_database

db = sql_database.DbAccess('YELP', usr='******')


def main():
    #Positive examples: pairs where Yelp lists r1 as similar to r2
    df_match = pandas.io.sql.read_frame('''
        SELECT r1.RestaurantType as r1Type, r2.RestaurantType as r2Type,
               ABS(r1.RestaurantsPriceRange2 - r2.RestaurantsPriceRange2) as PriceDiff,
               ABS(r1.Rating - r2.Rating) as RatingDiff,
               r1.GoodForMeal=r2.GoodForMeal as MealSame,
               r1.RestaurantsTableService=r2.RestaurantsTableService as TableSame,
               r1.Favorites as r1Food, r2.Favorites as r2Food
        FROM Restaurant r1
        JOIN Restaurant r2
          ON r1.Site = r2.SimilarRest1
             OR r1.Site = r2.SimilarRest2
             OR r1.Site = r2.SimilarRest3;''', db.cnx)
    df_match['Match'] = np.ones(len(df_match))

    #Negative examples: nearby pairs (within ~0.0007 degrees of latitude and
    #longitude) that Yelp does not list as similar
    df_nomatch = pandas.io.sql.read_frame('''
        SELECT r1.RestaurantType as r1Type, r2.RestaurantType as r2Type,
               ABS(r1.RestaurantsPriceRange2 - r2.RestaurantsPriceRange2) as PriceDiff,
               ABS(r1.Rating - r2.Rating) as RatingDiff,
               r1.GoodForMeal=r2.GoodForMeal as MealSame,
               r1.RestaurantsTableService=r2.RestaurantsTableService as TableSame,
               r1.Favorites as r1Food, r2.Favorites as r2Food
        FROM Restaurant r1
        JOIN Restaurant r2
          ON r1.Site != r2.SimilarRest1
             AND r1.Site != r2.SimilarRest2
             AND r1.Site != r2.SimilarRest3
             AND r1.Site != r2.Site
             AND ABS(r1.Latitude - r2.Latitude) < 0.0007
             AND ABS(r1.Longitude - r2.Longitude) < 0.0007;''', db.cnx)
    df_nomatch['Match'] = np.zeros(len(df_nomatch))
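    #The imports above (linear_model, cross_validation, joblib) suggest the
    #next step: stack the two frames into one labeled training set and fit a
    #classifier on the numeric features. A minimal sketch, assuming a
    #logistic regression is the intended model (an assumption; the original
    #file continues beyond this excerpt):
    #
    #    df = pandas.concat([df_match, df_nomatch], ignore_index=True)
    #    features = ['PriceDiff', 'RatingDiff', 'MealSame', 'TableSame']
    #    clf = linear_model.LogisticRegression()
    #    scores = cross_validation.cross_val_score(
    #        clf, df[features].values, df['Match'].values, cv=5)
    #    clf.fit(df[features].values, df['Match'].values)
    #    joblib.dump(clf, 'match_model.pkl')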