def get_review_html(bizid, start):
    """Download one review page for a business and hand parsed reviews to the DAO.

    The URL is ``BIZ_URL + bizid + PAGE_OFFSET + str(start)`` and is printed
    before the request is made.  The response text is UTF-8 encoded, split on
    newlines, and narrowed to the slice whose bounds are returned by
    ``yelp_web.target_line_range`` for the review begin/end patterns.  That
    slice is parsed by ``yelp_web.review_match``.

    Args:
        bizid: Business identifier; concatenated into the URL and forwarded
            to ``yelp_web.review_match``.
        start: Page offset; stringified before concatenation.

    Returns:
        ``len()`` of the value returned by ``yelp_web.review_match``.  When
        it is greater than zero the records are first passed to
        ``mysqldao.insert(dbname, tb_yelp_review, ...)``.
    """
    page_url = BIZ_URL + bizid + PAGE_OFFSET + str(start)
    print(page_url)

    page_lines = str(requests.get(page_url).text.encode("utf-8")).split("\n")
    bounds = yelp_web.target_line_range(
        page_lines, yelp_web.beginReviewReg, yelp_web.endReviewReg
    )
    reviews = yelp_web.review_match(page_lines[bounds[0]:bounds[1]], bizid)

    extracted = len(reviews)
    if extracted <= 0:
        # Nothing parsed from this page: skip the insert entirely.
        return extracted

    mysqldao.insert(dbname, tb_yelp_review, reviews)
    return extracted
def get_review_html(bizid, start):
    """Scrape a single page of reviews for ``bizid`` starting at ``start``.

    Steps, in order:
      1. Build the page URL from ``BIZ_URL``, ``bizid``, ``PAGE_OFFSET`` and
         ``str(start)``, and print it.
      2. GET the page, UTF-8 encode its text and split it on ``"\\n"``.
      3. Ask ``yelp_web.target_line_range`` for the begin/end indices of the
         review section and slice the lines with them.
      4. Parse the slice with ``yelp_web.review_match``.
      5. If anything was parsed, pass it to ``mysqldao.insert`` together with
         ``dbname`` and ``tb_yelp_review``.

    Returns the number of parsed review records (0 when none were found).
    """
    target = BIZ_URL + bizid + PAGE_OFFSET + str(start)
    print(target)

    encoded_body = requests.get(target).text.encode('utf-8')
    all_lines = str(encoded_body).split("\n")

    span = yelp_web.target_line_range(
        all_lines,
        yelp_web.beginReviewReg,
        yelp_web.endReviewReg,
    )
    first, last = span[0], span[1]
    parsed = yelp_web.review_match(all_lines[first:last], bizid)

    extracted = len(parsed)
    if extracted > 0:
        mysqldao.insert(dbname, tb_yelp_review, parsed)
    return extracted
dbname='dwdproject' tb_yelp_user='******' USER_URL='http://www.yelp.com/user_details?userid=' target_user_list_query=""" SELECT userid from dwdproject.log_user WHERE userid not in (select userid from dwdproject.yelp_user) """ target_user_list_query2="""SELECT userid, count(userid) num from dwdproject.yelp_review WHERE userid not in (select userid from dwdproject.yelp_user) group by userid ORDER by num desc """ #target_user='******' print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" target_user_list=mysqldao.execute_query(target_user_list_query) for user in target_user_list: userid = user[0] url=USER_URL+str(userid) response=requests.get(url).text.encode('utf-8') linelist=str(response).split("\n") index=yelp_web.target_line_range(linelist, yelp_web.beginUserAttrReg, yelp_web.endUserAttrReg) attr_dict=yelp_web.user_attribute(linelist[index[0]:index[1]]) attr_dict['userid']=userid mysqldao.insert(dbname,tb_yelp_user,[attr_dict]) print "=======Insert user", userid
import yelp_etl import mysqldao import requests BIZ_URL = "http://www.yelp.com/biz/" keyword = 'price_range' all_biz_id = mysqldao.select(yelp_etl.db_name, yelp_etl.tb_yelp_restaurant, ['id']) list_biz_id = mysqldao.select(yelp_etl.db_name, yelp_etl.tb_yelp_restaurant, ['id'], [keyword], [{ keyword: '' }]) left_biz = len(list_biz_id) total_biz = len(all_biz_id) count = 0 for biz in list_biz_id: bizid = biz[0] url = BIZ_URL + bizid response = requests.get(url).text.encode('utf-8') linelist = str(response).split("\n") index = yelp_web.target_line_range(linelist, yelp_web.beginAttributeReg, yelp_web.endAttributeReg) attr_dict = yelp_web.attribute_match(linelist[index[0]:index[1]]) attr_dict['id'] = bizid mysqldao.update(yelp_etl.db_name, yelp_etl.tb_yelp_restaurant,\ yelp_web.list_attribute, ['id'], [attr_dict]) count += 1 print bizid, 'updated', "left ", left_biz - count
import yelp_web import yelp_etl import mysqldao import requests BIZ_URL="http://www.yelp.com/biz/" keyword='price_range' all_biz_id=mysqldao.select(yelp_etl.db_name, yelp_etl.tb_yelp_restaurant, ['id']) list_biz_id=mysqldao.select(yelp_etl.db_name, yelp_etl.tb_yelp_restaurant, ['id'], [keyword], [{keyword:''}]) left_biz=len(list_biz_id) total_biz=len(all_biz_id) count = 0 for biz in list_biz_id: bizid=biz[0] url=BIZ_URL+bizid response=requests.get(url).text.encode('utf-8') linelist=str(response).split("\n") index=yelp_web.target_line_range(linelist, yelp_web.beginAttributeReg, yelp_web.endAttributeReg) attr_dict=yelp_web.attribute_match(linelist[index[0]:index[1]]) attr_dict['id']=bizid mysqldao.update(yelp_etl.db_name, yelp_etl.tb_yelp_restaurant,\ yelp_web.list_attribute, ['id'], [attr_dict]) count+=1 print bizid, 'updated', "left ", left_biz-count