def get_review_html(bizid, start):
    url = BIZ_URL + bizid + PAGE_OFFSET + str(start)
    print url
    response = requests.get(url).text.encode("utf-8")
    linelist = str(response).split("\n")
    index = yelp_web.target_line_range(linelist, yelp_web.beginReviewReg, yelp_web.endReviewReg)
    reviewlist = linelist[index[0] : index[1]]
    review_dict_list = yelp_web.review_match(reviewlist, bizid)
    extracted = len(review_dict_list)
    if extracted > 0:
        mysqldao.insert(dbname, tb_yelp_review, review_dict_list)
    return extracted
def get_review_html(bizid, start):
    url = BIZ_URL + bizid + PAGE_OFFSET + str(start)
    print url
    response = requests.get(url).text.encode('utf-8')
    linelist = str(response).split("\n")
    index = yelp_web.target_line_range(linelist, yelp_web.beginReviewReg,
                                       yelp_web.endReviewReg)
    reviewlist = linelist[index[0]:index[1]]
    review_dict_list = yelp_web.review_match(reviewlist, bizid)
    extracted = len(review_dict_list)
    if extracted > 0:
        mysqldao.insert(dbname, tb_yelp_review, review_dict_list)
    return extracted
Example #3
0
dbname='dwdproject'
tb_yelp_user='******'


USER_URL='http://www.yelp.com/user_details?userid='
target_user_list_query="""
    SELECT userid from dwdproject.log_user
    WHERE userid not in (select userid from dwdproject.yelp_user)
    """
target_user_list_query2="""SELECT userid, count(userid)  num 
from dwdproject.yelp_review
WHERE userid not in
(select userid from dwdproject.yelp_user)
group by userid
ORDER by num desc
"""
#target_user='******'
print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
target_user_list=mysqldao.execute_query(target_user_list_query)
for user in target_user_list:
	userid = user[0]
	url=USER_URL+str(userid)
	response=requests.get(url).text.encode('utf-8')
	linelist=str(response).split("\n")
	index=yelp_web.target_line_range(linelist, yelp_web.beginUserAttrReg, yelp_web.endUserAttrReg)
	attr_dict=yelp_web.user_attribute(linelist[index[0]:index[1]])
	attr_dict['userid']=userid
	mysqldao.insert(dbname,tb_yelp_user,[attr_dict])
	print "=======Insert user", userid

import yelp_etl
import mysqldao
import requests

BIZ_URL = "http://www.yelp.com/biz/"

keyword = 'price_range'
all_biz_id = mysqldao.select(yelp_etl.db_name, yelp_etl.tb_yelp_restaurant,
                             ['id'])
list_biz_id = mysqldao.select(yelp_etl.db_name, yelp_etl.tb_yelp_restaurant,
                              ['id'], [keyword], [{
                                  keyword: ''
                              }])

left_biz = len(list_biz_id)
total_biz = len(all_biz_id)
count = 0
for biz in list_biz_id:
    bizid = biz[0]
    url = BIZ_URL + bizid
    response = requests.get(url).text.encode('utf-8')
    linelist = str(response).split("\n")
    index = yelp_web.target_line_range(linelist, yelp_web.beginAttributeReg,
                                       yelp_web.endAttributeReg)
    attr_dict = yelp_web.attribute_match(linelist[index[0]:index[1]])
    attr_dict['id'] = bizid
    mysqldao.update(yelp_etl.db_name, yelp_etl.tb_yelp_restaurant,\
     yelp_web.list_attribute, ['id'], [attr_dict])
    count += 1
    print bizid, 'updated', "left ", left_biz - count
import yelp_web
import yelp_etl
import mysqldao
import requests

BIZ_URL="http://www.yelp.com/biz/"

keyword='price_range'
all_biz_id=mysqldao.select(yelp_etl.db_name, yelp_etl.tb_yelp_restaurant, ['id'])
list_biz_id=mysqldao.select(yelp_etl.db_name, yelp_etl.tb_yelp_restaurant, ['id'], [keyword], [{keyword:''}])

left_biz=len(list_biz_id)
total_biz=len(all_biz_id)
count = 0
for biz in list_biz_id:
	bizid=biz[0]
	url=BIZ_URL+bizid
	response=requests.get(url).text.encode('utf-8')
	linelist=str(response).split("\n")
	index=yelp_web.target_line_range(linelist, yelp_web.beginAttributeReg, yelp_web.endAttributeReg)
	attr_dict=yelp_web.attribute_match(linelist[index[0]:index[1]])
	attr_dict['id']=bizid
	mysqldao.update(yelp_etl.db_name, yelp_etl.tb_yelp_restaurant,\
		yelp_web.list_attribute, ['id'], [attr_dict])
	count+=1
	print bizid, 'updated', "left ", left_biz-count