def extract_full(url_params=url_params, limit=LIMIT):
    """Run a paged search extraction and load every page into MySQL.

    The first request is used to read the reported ``total``; a log row
    (time, total, location, zipcode) is written to ``tb_log_zipcode`` and
    then each result page is transformed with ``json_transform`` and
    stored in ``tb_yelp_zipcode`` through ``load_data_db``.

    Args:
        url_params: Search parameters. Must contain ``'location'``;
            ``'zipcode'`` is optional and logged as ``'Unknown'`` when
            absent. The mapping is copied, so the caller's dict (and the
            shared module-level default) is no longer modified.
        limit: Page size used to work out how many pages to request.

    Raises:
        KeyError: If ``url_params`` has no ``'location'`` entry or the
            first response has no ``'total'`` entry.
    """
    # Work on a private copy: the default argument is the module-level
    # dict, and writing 'offset' into it leaked state between calls.
    params = dict(url_params)
    params['offset'] = 0
    print("Extract in full mode with parameters %s" % (params,))

    extracttime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    first_extract = search_parameter(params)
    total = first_extract['total']

    log_para_list = [{
        'extracttime': extracttime,
        'total': total,
        'location': params['location'],
        'zipcode': params.get('zipcode', 'Unknown'),
    }]
    mysqldao.insert(db_name, tb_log_zipcode, log_para_list)

    columns_list = mysqldao.column_names(db_name, tb_yelp_zipcode)

    # Ceiling division: the previous `total / limit + 1` asked for one
    # extra (empty) page whenever total was an exact multiple of limit.
    num_extract = (total + limit - 1) // limit
    # The original loop stopped as soon as the page index reached 49, so
    # at most pages 0..48 are fetched.
    # NOTE(review): presumably an API cap on the offset - confirm.
    max_pages = 49

    print("Total: %s extract until 1000 data" % (total,))
    count = 0
    for page in range(min(num_extract, max_pages)):
        # NOTE(review): the offset advances by the module constant OFFSET
        # while the page count is derived from `limit`; verify they agree.
        params['offset'] = page * OFFSET
        biz_list_i = search_parameter(params)[BUSINESS]
        tran_list = json_transform(biz_list_i, columns_list)
        count += load_data_db(db_name, tb_yelp_zipcode, tran_list)
    print("Insert %s in the last run" % (count,))
def extract_by_phone(phone_list=None):
    """Look up businesses by phone number and store the matches in MySQL.

    For every phone not yet present in ``tb_log_phone`` a ``/phone_search``
    request is made. The first returned business is kept only when the
    value ``dic_look_up`` finds under ``'phone'`` equals the normalized
    phone. The (possibly empty) result is transformed with
    ``json_transform_phone``, loaded into ``tb_yelp_phone``, and the phone
    is recorded in ``tb_log_phone`` with the extraction time.

    Args:
        phone_list: Iterable of phone numbers. When ``None``, the
            ``PHONE`` column of ``tb_insp_norm`` is used instead.
    """
    print("====Begin extraction by phone.")
    columns_list = mysqldao.column_names(db_name, tb_yelp_phone)

    if phone_list is None:
        rows = mysqldao.select(db_name, tb_insp_norm, ['PHONE'])
        phone_list = [row[0] for row in rows]

    # A set gives O(1) membership tests and can be extended as we go, so a
    # phone repeated in phone_list is requested and logged only once.
    # Assumes the logged values are hashable (e.g. strings).
    done_phones = set(
        mysqldao.select_unique_column(db_name, tb_log_phone, 'phone'))

    for phone in phone_list:
        # Normalize: drop spaces/underscores, and the leading digit of an
        # 11-character number.
        phone = str(phone).replace(' ', '').replace('_', '')
        if len(phone) == 11:
            phone = phone[1:]
        if phone in done_phones:
            continue

        # NOTE(review): API credential is hard-coded in source; it should
        # be moved to configuration. Left unchanged here.
        url_params = {"phone": phone, 'ywsid': 'bxtstnNlHgO8c6W4X2yuYA'}
        biz_data = yelp_api.request(
            API_HOST, '/phone_search', url_params=url_params)[BUSINESS]

        biz_list = []
        if len(biz_data) != 0:
            first_biz = biz_data[0]
            if dic_look_up(first_biz, 'phone') == phone:
                biz_list.append(first_biz)

        print("Phone: %s count %s data" % (phone, len(biz_list)))
        tran_list = json_transform_phone(biz_list, columns_list)
        load_data_db(db_name, tb_yelp_phone, tran_list)

        extracttime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        log_para_list = [{'extracttime': extracttime, 'phone': phone}]
        mysqldao.insert(db_name, tb_log_phone, log_para_list)
        done_phones.add(phone)
def extract_full(url_params=url_params, limit=LIMIT):
    """Run a paged search extraction and load every page into MySQL.

    The first request is used to read the reported ``total``; a log row
    (time, total, location, zipcode) is written to ``tb_log_zipcode`` and
    then each result page is transformed with ``json_transform`` and
    stored in ``tb_yelp_zipcode`` through ``load_data_db``.

    Args:
        url_params: Search parameters. Must contain ``'location'``;
            ``'zipcode'`` is optional and logged as ``'Unknown'`` when
            absent. The mapping is copied, so the caller's dict (and the
            shared module-level default) is no longer modified.
        limit: Page size used to work out how many pages to request.

    Raises:
        KeyError: If ``url_params`` has no ``'location'`` entry or the
            first response has no ``'total'`` entry.
    """
    # Work on a private copy: the default argument is the module-level
    # dict, and writing 'offset' into it leaked state between calls.
    params = dict(url_params)
    params['offset'] = 0
    print("Extract in full mode with parameters %s" % (params,))

    extracttime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    first_extract = search_parameter(params)
    total = first_extract['total']

    log_para_list = [{
        'extracttime': extracttime,
        'total': total,
        'location': params['location'],
        'zipcode': params.get('zipcode', 'Unknown'),
    }]
    mysqldao.insert(db_name, tb_log_zipcode, log_para_list)

    columns_list = mysqldao.column_names(db_name, tb_yelp_zipcode)

    # Ceiling division: the previous `total / limit + 1` asked for one
    # extra (empty) page whenever total was an exact multiple of limit.
    num_extract = (total + limit - 1) // limit
    # The original loop stopped as soon as the page index reached 49, so
    # at most pages 0..48 are fetched.
    # NOTE(review): presumably an API cap on the offset - confirm.
    max_pages = 49

    print("Total: %s extract until 1000 data" % (total,))
    count = 0
    for page in range(min(num_extract, max_pages)):
        # NOTE(review): the offset advances by the module constant OFFSET
        # while the page count is derived from `limit`; verify they agree.
        params['offset'] = page * OFFSET
        biz_list_i = search_parameter(params)[BUSINESS]
        tran_list = json_transform(biz_list_i, columns_list)
        count += load_data_db(db_name, tb_yelp_zipcode, tran_list)
    print("Insert %s in the last run" % (count,))
def extract_by_phone(phone_list=None):
    """Look up businesses by phone number and store the matches in MySQL.

    For every phone not yet present in ``tb_log_phone`` a ``/phone_search``
    request is made. The first returned business is kept only when the
    value ``dic_look_up`` finds under ``'phone'`` equals the normalized
    phone. The (possibly empty) result is transformed with
    ``json_transform_phone``, loaded into ``tb_yelp_phone``, and the phone
    is recorded in ``tb_log_phone`` with the extraction time.

    Args:
        phone_list: Iterable of phone numbers. When ``None``, the
            ``PHONE`` column of ``tb_insp_norm`` is used instead.
    """
    print("====Begin extraction by phone.")
    columns_list = mysqldao.column_names(db_name, tb_yelp_phone)

    if phone_list is None:
        rows = mysqldao.select(db_name, tb_insp_norm, ['PHONE'])
        phone_list = [row[0] for row in rows]

    # A set gives O(1) membership tests and can be extended as we go, so a
    # phone repeated in phone_list is requested and logged only once.
    # Assumes the logged values are hashable (e.g. strings).
    done_phones = set(
        mysqldao.select_unique_column(db_name, tb_log_phone, 'phone'))

    for phone in phone_list:
        # Normalize: drop spaces/underscores, and the leading digit of an
        # 11-character number.
        phone = str(phone).replace(' ', '').replace('_', '')
        if len(phone) == 11:
            phone = phone[1:]
        if phone in done_phones:
            continue

        # NOTE(review): API credential is hard-coded in source; it should
        # be moved to configuration. Left unchanged here.
        url_params = {"phone": phone, 'ywsid': 'bxtstnNlHgO8c6W4X2yuYA'}
        biz_data = yelp_api.request(
            API_HOST, '/phone_search', url_params=url_params)[BUSINESS]

        biz_list = []
        if len(biz_data) != 0:
            first_biz = biz_data[0]
            if dic_look_up(first_biz, 'phone') == phone:
                biz_list.append(first_biz)

        print("Phone: %s count %s data" % (phone, len(biz_list)))
        tran_list = json_transform_phone(biz_list, columns_list)
        load_data_db(db_name, tb_yelp_phone, tran_list)

        extracttime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        log_para_list = [{'extracttime': extracttime, 'phone': phone}]
        mysqldao.insert(db_name, tb_log_phone, log_para_list)
        done_phones.add(phone)
priceDescReg='<dd class="nowrap price-description">' attributeReg='<dt class="attribute-key">$' endAttributeReg='<div class="media-block clearfix first-to-review ywidget">' list_attribute=["TakesReservations","Delivery","Take-out","AcceptsCreditCards" ,"GoodForDinner","ParkingStreet","BikeParking","WheelchairAccessible","GoodforKids" ,"GoodforGroups","AttireCasual","AmbienceTrendy","iseLevelAverage" ,"AlcoholFullBar","OutdoorSeating","Wi-Fi","HasTV","DogsAllowed","WaiterService" ,"Caters","price_range","price_description"] dbname='dwdproject' tb_yelp_reivew='yelp_review' tb_yelp_user='******' columns_tuple=mysqldao.column_names('dwdproject', tb_yelp_reivew) def target_line_range(linelist, beginReg, endReg): begin=0 end=0 size=len(linelist) for i in xrange(0, size): try: line=HTMLParser.HTMLParser().unescape(linelist[i]).encode('utf-8') if re.search(beginReg, line): begin=i break except: q=1 for i in xrange(0, size): try:
endAttributeReg = '<div class="media-block clearfix first-to-review ywidget">' list_attribute = [ "TakesReservations", "Delivery", "Take-out", "AcceptsCreditCards", "GoodForDinner", "ParkingStreet", "BikeParking", "WheelchairAccessible", "GoodforKids", "GoodforGroups", "AttireCasual", "AmbienceTrendy", "iseLevelAverage", "AlcoholFullBar", "OutdoorSeating", "Wi-Fi", "HasTV", "DogsAllowed", "WaiterService", "Caters", "price_range", "price_description" ] dbname = 'dwdproject' tb_yelp_reivew = 'yelp_review' tb_yelp_user = '******' columns_tuple = mysqldao.column_names('dwdproject', tb_yelp_reivew) def target_line_range(linelist, beginReg, endReg): begin = 0 end = 0 size = len(linelist) for i in xrange(0, size): try: line = HTMLParser.HTMLParser().unescape( linelist[i]).encode('utf-8') if re.search(beginReg, line): begin = i break except: q = 1