Example #1
0
def extract_full(url_params=url_params, limit=LIMIT):
    url_params['offset'] = 0
    print "Extract in full mode with parameters", url_params
    extracttime = str(datetime.datetime.now())[0:19]
    first_extract = search_parameter(url_params)
    data = json.dumps(first_extract)
    data = json.loads(data)
    total = data['total']
    zipcode = 'Unknown'
    if 'zipcode' in url_params:
        zipcode = url_params['zipcode']
    log_para_list = [{
        'extracttime': extracttime,
        'total': total,
        'location': url_params['location'],
        'zipcode': zipcode
    }]
    mysqldao.insert(db_name, tb_log_zipcode, log_para_list)
    columns_list = mysqldao.column_names(db_name, tb_yelp_zipcode)
    num_extract = total / limit + 1
    data_entry_list = []
    print "Total:", str(total), "extract until 1000 data"
    count = 0
    for i in range(0, num_extract):
        #print "----No."+str(i+1)+" extraction"
        url_params['offset'] = i * OFFSET
        if i == 49:
            break
        else:
            biz_list_i = search_parameter(url_params)[BUSINESS]
            tran_list = json_transform(biz_list_i, columns_list)
            count += load_data_db(db_name, tb_yelp_zipcode, tran_list)
    print "Insert", count, "in the last run"
Example #2
0
def extract_by_phone(phone_list=None):
    print "====Begin extraction by phone."
    columns_list = mysqldao.column_names(db_name, tb_yelp_phone)
    biz_list = []
    if phone_list == None:
        phone_list = []
        temp_phone_list = mysqldao.select(db_name, tb_insp_norm, ['PHONE'])
        for p in temp_phone_list:
            phone_list.append(p[0])
    exist_phone_list = []
    exist_phone_tuple = mysqldao.select_unique_column(db_name, tb_log_phone,
                                                      'phone')
    count = 0
    for phone in phone_list:
        biz_data = {}
        biz_list = []
        phone = str(phone).replace(' ', '').replace('_', '')
        if len(phone) == 11:
            phone = phone[1:]
        if phone not in exist_phone_tuple:
            url_params = {"phone": phone, 'ywsid': 'bxtstnNlHgO8c6W4X2yuYA'}
            biz_data = yelp_api.request(API_HOST,
                                        '/phone_search',
                                        url_params=url_params)[BUSINESS]
            if len(biz_data) != 0:
                data_phone = dic_look_up(biz_data[0], 'phone')
                if data_phone == phone:
                    biz_list.append(biz_data[0])
            print "Phone:", phone, "count", len(biz_list), 'data'
            tran_list = json_transform_phone(biz_list, columns_list)
            load_data_db(db_name, tb_yelp_phone, tran_list)
            extracttime = str(datetime.datetime.now())[0:19]
            log_para_list = [{'extracttime': extracttime, 'phone': phone}]
            mysqldao.insert(db_name, tb_log_phone, log_para_list)
def extract_full(url_params=url_params,limit=LIMIT):
    url_params['offset']=0
    print "Extract in full mode with parameters", url_params
    extracttime = str(datetime.datetime.now())[0:19]
    first_extract = search_parameter(url_params)
    data=json.dumps(first_extract)
    data=json.loads(data)
    total = data['total']
    zipcode='Unknown'
    if 'zipcode' in url_params:
        zipcode=url_params['zipcode']
    log_para_list = [{'extracttime':extracttime, 'total':total, 'location':url_params['location'], 'zipcode':zipcode}]
    mysqldao.insert(db_name,tb_log_zipcode,log_para_list)
    columns_list = mysqldao.column_names(db_name, tb_yelp_zipcode)
    num_extract = total / limit +1
    data_entry_list = []
    print "Total:", str(total), "extract until 1000 data"
    count=0
    for i in range(0, num_extract):
        #print "----No."+str(i+1)+" extraction"
        url_params['offset'] = i*OFFSET
        if i==49:
            break
        else:
            biz_list_i = search_parameter(url_params)[BUSINESS]
            tran_list = json_transform(biz_list_i,columns_list)
            count+=load_data_db(db_name, tb_yelp_zipcode, tran_list)
    print "Insert",count,"in the last run"
def extract_by_phone(phone_list=None):
    print "====Begin extraction by phone."
    columns_list = mysqldao.column_names(db_name, tb_yelp_phone)
    biz_list=[]
    if phone_list==None:
        phone_list=[]
        temp_phone_list=mysqldao.select(db_name, tb_insp_norm, ['PHONE'])
        for p in temp_phone_list:
            phone_list.append(p[0])
    exist_phone_list=[]
    exist_phone_tuple = mysqldao.select_unique_column(db_name,tb_log_phone,'phone')
    count = 0
    for phone in phone_list:
        biz_data={}
        biz_list=[]
        phone=str(phone).replace(' ','').replace('_','')
        if len(phone) ==11:
            phone = phone[1:]
        if phone not in exist_phone_tuple:
            url_params={
            "phone":phone,
            'ywsid':'bxtstnNlHgO8c6W4X2yuYA'
            }
            biz_data = yelp_api.request(API_HOST, '/phone_search', url_params=url_params)[BUSINESS]
            if len(biz_data) != 0:
                data_phone=dic_look_up(biz_data[0],'phone')
                if data_phone == phone:
                    biz_list.append(biz_data[0])
            print "Phone:",phone,"count",len(biz_list),'data'
            tran_list=json_transform_phone(biz_list,columns_list)
            load_data_db(db_name, tb_yelp_phone, tran_list)
            extracttime = str(datetime.datetime.now())[0:19]
            log_para_list = [{'extracttime':extracttime, 'phone':phone}]
            mysqldao.insert(db_name,tb_log_phone,log_para_list)
# HTML fragments used as line markers. The "Reg" suffix suggests they are
# handed to regex searches -- in that case the trailing '$' in attributeReg
# acts as an end-of-line anchor. TODO confirm against the callers.
priceDescReg = '<dd class="nowrap price-description">'
attributeReg = '<dt class="attribute-key">$'
endAttributeReg = '<div class="media-block clearfix first-to-review ywidget">'

# Attribute names, one per line so additions and removals diff cleanly.
list_attribute = [
    "TakesReservations",
    "Delivery",
    "Take-out",
    "AcceptsCreditCards",
    "GoodForDinner",
    "ParkingStreet",
    "BikeParking",
    "WheelchairAccessible",
    "GoodforKids",
    "GoodforGroups",
    "AttireCasual",
    "AmbienceTrendy",
    # NOTE(review): possibly a truncated "NoiseLevelAverage" -- verify
    # against the table schema before changing.
    "iseLevelAverage",
    "AlcoholFullBar",
    "OutdoorSeating",
    "Wi-Fi",
    "HasTV",
    "DogsAllowed",
    "WaiterService",
    "Caters",
    "price_range",
    "price_description",
]

# Database and table names.
dbname = 'dwdproject'
tb_yelp_reivew = 'yelp_review'
# NOTE(review): this value looks masked/redacted -- confirm the real name.
tb_yelp_user = '******'

# Column names of the review table, fetched once when the module is loaded.
columns_tuple = mysqldao.column_names(dbname, tb_yelp_reivew)

def target_line_range(linelist, beginReg, endReg):
	begin=0
	end=0
	size=len(linelist)
	for i in xrange(0, size):
		try:
			line=HTMLParser.HTMLParser().unescape(linelist[i]).encode('utf-8')
			if re.search(beginReg, line):
				begin=i
				break
		except:
			q=1
	for i in xrange(0, size):
		try:
# HTML fragment used as a line marker; the "Reg" suffix suggests it is handed
# to a regex search -- TODO confirm against the callers.
endAttributeReg = '<div class="media-block clearfix first-to-review ywidget">'

# Attribute names, one per line so additions and removals diff cleanly.
list_attribute = [
    "TakesReservations",
    "Delivery",
    "Take-out",
    "AcceptsCreditCards",
    "GoodForDinner",
    "ParkingStreet",
    "BikeParking",
    "WheelchairAccessible",
    "GoodforKids",
    "GoodforGroups",
    "AttireCasual",
    "AmbienceTrendy",
    # NOTE(review): possibly a truncated "NoiseLevelAverage" -- verify
    # against the table schema before changing.
    "iseLevelAverage",
    "AlcoholFullBar",
    "OutdoorSeating",
    "Wi-Fi",
    "HasTV",
    "DogsAllowed",
    "WaiterService",
    "Caters",
    "price_range",
    "price_description",
]

# Database and table names.
dbname = 'dwdproject'
tb_yelp_reivew = 'yelp_review'
# NOTE(review): this value looks masked/redacted -- confirm the real name.
tb_yelp_user = '******'

# Column names of the review table, fetched once when the module is loaded.
columns_tuple = mysqldao.column_names(dbname, tb_yelp_reivew)


def target_line_range(linelist, beginReg, endReg):
    begin = 0
    end = 0
    size = len(linelist)
    for i in xrange(0, size):
        try:
            line = HTMLParser.HTMLParser().unescape(
                linelist[i]).encode('utf-8')
            if re.search(beginReg, line):
                begin = i
                break
        except:
            q = 1