def update_hotel_reviewer_score(path):
    """Refresh the score columns of every reviewer row in the workbook.

    The workbook at ``path`` is loaded with ``get_data`` and round-tripped
    through JSON (``json_serial`` on the way out, ``json_util.object_hook``
    on the way back in). For each row of the ``reviewer_info`` sheet after
    the first one, the value in column 2 is passed as the link to
    ``get_reviewer_scores`` and the first five returned values are stored
    in columns 11-15. The ``hotel_info``, ``reviewer_info`` and
    ``review_info`` sheets are then written back with ``save_data``.

    Args:
        path: Workbook location; used for both reading and writing.
    """
    xls = get_data(path)
    data = json.dumps(xls, default=json_serial)
    # Decode once and pick the sheets out of the single result.
    sheets = json.loads(data, object_hook=json_util.object_hook)
    lines = sheets['hotel_info']
    reviews = sheets['review_info']
    reviewers = sheets['reviewer_info']

    # Start the browser only after the workbook loaded successfully.
    driver = get_webdriver()
    try:
        # reviewers[0] is skipped (presumably the header row).
        for reviewer in reviewers[1:]:
            print(reviewer)
            # Pad short rows so that columns 11-15 can be assigned by index.
            while len(reviewer) < 18:
                reviewer.append('')
            link = reviewer[2]
            print(link)
            try:
                scores = get_reviewer_scores(driver, link)
                print(scores)
                for offset in range(5):
                    reviewer[11 + offset] = scores[offset]
            except Exception as e:
                # Best effort: one failing reviewer must not stop the run,
                # but the failure is reported instead of being swallowed.
                print(e)
    finally:
        # Always release the browser, even if the loop is interrupted.
        try:
            driver.quit()
        except Exception as e:
            print(e)

    result = OrderedDict()
    result["hotel_info"] = lines
    result["reviewer_info"] = reviewers
    result["review_info"] = reviews
    save_data(path, result)
def crawl_hotel_reviewer_name(path, reviewer_path):
    """Fill in reviewer id and name for every review row of a workbook.

    The workbook at ``path`` is loaded with ``get_data`` and round-tripped
    through JSON. For each row of the ``review_info`` sheet after the first
    one, column 5 is passed as the URL to ``hotel_reviewer_name_crawler``;
    the first two items of its result are written to columns 1 and 2 of
    that row, and the third item says whether the reviewer was already
    known. When a row fails, the error is printed and the webdriver is
    replaced through ``get_new_webdriver``. Finally the ``hotel_info`` and
    ``review_info`` sheets are written back with ``save_data``.

    Args:
        path: Workbook location; used for both reading and writing.
        reviewer_path: Currently unused; kept so existing callers keep
            working.
    """
    print(path)
    xls = get_data(path)
    data = json.dumps(xls, default=json_serial)
    # Decode once and pick the sheets out of the single result.
    sheets = json.loads(data, object_hook=json_util.object_hook)
    lines = sheets['hotel_info']
    reviews = sheets['review_info']

    # Starts empty and is handed to the crawler on every call.
    # NOTE(review): presumably the crawler records seen ids in it - confirm.
    reviewers_id_set = []

    driver = get_webdriver()
    try:
        # reviews[0] is skipped (presumably the header row). The row objects
        # are shared with ``reviews``, so they can be updated in place.
        for index, review in enumerate(reviews[1:]):
            print(index)
            try:
                url = review[5]
                reviewer_data = hotel_reviewer_name_crawler(
                    driver, url, reviewers_id_set)
                reviewer_id = reviewer_data[0]
                reviewer_name = reviewer_data[1]
                reviewer_exist = reviewer_data[2]
                print("Get reviewer:  %s" % (reviewer_name,))
                review[1] = reviewer_id
                review[2] = reviewer_name
                if not reviewer_exist:
                    print("Add new reviewer:  %s %s"
                          % (reviewer_id, reviewer_name))
            except Exception as e:
                print(e)
                # Swap in a replacement driver after a failed row.
                driver = get_new_webdriver(driver)
    finally:
        # Close the driver even when an error escapes the loop.
        closeDriver(driver)

    result = OrderedDict()
    result["hotel_info"] = lines
    result["review_info"] = reviews
    save_data(path, result)
def get_hotel_tripadvisor_reviews(path, output_path=None):
    """Re-crawl the reviews of every hotel listed in a workbook.

    The workbook is read with ``read_xls``. Its ``review_info`` sheet is
    truncated to its first row, then for each row of ``hotel_info`` after
    the first one a fresh webdriver is started and
    ``hotel_reviews_crawler`` is called with column 5 (link), column 0
    (id), the running review list and the driver. If ``review_info`` holds
    more than one row afterwards, the workbook is written with
    ``write_xls``.

    Args:
        path: Workbook to read.
        output_path: Where to write the result; defaults to ``path``.

    Returns:
        True when the workbook was written, False otherwise (a required
        sheet is missing, nothing was collected, or an error occurred).
    """
    if not output_path:
        output_path = path
    content = read_xls(path)
    sheets = content.keys()

    # Without both sheets there is nothing to crawl or to append to; bail
    # out before starting any browser.
    if 'review_info' not in sheets or 'hotel_info' not in sheets:
        return False

    # Keep only the first row (presumably the header) of the review sheet.
    content['review_info'] = content['review_info'][:1]
    reviews = content['review_info']
    hotel_info = content['hotel_info']

    try:
        for line in hotel_info[1:]:
            # Read the columns before starting a browser so that a
            # malformed row cannot leave one running.
            hotel_id = line[0]
            hotel_link = line[5]
            driver = get_webdriver()
            print("%s %s" % (hotel_id, hotel_link))
            try:
                reviews = hotel_reviews_crawler(hotel_link, hotel_id,
                                                reviews, driver)
            except Exception as e:
                # Best effort: report the failing hotel and carry on.
                print(line)
                print(e)
            finally:
                # Release the browser whether or not the crawl succeeded.
                try:
                    driver.quit()
                except Exception as e:
                    print(e)
        # NOTE(review): this relies on hotel_reviews_crawler extending the
        # list stored in content['review_info'] in place - confirm.
        if len(content['review_info']) > 1:
            write_xls(output_path, content)
            return True
    except Exception as e:
        print(e)
    return False
def update_hotel_missed_reviewer(reviewer_path):
    """Re-fetch the profile of reviewer rows whose column 2 is empty.

    The workbook at ``reviewer_path`` is loaded with ``get_data`` and
    round-tripped through JSON. Every row of ``reviewer_info`` that has
    more than two columns and a falsy column 2 is rebuilt: its last column
    is passed as the review URL to ``get_reviewer_profile_link`` together
    with a fresh webdriver and column 1, and the returned mapping is
    flattened into a new row that keeps the original column 0. Errors on a
    row are printed and that row is left untouched. The sheet is written
    back with ``save_data``.

    Args:
        reviewer_path: Workbook location; used for reading and writing.
    """
    # Keys read from the profile mapping, in output column order
    # (columns 1..18; column 0 keeps the existing reviewer id).
    profile_keys = (
        'reviewer_name',
        'reviewer_link',
        'reviewer_location',
        'reviewer_level',
        'reviewer_num_reviews',
        'reviewer_num_hotel_reviews',
        'reviewer_num_helpful_votes',
        'reviewer_firstmonth',
        'reviewer_gender',
        'reviewer_age',
        'reviewer_num_1',
        'reviewer_num_2',
        'reviewer_num_3',
        'reviewer_num_4',
        'reviewer_num_5',
        'reviewer_description',
        'reviewer_readership',
        'review_url',
    )

    reviewer_xls = get_data(reviewer_path)
    reviewer_data = json.dumps(reviewer_xls, default=json_serial)
    reviewers = json.loads(
        reviewer_data, object_hook=json_util.object_hook)['reviewer_info']

    for i, reviewer in enumerate(reviewers):
        try:
            print(i)
            reviewer_id = reviewer[0]
            if len(reviewer) > 2 and not reviewer[2]:
                review_url = reviewer[-1]
                driver = get_webdriver()
                try:
                    info = get_reviewer_profile_link(review_url, driver,
                                                     reviewer[1])
                finally:
                    # One browser is started per row, so release it here.
                    # The helper may already have closed it, hence the
                    # deliberately silent guard.
                    try:
                        driver.quit()
                    except Exception:
                        pass
                reviewers[i] = [reviewer_id] + [
                    info.get(key, '') for key in profile_keys
                ]
        except Exception as e:
            # Best effort: report the error and keep the original row.
            print(e)

    reviewer_result = OrderedDict()
    reviewer_result["reviewer_info"] = reviewers
    save_data(reviewer_path, reviewer_result)
def update_hotel_reviewer(path, reviewer_path):
    """Crawl reviewer profiles for all reviews and merge them into the
    reviewer workbook.

    Two workbooks are loaded with ``get_data`` and round-tripped through
    JSON: the one at ``path`` (``hotel_info`` and ``review_info`` sheets)
    and the one at ``reviewer_path`` (``reviewer_info`` sheet).

    For each row of ``review_info`` after the first one,
    ``hotel_reviewer_profile_crawler`` is called with the driver, columns 5
    and 8 of the row, the list of known column-1 values, the list that
    collects new profile mappings and the previously known reviewer rows.
    The returned id and name are stored in columns 1 and 2 of the review
    row. When a row fails, the error is printed and the webdriver is
    replaced through ``get_new_webdriver``.

    Each collected profile mapping is then flattened into a row. If an
    existing reviewer row has the same column 1, that row is replaced and
    its column 0 is kept; otherwise the new row is appended. Both
    workbooks are written back with ``save_data``.

    Args:
        path: Hotel/review workbook; read and rewritten.
        reviewer_path: Reviewer workbook; read and rewritten.
    """
    # Keys read from each profile mapping, in output column order.
    profile_keys = (
        'Reviewer_ID',
        'reviewer_name',
        'reviewer_link',
        'reviewer_location',
        'reviewer_level',
        'reviewer_num_reviews',
        'reviewer_num_hotel_reviews',
        'reviewer_num_helpful_votes',
        'reviewer_firstmonth',
        'reviewer_gender',
        'reviewer_age',
        'reviewer_num_1',
        'reviewer_num_2',
        'reviewer_num_3',
        'reviewer_num_4',
        'reviewer_num_5',
        'reviewer_description',
        'reviewer_readership',
        'review_url',
    )

    print(path)
    xls = get_data(path)
    data = json.dumps(xls, default=json_serial)
    # Decode once and pick the sheets out of the single result.
    sheets = json.loads(data, object_hook=json_util.object_hook)
    lines = sheets['hotel_info']
    reviews = sheets['review_info']

    reviewer_xls = get_data(reviewer_path)
    reviewer_data = json.dumps(reviewer_xls, default=json_serial)
    reviewers = json.loads(
        reviewer_data, object_hook=json_util.object_hook)['reviewer_info']

    # Kept as a list because it is handed to the crawler as-is.
    reviewers_id_set = [
        reviewer[1] for reviewer in reviewers[1:] if len(reviewer) > 1
    ]
    previous_reviewers_info = reviewers[1:]
    new_reviewers_info = []

    driver = get_webdriver()
    try:
        # reviews[0] is skipped (presumably the header row). The row objects
        # are shared with ``reviews``, so they can be updated in place.
        for index, review in enumerate(reviews[1:]):
            print(index)
            try:
                reviewer_id, reviewer_name = hotel_reviewer_profile_crawler(
                    driver, review[5], review[8], reviewers_id_set,
                    new_reviewers_info, previous_reviewers_info)
                print("%s %s" % (reviewer_id, reviewer_name))
                review[1] = reviewer_id
                review[2] = reviewer_name
            except Exception as e:
                print(e)
                # Swap in a replacement driver after a failed row.
                driver = get_new_webdriver(driver)
    finally:
        # Close the browser even when an error escapes the loop; a failure
        # to close is reported but never fatal.
        try:
            driver.close()
        except Exception as e:
            print(e)

    for info in new_reviewers_info:
        try:
            print(info)
            temp = [info.get(key, '') for key in profile_keys]
        except Exception as e:
            # Skip entries that are not usable mappings, but say so.
            print(e)
            continue

        for i, reviewer in enumerate(reviewers):
            # Rows shorter than two columns cannot match. Skipping them
            # prevents an IndexError that would drop the new reviewer.
            if len(reviewer) > 1 and reviewer[1] == temp[1]:
                # Keep the id that is already stored for this reviewer.
                if reviewer[0] != temp[0]:
                    temp[0] = reviewer[0]
                reviewers[i] = temp
                break
        else:
            # No existing row matched: this is a new reviewer.
            reviewers.append(temp)

    result = OrderedDict()
    result["hotel_info"] = lines
    result["review_info"] = reviews
    save_data(path, result)

    reviewer_result = OrderedDict()
    reviewer_result["reviewer_info"] = reviewers
    save_data(reviewer_path, reviewer_result)